aboutsummaryrefslogtreecommitdiff
path: root/icing
diff options
context:
space:
mode:
authorTim Barron <tjbarron@google.com>2020-06-05 13:55:31 -0700
committerTim Barron <tjbarron@google.com>2020-06-05 14:04:31 -0700
commita4a63ec8e7e70912ef04019e7dc9f3c3ecf2eabf (patch)
tree090955adb6f2abfc09f5275d6bab35a2c0d74198 /icing
parent79321d1f286ac650cc99fcf795a67c5dde8c0597 (diff)
downloadicing-a4a63ec8e7e70912ef04019e7dc9f3c3ecf2eabf.tar.gz
Copy over changes made to Google3 codebase in Icing.
Change-Id: Ia36edb0a1b085e249dabfc220a5b72418063604f
Diffstat (limited to 'icing')
-rw-r--r--icing/file/file-backed-proto-log.h23
-rw-r--r--icing/file/file-backed-vector.h23
-rw-r--r--icing/helpers/icu/icu-data-file-helper.cc (renamed from icing/icu-data-file-helper.cc)2
-rw-r--r--icing/helpers/icu/icu-data-file-helper.h (renamed from icing/icu-data-file-helper.h)6
-rw-r--r--icing/icing-search-engine.cc174
-rw-r--r--icing/icing-search-engine.h67
-rw-r--r--icing/icing-search-engine_fuzz_test.cc2
-rw-r--r--icing/icing-search-engine_test.cc115
-rw-r--r--icing/index/index-processor_benchmark.cc16
-rw-r--r--icing/index/index-processor_test.cc14
-rw-r--r--icing/index/index.h11
-rw-r--r--icing/index/index_test.cc38
-rw-r--r--icing/index/lite-index.cc23
-rw-r--r--icing/index/lite-index.h22
-rw-r--r--icing/jni/icing-search-engine-jni.cc (renamed from icing/icing-search-engine-jni.cc)20
-rw-r--r--icing/jni/jni-cache.cc216
-rw-r--r--icing/jni/jni-cache.h78
-rw-r--r--icing/jni/jni.lds10
-rw-r--r--icing/jni/reverse-jni-break-iterator.cc187
-rw-r--r--icing/jni/reverse-jni-break-iterator.h124
-rw-r--r--icing/legacy/index/icing-dynamic-trie.cc52
-rw-r--r--icing/legacy/index/icing-dynamic-trie.h7
-rw-r--r--icing/legacy/index/icing-flash-bitmap.h3
-rw-r--r--icing/proto/document.proto5
-rw-r--r--icing/proto/document_wrapper.proto2
-rw-r--r--icing/proto/initialize.proto2
-rw-r--r--icing/proto/optimize.proto22
-rw-r--r--icing/proto/persist.proto2
-rw-r--r--icing/proto/reset.proto2
-rw-r--r--icing/proto/schema.proto2
-rw-r--r--icing/proto/scoring.proto7
-rw-r--r--icing/proto/search.proto2
-rw-r--r--icing/proto/status.proto2
-rw-r--r--icing/proto/term.proto2
-rw-r--r--icing/query/query-processor_benchmark.cc16
-rw-r--r--icing/query/query-processor_test.cc13
-rw-r--r--icing/result/result-retriever_test.cc13
-rw-r--r--icing/result/snippet-retriever.cc28
-rw-r--r--icing/result/snippet-retriever_test.cc13
-rw-r--r--icing/schema/schema-util.cc65
-rw-r--r--icing/schema/schema-util.h24
-rw-r--r--icing/schema/schema-util_test.cc34
-rw-r--r--icing/store/document-store.cc53
-rw-r--r--icing/store/document-store.h26
-rw-r--r--icing/store/document-store_test.cc48
-rw-r--r--icing/store/key-mapper.h22
-rw-r--r--icing/testing/logging-event-listener.cc121
-rw-r--r--icing/testing/logging-event-listener.h62
-rw-r--r--icing/text_classifier/lib3/utils/java/jni-base.cc44
-rw-r--r--icing/text_classifier/lib3/utils/java/jni-base.h217
-rw-r--r--icing/text_classifier/lib3/utils/java/jni-helper.cc175
-rw-r--r--icing/text_classifier/lib3/utils/java/jni-helper.h156
-rw-r--r--icing/text_classifier/lib3/utils/java/string_utils.cc73
-rw-r--r--icing/text_classifier/lib3/utils/java/string_utils.h74
-rw-r--r--icing/tokenization/icu-language-segmenter_test.cc374
-rw-r--r--icing/tokenization/icu/icu-language-segmenter-factory.cc (renamed from icing/tokenization/language-segmenter-factory.cc)17
-rw-r--r--icing/tokenization/icu/icu-language-segmenter.cc (renamed from icing/tokenization/icu-language-segmenter.cc)102
-rw-r--r--icing/tokenization/icu/icu-language-segmenter.h (renamed from icing/tokenization/icu-language-segmenter.h)6
-rw-r--r--icing/tokenization/icu/icu-language-segmenter_test.cc1016
-rw-r--r--icing/tokenization/language-segmenter-factory.h17
-rw-r--r--icing/tokenization/language-segmenter-iterator_test.cc92
-rw-r--r--icing/tokenization/language-segmenter.h25
-rw-r--r--icing/tokenization/language-segmenter_benchmark.cc11
-rw-r--r--icing/tokenization/plain-tokenizer.cc18
-rw-r--r--icing/tokenization/plain-tokenizer_test.cc47
-rw-r--r--icing/tokenization/raw-query-tokenizer.cc6
-rw-r--r--icing/tokenization/raw-query-tokenizer_test.cc42
-rw-r--r--icing/tokenization/reverse_jni/reverse-jni-language-segmenter-factory.cc62
-rw-r--r--icing/tokenization/reverse_jni/reverse-jni-language-segmenter-test-jni.cc37
-rw-r--r--icing/tokenization/reverse_jni/reverse-jni-language-segmenter-test.cc1085
-rw-r--r--icing/tokenization/reverse_jni/reverse-jni-language-segmenter-test.h46
-rw-r--r--icing/tokenization/reverse_jni/reverse-jni-language-segmenter.cc452
-rw-r--r--icing/tokenization/reverse_jni/reverse-jni-language-segmenter.h51
-rw-r--r--icing/tokenization/simple/space-language-segmenter-factory.cc41
-rw-r--r--icing/tokenization/simple/space-language-segmenter.cc (renamed from icing/tokenization/space-language-segmenter.cc)19
-rw-r--r--icing/tokenization/simple/space-language-segmenter.h (renamed from icing/tokenization/space-language-segmenter.h)6
-rw-r--r--icing/tokenization/simple/space-language-segmenter_test.cc (renamed from icing/tokenization/space-language-segmenter_test.cc)35
-rw-r--r--icing/tokenization/tokenizer.h2
-rw-r--r--icing/transform/icu-normalizer_test.cc179
-rw-r--r--icing/transform/icu/icu-normalizer-factory.cc52
-rw-r--r--icing/transform/icu/icu-normalizer.cc (renamed from icing/transform/icu-normalizer.cc)72
-rw-r--r--icing/transform/icu/icu-normalizer.h (renamed from icing/transform/icu-normalizer.h)6
-rw-r--r--icing/transform/icu/icu-normalizer_benchmark.cc (renamed from icing/transform/icu-normalizer_benchmark.cc)17
-rw-r--r--icing/transform/icu/icu-normalizer_test.cc237
-rw-r--r--icing/transform/map/map-normalizer-factory.cc48
-rw-r--r--icing/transform/map/map-normalizer.cc86
-rw-r--r--icing/transform/map/map-normalizer.h50
-rw-r--r--icing/transform/map/map-normalizer_benchmark.cc149
-rw-r--r--icing/transform/map/map-normalizer_test.cc205
-rw-r--r--icing/transform/map/normalization-map.cc712
-rw-r--r--icing/transform/map/normalization-map.h (renamed from icing/util/icu-i18n-utils_test.cc)29
-rw-r--r--icing/transform/normalizer-factory.h22
-rw-r--r--icing/transform/normalizer.h3
-rw-r--r--icing/transform/simple/none-normalizer-factory.cc53
-rw-r--r--icing/transform/simple/none-normalizer.h (renamed from icing/transform/none-normalizer.h)8
-rw-r--r--icing/transform/simple/none-normalizer_test.cc (renamed from icing/transform/none-normalizer_test.cc)33
-rw-r--r--icing/util/document-validator.cc14
-rw-r--r--icing/util/i18n-utils.cc124
-rw-r--r--icing/util/i18n-utils.h66
-rw-r--r--icing/util/i18n-utils_test.cc141
-rw-r--r--icing/util/icu-i18n-utils.cc176
-rw-r--r--icing/util/icu-i18n-utils.h105
102 files changed, 7414 insertions, 1342 deletions
diff --git a/icing/file/file-backed-proto-log.h b/icing/file/file-backed-proto-log.h
index 0b36e18..62943b8 100644
--- a/icing/file/file-backed-proto-log.h
+++ b/icing/file/file-backed-proto-log.h
@@ -210,13 +210,23 @@ class FileBackedProtoLog {
// INTERNAL_ERROR on IO error
libtextclassifier3::StatusOr<ProtoT> ReadProto(int64_t file_offset) const;
- // Calculates and returns the disk usage in bytes.
+ // Calculates and returns the disk usage in bytes. Rounds up to the nearest
+ // block size.
//
// Returns:
// Disk usage on success
// INTERNAL_ERROR on IO error
libtextclassifier3::StatusOr<int64_t> GetDiskUsage() const;
+ // Returns the file size of all the elements held in the log. File size is in
+ // bytes. This excludes the size of any internal metadata of the log, e.g. the
+ // log's header.
+ //
+ // Returns:
+ // File size on success
+ // INTERNAL_ERROR on IO error
+ libtextclassifier3::StatusOr<int64_t> GetElementsFileSize() const;
+
// An iterator helping to find offsets of all the protos in file.
// Example usage:
//
@@ -736,6 +746,17 @@ libtextclassifier3::StatusOr<int64_t> FileBackedProtoLog<ProtoT>::GetDiskUsage()
}
template <typename ProtoT>
+libtextclassifier3::StatusOr<int64_t>
+FileBackedProtoLog<ProtoT>::GetElementsFileSize() const {
+ int64_t total_file_size = filesystem_->GetFileSize(file_path_.c_str());
+ if (total_file_size == Filesystem::kBadFileSize) {
+ return absl_ports::InternalError(
+ "Failed to get file size of elments in the proto log");
+ }
+ return total_file_size - sizeof(Header);
+}
+
+template <typename ProtoT>
FileBackedProtoLog<ProtoT>::Iterator::Iterator(const Filesystem& filesystem,
const std::string& file_path,
int64_t initial_offset)
diff --git a/icing/file/file-backed-vector.h b/icing/file/file-backed-vector.h
index f13b67b..27d03b2 100644
--- a/icing/file/file-backed-vector.h
+++ b/icing/file/file-backed-vector.h
@@ -194,13 +194,23 @@ class FileBackedVector {
// INTERNAL on I/O error
libtextclassifier3::Status PersistToDisk();
- // Calculates and returns the disk usage in bytes.
+ // Calculates and returns the disk usage in bytes. Rounds up to the nearest
+ // block size.
//
// Returns:
// Disk usage on success
// INTERNAL_ERROR on IO error
libtextclassifier3::StatusOr<int64_t> GetDiskUsage() const;
+ // Returns the file size of the all the elements held in the vector. File size
+ // is in bytes. This excludes the size of any internal metadata of the vector,
+ // e.g. the vector's header.
+ //
+ // Returns:
+ // File size on success
+ // INTERNAL_ERROR on IO error
+ libtextclassifier3::StatusOr<int64_t> GetElementsFileSize() const;
+
// Accessors.
const T* array() const {
return reinterpret_cast<const T*>(mmapped_file_->region());
@@ -705,6 +715,17 @@ libtextclassifier3::StatusOr<int64_t> FileBackedVector<T>::GetDiskUsage()
return size;
}
+template <typename T>
+libtextclassifier3::StatusOr<int64_t> FileBackedVector<T>::GetElementsFileSize()
+ const {
+ int64_t total_file_size = filesystem_->GetFileSize(file_path_.c_str());
+ if (total_file_size == Filesystem::kBadFileSize) {
+ return absl_ports::InternalError(
+ "Failed to get file size of elements in the file-backed vector");
+ }
+ return total_file_size - sizeof(Header);
+}
+
} // namespace lib
} // namespace icing
diff --git a/icing/icu-data-file-helper.cc b/icing/helpers/icu/icu-data-file-helper.cc
index 9741dbb..5cf6a1d 100644
--- a/icing/icu-data-file-helper.cc
+++ b/icing/helpers/icu/icu-data-file-helper.cc
@@ -12,7 +12,7 @@
// See the License for the specific language governing permissions and
// limitations under the License.
-#include "icing/icu-data-file-helper.h"
+#include "icing/helpers/icu/icu-data-file-helper.h"
#include <sys/mman.h>
diff --git a/icing/icu-data-file-helper.h b/icing/helpers/icu/icu-data-file-helper.h
index e92491d..90f5bc7 100644
--- a/icing/icu-data-file-helper.h
+++ b/icing/helpers/icu/icu-data-file-helper.h
@@ -12,8 +12,8 @@
// See the License for the specific language governing permissions and
// limitations under the License.
-#ifndef ICING_ICU_DATA_FILE_HELPER
-#define ICING_ICU_DATA_FILE_HELPER
+#ifndef ICING_HELPERS_ICU_ICU_DATA_FILE_HELPER
+#define ICING_HELPERS_ICU_ICU_DATA_FILE_HELPER
#include "icing/text_classifier/lib3/utils/base/status.h"
@@ -40,4 +40,4 @@ libtextclassifier3::Status SetUpICUDataFile(
} // namespace lib
} // namespace icing
-#endif // ICING_ICU_DATA_FILE_HELPER
+#endif // ICING_HELPERS_ICU_ICU_DATA_FILE_HELPER
diff --git a/icing/icing-search-engine.cc b/icing/icing-search-engine.cc
index 6dc535c..01a2922 100644
--- a/icing/icing-search-engine.cc
+++ b/icing/icing-search-engine.cc
@@ -194,19 +194,22 @@ void TransformStatus(const libtextclassifier3::Status& internal_status,
} // namespace
-IcingSearchEngine::IcingSearchEngine(const IcingSearchEngineOptions& options)
+IcingSearchEngine::IcingSearchEngine(const IcingSearchEngineOptions& options,
+ std::unique_ptr<const JniCache> jni_cache)
: IcingSearchEngine(options, std::make_unique<Filesystem>(),
- std::make_unique<Clock>()) {}
+ std::make_unique<Clock>(), std::move(jni_cache)) {}
IcingSearchEngine::IcingSearchEngine(
IcingSearchEngineOptions options,
- std::unique_ptr<const Filesystem> filesystem, std::unique_ptr<Clock> clock)
+ std::unique_ptr<const Filesystem> filesystem, std::unique_ptr<Clock> clock,
+ std::unique_ptr<const JniCache> jni_cache)
: options_(std::move(options)),
filesystem_(std::move(filesystem)),
icing_filesystem_(std::make_unique<IcingFilesystem>()),
clock_(std::move(clock)),
result_state_manager_(performance_configuration_.max_num_hits_per_query,
- performance_configuration_.max_num_cache_results) {
+ performance_configuration_.max_num_cache_results),
+ jni_cache_(std::move(jni_cache)) {
ICING_VLOG(1) << "Creating IcingSearchEngine in dir: " << options_.base_dir();
}
@@ -220,23 +223,25 @@ IcingSearchEngine::~IcingSearchEngine() {
}
InitializeResultProto IcingSearchEngine::Initialize() {
+ // This method does both read and write so we need a writer lock. Using two
+ // locks (reader and writer) has the chance to be interrupted during
+ // switching.
+ absl_ports::unique_lock l(&mutex_);
+ return InternalInitialize();
+}
+
+InitializeResultProto IcingSearchEngine::InternalInitialize() {
ICING_VLOG(1) << "Initializing IcingSearchEngine in dir: "
<< options_.base_dir();
InitializeResultProto result_proto;
StatusProto* result_status = result_proto.mutable_status();
-
if (initialized_) {
// Already initialized.
result_status->set_code(StatusProto::OK);
return result_proto;
}
- // This method does both read and write so we need a writer lock. Using two
- // locks (reader and writer) has the chance to be interrupted during
- // switching.
- absl_ports::unique_lock l(&mutex_);
-
// Releases result / query cache if any
result_state_manager_.InvalidateAllResultStates();
@@ -269,14 +274,14 @@ libtextclassifier3::Status IcingSearchEngine::InitializeMembers() {
ICING_RETURN_IF_ERROR(InitializeSchemaStore());
ICING_RETURN_IF_ERROR(InitializeDocumentStore());
- TC3_ASSIGN_OR_RETURN(
- language_segmenter_,
- language_segmenter_factory::Create(language_segmenter_factory::ICU4C));
+ // TODO(b/156383798) : Resolve how to specify the locale.
+ language_segmenter_factory::SegmenterOptions segmenter_options(
+ ULOC_US, jni_cache_.get());
+ TC3_ASSIGN_OR_RETURN(language_segmenter_, language_segmenter_factory::Create(
+ std::move(segmenter_options)));
- TC3_ASSIGN_OR_RETURN(
- normalizer_,
- normalizer_factory::Create(normalizer_factory::NormalizerType::ICU4C,
- options_.max_token_length()));
+ TC3_ASSIGN_OR_RETURN(normalizer_,
+ normalizer_factory::Create(options_.max_token_length()));
ICING_RETURN_IF_ERROR(InitializeIndex());
@@ -416,14 +421,19 @@ SetSchemaResultProto IcingSearchEngine::SetSchema(
SetSchemaResultProto result_proto;
StatusProto* result_status = result_proto.mutable_status();
+ absl_ports::unique_lock l(&mutex_);
+ if (!initialized_) {
+ result_status->set_code(StatusProto::FAILED_PRECONDITION);
+ result_status->set_message("IcingSearchEngine has not been initialized!");
+ return result_proto;
+ }
+
libtextclassifier3::Status status = SchemaUtil::Validate(new_schema);
if (!status.ok()) {
TransformStatus(status, result_status);
return result_proto;
}
- absl_ports::unique_lock l(&mutex_);
-
auto lost_previous_schema_or = LostPreviousSchema();
if (!lost_previous_schema_or.ok()) {
TransformStatus(lost_previous_schema_or.status(), result_status);
@@ -498,6 +508,11 @@ GetSchemaResultProto IcingSearchEngine::GetSchema() {
StatusProto* result_status = result_proto.mutable_status();
absl_ports::shared_lock l(&mutex_);
+ if (!initialized_) {
+ result_status->set_code(StatusProto::FAILED_PRECONDITION);
+ result_status->set_message("IcingSearchEngine has not been initialized!");
+ return result_proto;
+ }
auto schema_or = schema_store_->GetSchema();
if (!schema_or.ok()) {
@@ -516,6 +531,11 @@ GetSchemaTypeResultProto IcingSearchEngine::GetSchemaType(
StatusProto* result_status = result_proto.mutable_status();
absl_ports::shared_lock l(&mutex_);
+ if (!initialized_) {
+ result_status->set_code(StatusProto::FAILED_PRECONDITION);
+ result_status->set_message("IcingSearchEngine has not been initialized!");
+ return result_proto;
+ }
auto type_config_or = schema_store_->GetSchemaTypeConfig(schema_type);
if (!type_config_or.ok()) {
@@ -542,6 +562,11 @@ PutResultProto IcingSearchEngine::Put(DocumentProto&& document) {
// the schema file to validate, and the schema could be changed in
// SetSchema() which is protected by the same mutex.
absl_ports::unique_lock l(&mutex_);
+ if (!initialized_) {
+ result_status->set_code(StatusProto::FAILED_PRECONDITION);
+ result_status->set_message("IcingSearchEngine has not been initialized!");
+ return result_proto;
+ }
auto document_id_or = document_store_->Put(document);
if (!document_id_or.ok()) {
@@ -576,6 +601,11 @@ GetResultProto IcingSearchEngine::Get(const std::string_view name_space,
StatusProto* result_status = result_proto.mutable_status();
absl_ports::shared_lock l(&mutex_);
+ if (!initialized_) {
+ result_status->set_code(StatusProto::FAILED_PRECONDITION);
+ result_status->set_message("IcingSearchEngine has not been initialized!");
+ return result_proto;
+ }
auto document_or = document_store_->Get(name_space, uri);
if (!document_or.ok()) {
@@ -596,6 +626,11 @@ DeleteResultProto IcingSearchEngine::Delete(const std::string_view name_space,
StatusProto* result_status = result_proto.mutable_status();
absl_ports::unique_lock l(&mutex_);
+ if (!initialized_) {
+ result_status->set_code(StatusProto::FAILED_PRECONDITION);
+ result_status->set_message("IcingSearchEngine has not been initialized!");
+ return result_proto;
+ }
// TODO(b/144458732): Implement a more robust version of TC_RETURN_IF_ERROR
// that can support error logging.
@@ -616,14 +651,20 @@ DeleteByNamespaceResultProto IcingSearchEngine::DeleteByNamespace(
const std::string_view name_space) {
ICING_VLOG(1) << "Deleting namespace from doc store";
+ DeleteByNamespaceResultProto delete_result;
+ StatusProto* result_status = delete_result.mutable_status();
absl_ports::unique_lock l(&mutex_);
+ if (!initialized_) {
+ result_status->set_code(StatusProto::FAILED_PRECONDITION);
+ result_status->set_message("IcingSearchEngine has not been initialized!");
+ return delete_result;
+ }
// TODO(b/144458732): Implement a more robust version of TC_RETURN_IF_ERROR
// that can support error logging.
libtextclassifier3::Status status =
document_store_->DeleteByNamespace(name_space);
- DeleteByNamespaceResultProto delete_result;
- TransformStatus(status, delete_result.mutable_status());
+ TransformStatus(status, result_status);
if (!status.ok()) {
ICING_LOG(ERROR) << status.error_message()
<< "Failed to delete Namespace: " << name_space;
@@ -636,14 +677,20 @@ DeleteBySchemaTypeResultProto IcingSearchEngine::DeleteBySchemaType(
const std::string_view schema_type) {
ICING_VLOG(1) << "Deleting type from doc store";
+ DeleteBySchemaTypeResultProto delete_result;
+ StatusProto* result_status = delete_result.mutable_status();
absl_ports::unique_lock l(&mutex_);
+ if (!initialized_) {
+ result_status->set_code(StatusProto::FAILED_PRECONDITION);
+ result_status->set_message("IcingSearchEngine has not been initialized!");
+ return delete_result;
+ }
// TODO(b/144458732): Implement a more robust version of TC_RETURN_IF_ERROR
// that can support error logging.
libtextclassifier3::Status status =
document_store_->DeleteBySchemaType(schema_type);
- DeleteBySchemaTypeResultProto delete_result;
- TransformStatus(status, delete_result.mutable_status());
+ TransformStatus(status, result_status);
if (!status.ok()) {
ICING_LOG(ERROR) << status.error_message()
<< "Failed to delete SchemaType: " << schema_type;
@@ -659,6 +706,11 @@ PersistToDiskResultProto IcingSearchEngine::PersistToDisk() {
StatusProto* result_status = result_proto.mutable_status();
absl_ports::unique_lock l(&mutex_);
+ if (!initialized_) {
+ result_status->set_code(StatusProto::FAILED_PRECONDITION);
+ result_status->set_message("IcingSearchEngine has not been initialized!");
+ return result_proto;
+ }
auto status = InternalPersistToDisk();
TransformStatus(status, result_status);
@@ -678,6 +730,11 @@ OptimizeResultProto IcingSearchEngine::Optimize() {
StatusProto* result_status = result_proto.mutable_status();
absl_ports::unique_lock l(&mutex_);
+ if (!initialized_) {
+ result_status->set_code(StatusProto::FAILED_PRECONDITION);
+ result_status->set_message("IcingSearchEngine has not been initialized!");
+ return result_proto;
+ }
// Releases result / query cache if any
result_state_manager_.InvalidateAllResultStates();
@@ -729,6 +786,54 @@ OptimizeResultProto IcingSearchEngine::Optimize() {
return result_proto;
}
+GetOptimizeInfoResultProto IcingSearchEngine::GetOptimizeInfo() {
+ ICING_VLOG(1) << "Getting optimize info from IcingSearchEngine";
+
+ GetOptimizeInfoResultProto result_proto;
+ StatusProto* result_status = result_proto.mutable_status();
+
+ absl_ports::shared_lock l(&mutex_);
+ if (!initialized_) {
+ result_status->set_code(StatusProto::FAILED_PRECONDITION);
+ result_status->set_message("IcingSearchEngine has not been initialized!");
+ return result_proto;
+ }
+
+ // Get stats from DocumentStore
+ auto doc_store_optimize_info_or = document_store_->GetOptimizeInfo();
+ if (!doc_store_optimize_info_or.ok()) {
+ TransformStatus(doc_store_optimize_info_or.status(), result_status);
+ return result_proto;
+ }
+ DocumentStore::OptimizeInfo doc_store_optimize_info =
+ doc_store_optimize_info_or.ValueOrDie();
+ result_proto.set_optimizable_docs(doc_store_optimize_info.optimizable_docs);
+
+ if (doc_store_optimize_info.optimizable_docs == 0) {
+ // Can return early since there's nothing to calculate on the index side
+ result_proto.set_estimated_optimizable_bytes(0);
+ result_status->set_code(StatusProto::OK);
+ return result_proto;
+ }
+
+ // Get stats from Index.
+ auto index_elements_size_or = index_->GetElementsSize();
+ if (!index_elements_size_or.ok()) {
+ TransformStatus(index_elements_size_or.status(), result_status);
+ return result_proto;
+ }
+ int64_t index_elements_size = index_elements_size_or.ValueOrDie();
+
+ // Sum up the optimizable sizes from DocumentStore and Index
+ result_proto.set_estimated_optimizable_bytes(
+ index_elements_size * doc_store_optimize_info.optimizable_docs /
+ doc_store_optimize_info.total_docs +
+ doc_store_optimize_info.estimated_optimizable_bytes);
+
+ result_status->set_code(StatusProto::OK);
+ return result_proto;
+}
+
libtextclassifier3::Status IcingSearchEngine::InternalPersistToDisk() {
ICING_RETURN_IF_ERROR(schema_store_->PersistToDisk());
ICING_RETURN_IF_ERROR(document_store_->PersistToDisk());
@@ -808,6 +913,13 @@ SearchResultProto IcingSearchEngine::Search(
const ResultSpecProto& result_spec) {
SearchResultProto result_proto;
StatusProto* result_status = result_proto.mutable_status();
+ // TODO(b/146008613) Explore ideas to make this function read-only.
+ absl_ports::unique_lock l(&mutex_);
+ if (!initialized_) {
+ result_status->set_code(StatusProto::FAILED_PRECONDITION);
+ result_status->set_message("IcingSearchEngine has not been initialized!");
+ return result_proto;
+ }
libtextclassifier3::Status status = ValidateResultSpec(result_spec);
if (!status.ok()) {
@@ -820,9 +932,6 @@ SearchResultProto IcingSearchEngine::Search(
return result_proto;
}
- // TODO(b/146008613) Explore ideas to make this function read-only.
- absl_ports::unique_lock l(&mutex_);
-
// Gets unordered results from query processor
auto query_processor_or = QueryProcessor::Create(
index_.get(), language_segmenter_.get(), normalizer_.get(),
@@ -917,6 +1026,11 @@ SearchResultProto IcingSearchEngine::GetNextPage(uint64_t next_page_token) {
// ResultStateManager has its own writer lock, so here we only need a reader
// lock for other components.
absl_ports::shared_lock l(&mutex_);
+ if (!initialized_) {
+ result_status->set_code(StatusProto::FAILED_PRECONDITION);
+ result_status->set_message("IcingSearchEngine has not been initialized!");
+ return result_proto;
+ }
libtextclassifier3::StatusOr<PageResultState> page_result_state_or =
result_state_manager_.GetNextPage(next_page_token);
@@ -969,6 +1083,11 @@ SearchResultProto IcingSearchEngine::GetNextPage(uint64_t next_page_token) {
}
void IcingSearchEngine::InvalidateNextPageToken(uint64_t next_page_token) {
+ absl_ports::shared_lock l(&mutex_);
+ if (!initialized_) {
+ ICING_LOG(ERROR) << "IcingSearchEngine has not been initialized!";
+ return;
+ }
result_state_manager_.InvalidateResultState(next_page_token);
}
@@ -1138,8 +1257,9 @@ ResetResultProto IcingSearchEngine::Reset() {
return result_proto;
}
+ absl_ports::unique_lock l(&mutex_);
initialized_ = false;
- if (Initialize().status().code() != StatusProto::OK) {
+ if (InternalInitialize().status().code() != StatusProto::OK) {
// We shouldn't hit the following Initialize errors:
// NOT_FOUND: all data was cleared, we aren't expecting anything
// DATA_LOSS: all data was cleared, we aren't expecting anything
diff --git a/icing/icing-search-engine.h b/icing/icing-search-engine.h
index 196f243..746b5b4 100644
--- a/icing/icing-search-engine.h
+++ b/icing/icing-search-engine.h
@@ -20,6 +20,7 @@
#include <string>
#include <string_view>
+#include "icing/jni/jni-cache.h"
#include "icing/text_classifier/lib3/utils/base/status.h"
#include "icing/text_classifier/lib3/utils/base/statusor.h"
#include "icing/absl_ports/mutex.h"
@@ -60,7 +61,12 @@ class IcingSearchEngine {
uint32_t checksum;
};
- explicit IcingSearchEngine(const IcingSearchEngineOptions& options);
+ // Note: It is only required to provide a pointer to a valid instance of
+ // JniCache if this instance needs to perform reverse-jni calls. Users on
+ // Linux and iOS should always provide a nullptr.
+ explicit IcingSearchEngine(
+ const IcingSearchEngineOptions& options,
+ std::unique_ptr<const JniCache> jni_cache = nullptr);
// Calculates integrity checks and persists files to disk.
~IcingSearchEngine();
@@ -117,14 +123,17 @@ class IcingSearchEngine {
// So, callers should only have to call this if the schema changed.
// However, calling it multiple times with the same schema is a no-op.
//
- // On any error, Icing will keep using the older schema.
+ // On some errors, Icing will keep using the older schema, but on
+ // INTERNAL_ERROR, it is undefined to continue using Icing.
//
// Returns:
// OK on success
// INVALID_ARGUMENT if 'new_schema' is invalid
- // FAILED_PRECONDITION if 'new_schema' is incompatible
+ // FAILED_PRECONDITION if 'new_schema' is incompatible, or IcingSearchEngine
+ // has not been initialized yet.
// INTERNAL_ERROR if Icing failed to store the new schema or upgrade
- // existing data based on the new schema.
+ // existing data based on the new schema. Using Icing beyond this error is
+ // undefined and may cause crashes.
//
// TODO(cassiewang) Figure out, document (and maybe even enforce) the best
// way ordering of calls between Initialize() and SetSchema(), both when
@@ -149,6 +158,7 @@ class IcingSearchEngine {
// Returns:
// SchemaProto on success
// NOT_FOUND if a schema has not been set yet
+ // FAILED_PRECONDITION IcingSearchEngine has not been initialized yet.
// INTERNAL_ERROR on IO error
GetSchemaResultProto GetSchema() ICING_LOCKS_EXCLUDED(mutex_);
@@ -156,7 +166,8 @@ class IcingSearchEngine {
//
// Returns:
// SchemaTypeConfigProto on success
- // FAILED_PRECONDITION if a schema has not been set yet
+ // FAILED_PRECONDITION if a schema has not been set yet, IcingSearchEngine
+ // has not been initialized yet.
// NOT_FOUND if there is no SchemaTypeConfig of schema_type in the
// SchemaProto
// INTERNAL_ERROR on IO error
@@ -169,7 +180,8 @@ class IcingSearchEngine {
//
// Returns:
// OK on success
- // FAILED_PRECONDITION if a schema has not been set yet
+ // FAILED_PRECONDITION if a schema has not been set yet, IcingSearchEngine
+ // has not been initialized yet.
// NOT_FOUND if there is no SchemaTypeConfig in the SchemaProto that matches
// the document's schema
// INTERNAL_ERROR on IO error
@@ -189,6 +201,7 @@ class IcingSearchEngine {
// Returns:
// The document found on success
// NOT_FOUND if the key doesn't exist or doc has been deleted
+ // FAILED_PRECONDITION IcingSearchEngine has not been initialized yet
// INTERNAL_ERROR on IO error
GetResultProto Get(std::string_view name_space, std::string_view uri);
@@ -202,6 +215,7 @@ class IcingSearchEngine {
// Returns:
// OK on success
// NOT_FOUND if no document exists with namespace, uri
+ // FAILED_PRECONDITION IcingSearchEngine has not been initialized yet
// INTERNAL_ERROR on IO error
DeleteResultProto Delete(std::string_view name_space, std::string_view uri)
ICING_LOCKS_EXCLUDED(mutex_);
@@ -216,6 +230,7 @@ class IcingSearchEngine {
// Returns:
// OK on success
// NOT_FOUND if namespace doesn't exist
+ // FAILED_PRECONDITION IcingSearchEngine has not been initialized yet
// INTERNAL_ERROR on IO error
DeleteByNamespaceResultProto DeleteByNamespace(std::string_view name_space)
ICING_LOCKS_EXCLUDED(mutex_);
@@ -230,6 +245,7 @@ class IcingSearchEngine {
// Returns:
// OK on success
// NOT_FOUND if schema type doesn't exist
+ // FAILED_PRECONDITION IcingSearchEngine has not been initialized yet
// INTERNAL_ERROR on IO error
DeleteBySchemaTypeResultProto DeleteBySchemaType(std::string_view schema_type)
ICING_LOCKS_EXCLUDED(mutex_);
@@ -246,6 +262,7 @@ class IcingSearchEngine {
// OK with results on success
// INVALID_ARGUMENT if any of specs is invalid
// ABORTED if failed to perform search but existing data is not affected
+ // FAILED_PRECONDITION IcingSearchEngine has not been initialized yet
// INTERNAL_ERROR on any other errors
SearchResultProto Search(const SearchSpecProto& search_spec,
const ScoringSpecProto& scoring_spec,
@@ -258,6 +275,7 @@ class IcingSearchEngine {
// Returns a SearchResultProto with status:
// OK with results on success
// ABORTED if failed to get results but existing data is not affected
+ // FAILED_PRECONDITION IcingSearchEngine has not been initialized yet
// INTERNAL_ERROR on any other errors
SearchResultProto GetNextPage(uint64_t next_page_token)
ICING_LOCKS_EXCLUDED(mutex_);
@@ -276,6 +294,7 @@ class IcingSearchEngine {
//
// Returns:
// OK on success
+ // FAILED_PRECONDITION IcingSearchEngine has not been initialized yet
// INTERNAL on I/O error
PersistToDiskResultProto PersistToDisk() ICING_LOCKS_EXCLUDED(mutex_);
@@ -284,25 +303,35 @@ class IcingSearchEngine {
// resource-efficient. This method purely optimizes the internal files and
// has no functional impact on what gets accepted/returned.
//
- // NOTE: This method should be called about once every 24 hours when the
- // device is idle and charging. It can also be called when the system needs
- // to free up extra disk-space.
- //
// WARNING: This method is CPU and IO intensive and depending on the
// contents stored, it can take from a few seconds to a few minutes.
// This call also blocks all read/write operations on Icing.
//
+ // SUGGESTION: Assuming the client has no restrictions on their side, it's
+ // recommended to call this method about once every 24 hours when the
+ // device is idle and charging. It can also be called when the system needs
+ // to free up extra disk-space.
+ //
// Returns:
// OK on success
// ABORTED_ERROR if optimization is aborted due to non-fatal errors before
// actual modifications are made.
// DATA_LOSS_ERROR on errors that could potentially cause data loss,
// IcingSearchEngine is still functioning.
- // INTERNAL_ERROR on any IO errors or other unrecoverable errors. Icing
- // could be in an inconsistent state and might not be usable.
+ // INTERNAL_ERROR on any IO errors or other unrecoverable errors. Continued
+ // use of Icing is undefined.
// Clients could clear and reinitialize IcingSearchEngine.
+ // FAILED_PRECONDITION IcingSearchEngine has not been initialized yet
OptimizeResultProto Optimize() ICING_LOCKS_EXCLUDED(mutex_);
+ // Returns potential size and document savings if Optimize were called.
+ //
+ // Returns:
+ // OK on success
+ // FAILED_PRECONDITION if IcingSearchEngine has not been initialized yet
+ // INTERNAL_ERROR on IO error
+ GetOptimizeInfoResultProto GetOptimizeInfo() ICING_LOCKS_EXCLUDED(mutex_);
+
// Clears all data from Icing and re-initializes. Clients DO NOT need to call
// Initialize again.
//
@@ -319,13 +348,14 @@ class IcingSearchEngine {
protected:
IcingSearchEngine(IcingSearchEngineOptions options,
std::unique_ptr<const Filesystem> filesystem,
- std::unique_ptr<Clock> clock);
+ std::unique_ptr<Clock> clock,
+ std::unique_ptr<const JniCache> jni_cache = nullptr);
private:
const IcingSearchEngineOptions options_;
const std::unique_ptr<const Filesystem> filesystem_;
const std::unique_ptr<const IcingFilesystem> icing_filesystem_;
- bool initialized_ = false;
+ bool initialized_ ICING_GUARDED_BY(mutex_) = false;
// Abstraction for accessing time values.
std::unique_ptr<Clock> clock_;
@@ -355,6 +385,9 @@ class IcingSearchEngine {
// Storage for all hits of content from the document store.
std::unique_ptr<Index> index_ ICING_GUARDED_BY(mutex_);
+ // Pointer to JNI class references
+ const std::unique_ptr<const JniCache> jni_cache_;
+
// Helper method to do the actual work to persist data to disk. We need this
// separate method so that other public methods don't need to call
// PersistToDisk(). Public methods calling each other may cause deadlock
@@ -362,6 +395,12 @@ class IcingSearchEngine {
libtextclassifier3::Status InternalPersistToDisk()
ICING_EXCLUSIVE_LOCKS_REQUIRED(mutex_);
+ // Helper method to do the actual work to Initialize. We need this separate
+ // method so that other public methods don't need to call Initialize(). Public
+ // methods calling each other may cause deadlock issues.
+ InitializeResultProto InternalInitialize()
+ ICING_EXCLUSIVE_LOCKS_REQUIRED(mutex_);
+
// Helper method to initialize member variables.
//
// Returns:
diff --git a/icing/icing-search-engine_fuzz_test.cc b/icing/icing-search-engine_fuzz_test.cc
index 632fd01..d31f836 100644
--- a/icing/icing-search-engine_fuzz_test.cc
+++ b/icing/icing-search-engine_fuzz_test.cc
@@ -18,8 +18,8 @@
#include "icing/text_classifier/lib3/utils/base/status.h"
#include "icing/text_classifier/lib3/utils/base/statusor.h"
#include "icing/document-builder.h"
+#include "icing/helpers/icu/icu-data-file-helper.h"
#include "icing/icing-search-engine.h"
-#include "icing/icu-data-file-helper.h"
#include "icing/proto/document.pb.h"
#include "icing/proto/initialize.pb.h"
#include "icing/proto/scoring.pb.h"
diff --git a/icing/icing-search-engine_test.cc b/icing/icing-search-engine_test.cc
index 17795a3..baa469e 100644
--- a/icing/icing-search-engine_test.cc
+++ b/icing/icing-search-engine_test.cc
@@ -26,7 +26,7 @@
#include "icing/document-builder.h"
#include "icing/file/filesystem.h"
#include "icing/file/mock-filesystem.h"
-#include "icing/icu-data-file-helper.h"
+#include "icing/helpers/icu/icu-data-file-helper.h"
#include "icing/portable/equals-proto.h"
#include "icing/proto/document.pb.h"
#include "icing/proto/initialize.pb.h"
@@ -1367,6 +1367,72 @@ TEST_F(IcingSearchEngineTest, OptimizationShouldDeleteTemporaryDirectory) {
EXPECT_FALSE(filesystem()->FileExists(tmp_file.c_str()));
}
+TEST_F(IcingSearchEngineTest, GetOptimizeInfoHasCorrectStats) {
+ DocumentProto document1 = CreateMessageDocument("namespace", "uri1");
+ DocumentProto document2 = DocumentBuilder()
+ .SetKey("namespace", "uri2")
+ .SetSchema("Message")
+ .AddStringProperty("body", "message body")
+ .SetCreationTimestampMs(100)
+ .SetTtlMs(500)
+ .Build();
+
+ auto fake_clock = std::make_unique<FakeClock>();
+ fake_clock->SetSystemTimeMilliseconds(1000);
+
+ TestIcingSearchEngine icing(GetDefaultIcingOptions(),
+ std::make_unique<Filesystem>(),
+ std::move(fake_clock));
+ ASSERT_THAT(icing.Initialize().status().code(), Eq(StatusProto::OK));
+
+ // Just initialized, nothing is optimizable yet.
+ GetOptimizeInfoResultProto optimize_info = icing.GetOptimizeInfo();
+ EXPECT_THAT(optimize_info.status().code(), Eq(StatusProto::OK));
+ EXPECT_THAT(optimize_info.optimizable_docs(), Eq(0));
+ EXPECT_THAT(optimize_info.estimated_optimizable_bytes(), Eq(0));
+
+ ASSERT_THAT(icing.SetSchema(CreateMessageSchema()).status().code(),
+ Eq(StatusProto::OK));
+ ASSERT_THAT(icing.Put(document1).status().code(), Eq(StatusProto::OK));
+
+ // Only have active documents, nothing is optimizable yet.
+ optimize_info = icing.GetOptimizeInfo();
+ EXPECT_THAT(optimize_info.status().code(), Eq(StatusProto::OK));
+ EXPECT_THAT(optimize_info.optimizable_docs(), Eq(0));
+ EXPECT_THAT(optimize_info.estimated_optimizable_bytes(), Eq(0));
+
+ // Deletes document1
+ ASSERT_THAT(icing.Delete("namespace", "uri1").status().code(),
+ Eq(StatusProto::OK));
+
+ optimize_info = icing.GetOptimizeInfo();
+ EXPECT_THAT(optimize_info.status().code(), Eq(StatusProto::OK));
+ EXPECT_THAT(optimize_info.optimizable_docs(), Eq(1));
+ EXPECT_THAT(optimize_info.estimated_optimizable_bytes(), Gt(0));
+ int64_t first_estimated_optimizable_bytes =
+ optimize_info.estimated_optimizable_bytes();
+
+ // Add a second document, but it'll be expired since the time (1000) is
+ // greater than the document's creation timestamp (100) + the document's ttl
+ // (500)
+ ASSERT_THAT(icing.Put(document2).status().code(), Eq(StatusProto::OK));
+
+ optimize_info = icing.GetOptimizeInfo();
+ EXPECT_THAT(optimize_info.status().code(), Eq(StatusProto::OK));
+ EXPECT_THAT(optimize_info.optimizable_docs(), Eq(2));
+ EXPECT_THAT(optimize_info.estimated_optimizable_bytes(),
+ Gt(first_estimated_optimizable_bytes));
+
+ // Optimize
+ ASSERT_THAT(icing.Optimize().status().code(), Eq(StatusProto::OK));
+
+ // Nothing is optimizable now that everything has been optimized away.
+ optimize_info = icing.GetOptimizeInfo();
+ EXPECT_THAT(optimize_info.status().code(), Eq(StatusProto::OK));
+ EXPECT_THAT(optimize_info.optimizable_docs(), Eq(0));
+ EXPECT_THAT(optimize_info.estimated_optimizable_bytes(), Eq(0));
+}
+
TEST_F(IcingSearchEngineTest, GetAndPutShouldWorkAfterOptimization) {
DocumentProto document1 = CreateMessageDocument("namespace", "uri1");
DocumentProto document2 = CreateMessageDocument("namespace", "uri2");
@@ -1861,7 +1927,7 @@ TEST_F(IcingSearchEngineTest, SearchIncludesDocumentsBeforeTtl) {
document;
// Time just has to be less than the document's creation timestamp (100) + the
- // schema's ttl (500)
+ // document's ttl (500)
auto fake_clock = std::make_unique<FakeClock>();
fake_clock->SetSystemTimeMilliseconds(400);
@@ -1908,7 +1974,7 @@ TEST_F(IcingSearchEngineTest, SearchDoesntIncludeDocumentsPastTtl) {
expected_search_result_proto.mutable_status()->set_code(StatusProto::OK);
// Time just has to be greater than the document's creation timestamp (100) +
- // the schema's ttl (500)
+ // the document's ttl (500)
auto fake_clock = std::make_unique<FakeClock>();
fake_clock->SetSystemTimeMilliseconds(700);
@@ -3150,6 +3216,49 @@ TEST_F(IcingSearchEngineTest, SnippetSectionRestrict) {
IsEmpty());
}
+TEST_F(IcingSearchEngineTest, UninitializedInstanceFailsSafely) {
+ IcingSearchEngine icing(GetDefaultIcingOptions());
+
+ SchemaProto email_schema = CreateMessageSchema();
+ EXPECT_THAT(icing.SetSchema(email_schema).status().code(),
+ Eq(StatusProto::FAILED_PRECONDITION));
+ EXPECT_THAT(icing.GetSchema().status().code(),
+ Eq(StatusProto::FAILED_PRECONDITION));
+ EXPECT_THAT(
+ icing.GetSchemaType(email_schema.types(0).schema_type()).status().code(),
+ Eq(StatusProto::FAILED_PRECONDITION));
+
+ DocumentProto doc = CreateMessageDocument("namespace", "uri");
+ EXPECT_THAT(icing.Put(doc).status().code(),
+ Eq(StatusProto::FAILED_PRECONDITION));
+ EXPECT_THAT(icing.Get(doc.namespace_(), doc.uri()).status().code(),
+ Eq(StatusProto::FAILED_PRECONDITION));
+ EXPECT_THAT(icing.Delete(doc.namespace_(), doc.uri()).status().code(),
+ Eq(StatusProto::FAILED_PRECONDITION));
+ EXPECT_THAT(icing.DeleteByNamespace(doc.namespace_()).status().code(),
+ Eq(StatusProto::FAILED_PRECONDITION));
+ EXPECT_THAT(icing.DeleteBySchemaType(email_schema.types(0).schema_type())
+ .status()
+ .code(),
+ Eq(StatusProto::FAILED_PRECONDITION));
+
+ SearchSpecProto search_spec = SearchSpecProto::default_instance();
+ ScoringSpecProto scoring_spec = ScoringSpecProto::default_instance();
+ ResultSpecProto result_spec = ResultSpecProto::default_instance();
+ EXPECT_THAT(
+ icing.Search(search_spec, scoring_spec, result_spec).status().code(),
+ Eq(StatusProto::FAILED_PRECONDITION));
+ constexpr int kSomePageToken = 12;
+ EXPECT_THAT(icing.GetNextPage(kSomePageToken).status().code(),
+ Eq(StatusProto::FAILED_PRECONDITION));
+ icing.InvalidateNextPageToken(kSomePageToken); // Verify this doesn't crash.
+
+ EXPECT_THAT(icing.PersistToDisk().status().code(),
+ Eq(StatusProto::FAILED_PRECONDITION));
+ EXPECT_THAT(icing.Optimize().status().code(),
+ Eq(StatusProto::FAILED_PRECONDITION));
+}
+
} // namespace
} // namespace lib
} // namespace icing
diff --git a/icing/index/index-processor_benchmark.cc b/icing/index/index-processor_benchmark.cc
index 835478d..00d116f 100644
--- a/icing/index/index-processor_benchmark.cc
+++ b/icing/index/index-processor_benchmark.cc
@@ -16,7 +16,7 @@
#include "gmock/gmock.h"
#include "icing/document-builder.h"
#include "icing/file/filesystem.h"
-#include "icing/icu-data-file-helper.h"
+#include "icing/helpers/icu/icu-data-file-helper.h"
#include "icing/index/index-processor.h"
#include "icing/index/index.h"
#include "icing/legacy/core/icing-string-util.h"
@@ -140,7 +140,7 @@ std::unique_ptr<Index> CreateIndex(const IcingFilesystem& filesystem,
std::unique_ptr<Normalizer> CreateNormalizer() {
return normalizer_factory::Create(
- normalizer_factory::NormalizerType::ICU4C,
+
/*max_term_byte_size=*/std::numeric_limits<int>::max())
.ValueOrDie();
}
@@ -193,8 +193,7 @@ void BM_IndexDocumentWithOneProperty(benchmark::State& state) {
std::unique_ptr<Index> index = CreateIndex(filesystem, index_dir);
std::unique_ptr<LanguageSegmenter> language_segmenter =
- language_segmenter_factory::Create(language_segmenter_factory::ICU4C)
- .ValueOrDie();
+ language_segmenter_factory::Create().ValueOrDie();
std::unique_ptr<Normalizer> normalizer = CreateNormalizer();
std::unique_ptr<SchemaStore> schema_store = CreateSchemaStore();
std::unique_ptr<IndexProcessor> index_processor =
@@ -241,8 +240,7 @@ void BM_IndexDocumentWithTenProperties(benchmark::State& state) {
std::unique_ptr<Index> index = CreateIndex(filesystem, index_dir);
std::unique_ptr<LanguageSegmenter> language_segmenter =
- language_segmenter_factory::Create(language_segmenter_factory::ICU4C)
- .ValueOrDie();
+ language_segmenter_factory::Create().ValueOrDie();
std::unique_ptr<Normalizer> normalizer = CreateNormalizer();
std::unique_ptr<SchemaStore> schema_store = CreateSchemaStore();
std::unique_ptr<IndexProcessor> index_processor =
@@ -290,8 +288,7 @@ void BM_IndexDocumentWithDiacriticLetters(benchmark::State& state) {
std::unique_ptr<Index> index = CreateIndex(filesystem, index_dir);
std::unique_ptr<LanguageSegmenter> language_segmenter =
- language_segmenter_factory::Create(language_segmenter_factory::ICU4C)
- .ValueOrDie();
+ language_segmenter_factory::Create().ValueOrDie();
std::unique_ptr<Normalizer> normalizer = CreateNormalizer();
std::unique_ptr<SchemaStore> schema_store = CreateSchemaStore();
std::unique_ptr<IndexProcessor> index_processor =
@@ -339,8 +336,7 @@ void BM_IndexDocumentWithHiragana(benchmark::State& state) {
std::unique_ptr<Index> index = CreateIndex(filesystem, index_dir);
std::unique_ptr<LanguageSegmenter> language_segmenter =
- language_segmenter_factory::Create(language_segmenter_factory::ICU4C)
- .ValueOrDie();
+ language_segmenter_factory::Create().ValueOrDie();
std::unique_ptr<Normalizer> normalizer = CreateNormalizer();
std::unique_ptr<SchemaStore> schema_store = CreateSchemaStore();
std::unique_ptr<IndexProcessor> index_processor =
diff --git a/icing/index/index-processor_test.cc b/icing/index/index-processor_test.cc
index 126ea29..8dfb9c2 100644
--- a/icing/index/index-processor_test.cc
+++ b/icing/index/index-processor_test.cc
@@ -27,7 +27,7 @@
#include "icing/absl_ports/str_cat.h"
#include "icing/document-builder.h"
#include "icing/file/filesystem.h"
-#include "icing/icu-data-file-helper.h"
+#include "icing/helpers/icu/icu-data-file-helper.h"
#include "icing/index/hit/doc-hit-info.h"
#include "icing/index/index.h"
#include "icing/index/iterator/doc-hit-info-iterator.h"
@@ -91,14 +91,13 @@ class IndexProcessorTest : public Test {
ICING_ASSERT_OK_AND_ASSIGN(index_,
Index::Create(options, &icing_filesystem_));
- ICING_ASSERT_OK_AND_ASSIGN(
- lang_segmenter_,
- language_segmenter_factory::Create(language_segmenter_factory::ICU4C));
+ ICING_ASSERT_OK_AND_ASSIGN(lang_segmenter_,
+ language_segmenter_factory::Create());
ICING_ASSERT_OK_AND_ASSIGN(
normalizer_,
normalizer_factory::Create(
- normalizer_factory::NormalizerType::ICU4C,
+
/*max_term_byte_size=*/std::numeric_limits<int32_t>::max()));
ICING_ASSERT_OK_AND_ASSIGN(
@@ -415,9 +414,8 @@ TEST_F(IndexProcessorTest, TooLongTokens) {
IndexProcessor::Options options;
options.max_tokens_per_document = 1000;
- ICING_ASSERT_OK_AND_ASSIGN(
- std::unique_ptr<Normalizer> normalizer,
- normalizer_factory::Create(normalizer_factory::NormalizerType::ICU4C,
+ ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<Normalizer> normalizer,
+ normalizer_factory::Create(
/*max_term_byte_size=*/4));
ICING_ASSERT_OK_AND_ASSIGN(
diff --git a/icing/index/index.h b/icing/index/index.h
index f287081..f30c8ad 100644
--- a/icing/index/index.h
+++ b/icing/index/index.h
@@ -113,6 +113,17 @@ class Index {
lite_index_->GetDebugInfo(verbosity, out);
}
+ // Returns the byte size of all the elements held in the index. This
+ // excludes the size of any internal metadata of the index, e.g. the index's
+ // header.
+ //
+ // Returns:
+ // Byte size on success
+ // INTERNAL_ERROR on IO error
+ libtextclassifier3::StatusOr<int64_t> GetElementsSize() const {
+ return lite_index_->GetElementsSize();
+ }
+
// Create an iterator to iterate through all doc hit infos in the index that
// match the term. section_id_mask can be set to ignore hits from sections not
// listed in the mask. Eg. section_id_mask = 1U << 3; would only return hits
diff --git a/icing/index/index_test.cc b/icing/index/index_test.cc
index ff29135..070e82a 100644
--- a/icing/index/index_test.cc
+++ b/icing/index/index_test.cc
@@ -45,6 +45,7 @@ namespace {
using ::testing::ElementsAre;
using ::testing::Eq;
+using ::testing::Gt;
using ::testing::IsEmpty;
using ::testing::IsTrue;
using ::testing::NiceMock;
@@ -621,12 +622,13 @@ TEST_F(IndexTest, FindTermByPrefixShouldReturnTermsInOneNamespace) {
EXPECT_THAT(index_->FindTermsByPrefix(/*prefix=*/"f", /*namespace_ids=*/{0},
/*num_to_return=*/10),
IsOkAndHolds(UnorderedElementsAre(EqualsTermMetadata("fo", 1),
- EqualsTermMetadata("foo", 1))));
+ EqualsTermMetadata("foo", 1))));
// namespace with id 1 has 1 result.
- EXPECT_THAT(index_->FindTermsByPrefix(/*prefix=*/"f", /*namespace_ids=*/{1},
- /*num_to_return=*/10),
- IsOkAndHolds(UnorderedElementsAre(EqualsTermMetadata("fool", 1))));
+ EXPECT_THAT(
+ index_->FindTermsByPrefix(/*prefix=*/"f", /*namespace_ids=*/{1},
+ /*num_to_return=*/10),
+ IsOkAndHolds(UnorderedElementsAre(EqualsTermMetadata("fool", 1))));
}
TEST_F(IndexTest, FindTermByPrefixShouldReturnTermsInMultipleNamespaces) {
@@ -650,7 +652,7 @@ TEST_F(IndexTest, FindTermByPrefixShouldReturnTermsInMultipleNamespaces) {
index_->FindTermsByPrefix(/*prefix=*/"f", /*namespace_ids=*/{1, 2},
/*num_to_return=*/10),
IsOkAndHolds(UnorderedElementsAre(EqualsTermMetadata("foo", 1),
- EqualsTermMetadata("fool", 1))));
+ EqualsTermMetadata("fool", 1))));
}
TEST_F(IndexTest, FindTermByPrefixShouldReturnTermsInAllNamespaces) {
@@ -672,9 +674,9 @@ TEST_F(IndexTest, FindTermByPrefixShouldReturnTermsInAllNamespaces) {
// Should return "fo", "foo" and "fool" across all namespaces.
EXPECT_THAT(index_->FindTermsByPrefix(/*prefix=*/"f", /*namespace_ids=*/{},
/*num_to_return=*/10),
- IsOkAndHolds(UnorderedElementsAre(EqualsTermMetadata("fo", 1),
- EqualsTermMetadata("foo", 1),
- EqualsTermMetadata("fool", 1))));
+ IsOkAndHolds(UnorderedElementsAre(
+ EqualsTermMetadata("fo", 1), EqualsTermMetadata("foo", 1),
+ EqualsTermMetadata("fool", 1))));
}
TEST_F(IndexTest, FindTermByPrefixShouldReturnCorrectHitCount) {
@@ -690,10 +692,22 @@ TEST_F(IndexTest, FindTermByPrefixShouldReturnCorrectHitCount) {
EXPECT_THAT(edit2.AddHit("fool"), IsOk());
// 'foo' has 1 hit, 'fool' has 2 hits.
- EXPECT_THAT(index_->FindTermsByPrefix(/*prefix=*/"f", /*namespace_ids=*/{0},
- /*num_to_return=*/10),
- IsOkAndHolds(UnorderedElementsAre(EqualsTermMetadata("foo", 1),
- EqualsTermMetadata("fool", 2))));
+ EXPECT_THAT(
+ index_->FindTermsByPrefix(/*prefix=*/"f", /*namespace_ids=*/{0},
+ /*num_to_return=*/10),
+ IsOkAndHolds(UnorderedElementsAre(EqualsTermMetadata("foo", 1),
+ EqualsTermMetadata("fool", 2))));
+}
+
+TEST_F(IndexTest, GetElementsSize) {
+ // Check empty index.
+ EXPECT_THAT(index_->GetElementsSize(), IsOkAndHolds(Eq(0)));
+
+ // Add an element.
+ Index::Editor edit = index_->Edit(
+ kDocumentId0, kSectionId2, TermMatchType::EXACT_ONLY, /*namespace_id=*/0);
+ EXPECT_THAT(edit.AddHit("foo"), IsOk());
+ EXPECT_THAT(index_->GetElementsSize(), IsOkAndHolds(Gt(0)));
}
} // namespace
diff --git a/icing/index/lite-index.cc b/icing/index/lite-index.cc
index c9f68b5..489c53d 100644
--- a/icing/index/lite-index.cc
+++ b/icing/index/lite-index.cc
@@ -391,6 +391,29 @@ void LiteIndex::GetDebugInfo(int verbosity, std::string* out) const {
lexicon_.GetDebugInfo(verbosity, out);
}
+libtextclassifier3::StatusOr<int64_t> LiteIndex::GetElementsSize() const {
+ int64_t header_and_hit_buffer_file_size =
+ filesystem_->GetFileSize(hit_buffer_fd_.get());
+
+ if (header_and_hit_buffer_file_size == Filesystem::kBadFileSize) {
+ return absl_ports::InternalError(
+ "Failed to get element size of the LiteIndex's header and hit buffer");
+ }
+
+ int64_t lexicon_disk_usage = lexicon_.GetElementsSize();
+ if (lexicon_disk_usage == IcingFilesystem::kBadFileSize) {
+ return absl_ports::InternalError(
+ "Failed to get element size of LiteIndex's lexicon");
+ }
+
+ // On initialization, we grow the file to a padded size first. So this size
+ // won't count towards the size taken up by elements
+ size_t header_padded_size = IcingMMapper::page_aligned_size(header_size());
+
+ return header_and_hit_buffer_file_size - header_padded_size +
+ lexicon_disk_usage;
+}
+
uint32_t LiteIndex::Seek(uint32_t term_id) {
// Make searchable by sorting by hit buffer.
uint32_t sort_len = header_->cur_size() - header_->searchable_end();
diff --git a/icing/index/lite-index.h b/icing/index/lite-index.h
index 6d01f42..b60a947 100644
--- a/icing/index/lite-index.h
+++ b/icing/index/lite-index.h
@@ -205,6 +205,14 @@ class LiteIndex {
// verbosity > 0, more detailed debug information from the lexicon.
void GetDebugInfo(int verbosity, std::string* out) const;
+ // Returns the byte size of all the elements held in the index. This excludes
+ // the size of any internal metadata of the index, e.g. the index's header.
+ //
+ // Returns:
+ // Byte size on success
+ // INTERNAL_ERROR on IO error
+ libtextclassifier3::StatusOr<int64_t> GetElementsSize() const;
+
private:
static IcingDynamicTrie::RuntimeOptions MakeTrieRuntimeOptions();
@@ -228,15 +236,29 @@ class LiteIndex {
// hit buffer if term_id is not present.
uint32_t Seek(uint32_t term_id);
+ // File descriptor that points to where the header and hit buffer are written
+ // to.
ScopedFd hit_buffer_fd_;
+ // Mmapped region past the header that stores the hits.
IcingArrayStorage hit_buffer_;
+
+ // Crc checksum of the hits, excludes the header.
uint32_t hit_buffer_crc_;
+
+ // Trie that maps indexed terms to their term id
IcingDynamicTrie lexicon_;
+
// TODO(b/140437260): Port over to MemoryMappedFile
+ // Memory mapped region of the underlying file that reflects the header.
IcingMMapper header_mmap_;
+
+ // Wrapper around the mmapped header that contains stats on the lite index.
std::unique_ptr<IcingLiteIndex_Header> header_;
+
+ // Options used to initialize the LiteIndex.
const Options options_;
+
// TODO(b/139087650) Move to icing::Filesystem
const IcingFilesystem* const filesystem_;
};
diff --git a/icing/icing-search-engine-jni.cc b/icing/jni/icing-search-engine-jni.cc
index 109f717..b1b5420 100644
--- a/icing/icing-search-engine-jni.cc
+++ b/icing/jni/icing-search-engine-jni.cc
@@ -16,6 +16,7 @@
#include <string>
+#include "icing/jni/jni-cache.h"
#include <google/protobuf/message_lite.h>
#include "icing/absl_ports/status_imports.h"
#include "icing/icing-search-engine.h"
@@ -26,6 +27,7 @@
#include "icing/proto/schema.pb.h"
#include "icing/proto/scoring.pb.h"
#include "icing/proto/search.pb.h"
+#include "icing/util/status-macros.h"
namespace {
bool ParseProtoFromJniByteArray(JNIEnv* env, jbyteArray bytes,
@@ -85,8 +87,12 @@ Java_com_google_android_icing_IcingSearchEngine_nativeCreate(
return 0;
}
+ std::unique_ptr<const icing::lib::JniCache> jni_cache;
+#ifdef ICING_REVERSE_JNI_SEGMENTATION
+ ICING_ASSIGN_OR_RETURN(jni_cache, icing::lib::JniCache::Create(env), 0);
+#endif // ICING_REVERSE_JNI_SEGMENTATION
icing::lib::IcingSearchEngine* icing =
- new icing::lib::IcingSearchEngine(options);
+ new icing::lib::IcingSearchEngine(options, std::move(jni_cache));
return reinterpret_cast<jlong>(icing);
}
@@ -282,6 +288,18 @@ Java_com_google_android_icing_IcingSearchEngine_nativeOptimize(
}
JNIEXPORT jbyteArray JNICALL
+Java_com_google_android_icing_IcingSearchEngine_nativeGetOptimizeInfo(
+ JNIEnv* env, jclass clazz, jlong native_pointer) {
+ icing::lib::IcingSearchEngine* icing =
+ GetIcingSearchEnginePointer(native_pointer);
+
+ icing::lib::GetOptimizeInfoResultProto get_optimize_info_result_proto =
+ icing->GetOptimizeInfo();
+
+ return SerializeProtoToJniByteArray(env, get_optimize_info_result_proto);
+}
+
+JNIEXPORT jbyteArray JNICALL
Java_com_google_android_icing_IcingSearchEngine_nativeReset(
JNIEnv* env, jclass clazz, jlong native_pointer) {
icing::lib::IcingSearchEngine* icing =
diff --git a/icing/jni/jni-cache.cc b/icing/jni/jni-cache.cc
new file mode 100644
index 0000000..a186222
--- /dev/null
+++ b/icing/jni/jni-cache.cc
@@ -0,0 +1,216 @@
+// Copyright (C) 2019 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "icing/jni/jni-cache.h"
+
+#include "icing/text_classifier/lib3/utils/java/jni-base.h"
+#include "icing/text_classifier/lib3/utils/java/jni-helper.h"
+#include "icing/absl_ports/canonical_errors.h"
+#include "icing/util/logging.h"
+#include "icing/util/status-macros.h"
+
+namespace icing {
+namespace lib {
+
+JniCache::JniCache(JavaVM* jvm)
+ : jvm(jvm),
+ string_class(nullptr, jvm),
+ string_utf8(nullptr, jvm),
+ locale_class(nullptr, jvm),
+ locale_us(nullptr, jvm),
+ breakiterator_class(nullptr, jvm) {}
+
+// The macros below are intended to reduce the boilerplate in Create and avoid
+// easily introduced copy/paste errors.
+#define ICING_GET_CLASS_OR_RETURN_NULL(FIELD, NAME) \
+ { \
+ ICING_ASSIGN_OR_RETURN( \
+ libtextclassifier3::ScopedLocalRef<jclass> clazz, \
+ libtextclassifier3::JniHelper::FindClass(env, NAME), nullptr); \
+ result->FIELD##_class = \
+ libtextclassifier3::MakeGlobalRef(clazz.get(), env, jvm); \
+ if (result->FIELD##_class == nullptr) { \
+ ICING_LOG(ERROR) << "Error finding class: " << NAME; \
+ return nullptr; \
+ } \
+ }
+
+#define ICING_GET_OPTIONAL_CLASS(FIELD, NAME) \
+ { \
+ libtextclassifier3::StatusOr<libtextclassifier3::ScopedLocalRef<jclass>> \
+ status_or_class = libtextclassifier3::JniHelper::FindClass(env, NAME); \
+ if (status_or_class.ok()) { \
+ result->FIELD##_class = libtextclassifier3::MakeGlobalRef( \
+ std::move(status_or_class).ValueOrDie().get(), env, jvm); \
+ } \
+ }
+
+#define ICING_GET_METHOD(CLASS, FIELD, NAME, SIGNATURE) \
+ result->CLASS##_##FIELD = \
+ env->GetMethodID(result->CLASS##_class.get(), NAME, SIGNATURE); \
+ if (!result->CLASS##_##FIELD) { \
+ ICING_LOG(WARNING) << __FILE__ << ":" << __LINE__ \
+ << "Error finding method: " << NAME; \
+ return absl_ports::AbortedError("Unable to get Java method."); \
+ }
+
+#define ICING_GET_OPTIONAL_STATIC_METHOD(CLASS, FIELD, NAME, SIGNATURE) \
+ if (result->CLASS##_class != nullptr) { \
+ result->CLASS##_##FIELD = \
+ env->GetStaticMethodID(result->CLASS##_class.get(), NAME, SIGNATURE); \
+ env->ExceptionClear(); \
+ }
+
+#define ICING_GET_STATIC_METHOD(CLASS, FIELD, NAME, SIGNATURE) \
+ result->CLASS##_##FIELD = \
+ env->GetStaticMethodID(result->CLASS##_class.get(), NAME, SIGNATURE); \
+ if (!result->CLASS##_##FIELD) { \
+ ICING_LOG(WARNING) << __FILE__ << ":" << __LINE__ \
+ << "Error finding method: " << NAME; \
+ return absl_ports::AbortedError("Unable to get Java static method."); \
+ }
+
+#define ICING_GET_STATIC_OBJECT_FIELD_OR_RETURN_NULL(CLASS, FIELD, NAME, \
+ SIGNATURE) \
+ { \
+ const jfieldID CLASS##_##FIELD##_field = \
+ env->GetStaticFieldID(result->CLASS##_class.get(), NAME, SIGNATURE); \
+ if (!CLASS##_##FIELD##_field) { \
+ ICING_LOG(WARNING) << __FILE__ << ":" << __LINE__ \
+ << "Error finding field id: " << NAME; \
+ return absl_ports::AbortedError("Unable to get Java field id."); \
+ } \
+ ICING_ASSIGN_OR_RETURN( \
+ libtextclassifier3::ScopedLocalRef<jobject> static_object, \
+ libtextclassifier3::JniHelper::GetStaticObjectField( \
+ env, result->CLASS##_class.get(), CLASS##_##FIELD##_field), \
+ nullptr); \
+ result->CLASS##_##FIELD = \
+ libtextclassifier3::MakeGlobalRef(static_object.get(), env, jvm); \
+ if (result->CLASS##_##FIELD == nullptr) { \
+ ICING_LOG(ERROR) << "Error finding field: " << NAME; \
+ return nullptr; \
+ } \
+ }
+
+#define ICING_GET_STATIC_INT_FIELD(CLASS, FIELD, NAME) \
+ const jfieldID CLASS##_##FIELD##_field = \
+ env->GetStaticFieldID(result->CLASS##_class.get(), NAME, "I"); \
+ if (!CLASS##_##FIELD##_field) { \
+ ICING_LOG(WARNING) << __FILE__ << ":" << __LINE__ \
+ << "Error finding field id: " << NAME; \
+ return absl_ports::AbortedError( \
+ "Unable to get Java static int field id."); \
+ } \
+ result->CLASS##_##FIELD = env->GetStaticIntField( \
+ result->CLASS##_class.get(), CLASS##_##FIELD##_field); \
+ if (!result->CLASS##_##FIELD) { \
+ ICING_LOG(WARNING) << __FILE__ << ":" << __LINE__ \
+ << "Error finding field: " << NAME; \
+ return absl_ports::AbortedError("Unable to get Java static int field."); \
+ }
+
+libtextclassifier3::StatusOr<std::unique_ptr<JniCache>> JniCache::Create(
+ JNIEnv* env) {
+ if (env == nullptr) {
+ return nullptr;
+ }
+ JavaVM* jvm = nullptr;
+ if (JNI_OK != env->GetJavaVM(&jvm) || jvm == nullptr) {
+ return nullptr;
+ }
+ std::unique_ptr<JniCache> result(new JniCache(jvm));
+
+ // String
+ ICING_GET_CLASS_OR_RETURN_NULL(string, "java/lang/String");
+ ICING_GET_METHOD(string, constructor, "<init>", "([BLjava/lang/String;)V");
+ ICING_GET_METHOD(string, code_point_count, "codePointCount", "(II)I");
+ ICING_GET_METHOD(string, length, "length", "()I");
+ ICING_ASSIGN_OR_RETURN(
+ libtextclassifier3::ScopedLocalRef<jstring> result_string,
+ libtextclassifier3::JniHelper::NewStringUTF(env, "UTF-8"), nullptr);
+ result->string_utf8 =
+ libtextclassifier3::MakeGlobalRef(result_string.get(), env, jvm);
+ if (result->string_utf8 == nullptr) {
+ return nullptr;
+ }
+
+ // Locale
+ ICING_GET_CLASS_OR_RETURN_NULL(locale, "java/util/Locale");
+ ICING_GET_STATIC_OBJECT_FIELD_OR_RETURN_NULL(locale, us, "US",
+ "Ljava/util/Locale;");
+ ICING_GET_METHOD(locale, constructor, "<init>", "(Ljava/lang/String;)V");
+ ICING_GET_OPTIONAL_STATIC_METHOD(locale, for_language_tag, "forLanguageTag",
+ "(Ljava/lang/String;)Ljava/util/Locale;");
+
+ // BreakIteratorBatcher
+ ICING_GET_CLASS_OR_RETURN_NULL(
+ breakiterator,
+ "com/google/android/libraries/mdi/search/BreakIteratorBatcher");
+ ICING_GET_METHOD(breakiterator, constructor, "<init>",
+ "(Ljava/util/Locale;)V");
+ ICING_GET_METHOD(breakiterator, settext, "setText", "(Ljava/lang/String;)V");
+ ICING_GET_METHOD(breakiterator, next, "next", "(I)[I");
+ ICING_GET_METHOD(breakiterator, first, "first", "()I");
+ ICING_GET_METHOD(breakiterator, following, "following", "(I)I");
+ ICING_GET_METHOD(breakiterator, preceding, "preceding", "(I)I");
+
+ return result;
+}
+
+#undef ICING_GET_STATIC_INT_FIELD
+#undef ICING_GET_STATIC_OBJECT_FIELD_OR_RETURN_NULL
+#undef ICING_GET_STATIC_METHOD
+#undef ICING_GET_METHOD
+#undef ICING_GET_CLASS_OR_RETURN_NULL
+#undef ICING_GET_OPTIONAL_CLASS
+
+JNIEnv* JniCache::GetEnv() const {
+ void* env;
+ if (JNI_OK == jvm->GetEnv(&env, JNI_VERSION_1_4)) {
+ return reinterpret_cast<JNIEnv*>(env);
+ } else {
+ ICING_LOG(ERROR) << "Icing JniCache used on unattached thread";
+ return nullptr;
+ }
+}
+
+bool JniCache::ExceptionCheckAndClear() const {
+ return libtextclassifier3::JniExceptionCheckAndClear(GetEnv());
+}
+
+libtextclassifier3::StatusOr<libtextclassifier3::ScopedLocalRef<jstring>>
+JniCache::ConvertToJavaString(const char* utf8_text,
+ const int utf8_text_size_bytes) const {
+ // Create java byte array.
+ JNIEnv* jenv = GetEnv();
+ ICING_ASSIGN_OR_RETURN(
+ libtextclassifier3::ScopedLocalRef<jbyteArray> text_java_utf8,
+ libtextclassifier3::JniHelper::NewByteArray(jenv, utf8_text_size_bytes));
+
+ jenv->SetByteArrayRegion(text_java_utf8.get(), 0, utf8_text_size_bytes,
+ reinterpret_cast<const jbyte*>(utf8_text));
+
+ // Create the string with a UTF-8 charset.
+ ICING_ASSIGN_OR_RETURN(libtextclassifier3::ScopedLocalRef<jstring> result,
+ libtextclassifier3::JniHelper::NewObject<jstring>(
+ jenv, string_class.get(), string_constructor,
+ text_java_utf8.get(), string_utf8.get()));
+
+ return result;
+}
+
+} // namespace lib
+} // namespace icing
diff --git a/icing/jni/jni-cache.h b/icing/jni/jni-cache.h
new file mode 100644
index 0000000..a5f16c7
--- /dev/null
+++ b/icing/jni/jni-cache.h
@@ -0,0 +1,78 @@
+// Copyright (C) 2019 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef ICING_JNI_JNI_CACHE_H_
+#define ICING_JNI_JNI_CACHE_H_
+
+#include <jni.h>
+
+#include "icing/text_classifier/lib3/utils/base/statusor.h"
+#include "icing/text_classifier/lib3/utils/java/jni-base.h"
+
+namespace icing {
+namespace lib {
+
+// A helper class to cache class and method pointers for calls from JNI to Java.
+// (for implementations such as Java ICU that need to make calls from C++ to
+// Java)
+struct JniCache {
+ static libtextclassifier3::StatusOr<std::unique_ptr<JniCache>> Create(
+ JNIEnv* env);
+
+ // Returns the correct JNIEnv of the current thread. This allows multiple
+ // threads, each accessing the same instance of JniCache, to retrieve their
+ // unique JNIEnv pointers.
+ JNIEnv* GetEnv() const;
+
+ // Returns true if there are any pending exceptions from the execution of JNI
+ // calls. Also clears the exception if any existed.
+ bool ExceptionCheckAndClear() const;
+
+ JavaVM* jvm = nullptr;
+
+ // java.lang.String
+ libtextclassifier3::ScopedGlobalRef<jclass> string_class;
+ jmethodID string_constructor = nullptr;
+ jmethodID string_code_point_count = nullptr;
+ jmethodID string_length = nullptr;
+ libtextclassifier3::ScopedGlobalRef<jstring> string_utf8;
+
+ // java.util.Locale
+ libtextclassifier3::ScopedGlobalRef<jclass> locale_class;
+ libtextclassifier3::ScopedGlobalRef<jobject> locale_us;
+ jmethodID locale_constructor = nullptr;
+ jmethodID locale_for_language_tag = nullptr;
+
+ // BreakIteratorBatcher
+ libtextclassifier3::ScopedGlobalRef<jclass> breakiterator_class;
+ jmethodID breakiterator_constructor = nullptr;
+ jmethodID breakiterator_settext = nullptr;
+ jmethodID breakiterator_next = nullptr;
+ jmethodID breakiterator_first = nullptr;
+ jmethodID breakiterator_following = nullptr;
+ jmethodID breakiterator_preceding = nullptr;
+
+ // Helper to convert lib3 UnicodeText to Java strings.
+ libtextclassifier3::StatusOr<libtextclassifier3::ScopedLocalRef<jstring>>
+ ConvertToJavaString(const char* utf8_text,
+ const int utf8_text_size_bytes) const;
+
+ private:
+ explicit JniCache(JavaVM* jvm);
+};
+
+} // namespace lib
+} // namespace icing
+
+#endif // ICING_JNI_JNI_CACHE_H_
diff --git a/icing/jni/jni.lds b/icing/jni/jni.lds
new file mode 100644
index 0000000..401682a
--- /dev/null
+++ b/icing/jni/jni.lds
@@ -0,0 +1,10 @@
+VERS_1.0 {
+ # Export JNI symbols.
+ global:
+ Java_*;
+ JNI_OnLoad;
+
+ # Hide everything else
+ local:
+ *;
+};
diff --git a/icing/jni/reverse-jni-break-iterator.cc b/icing/jni/reverse-jni-break-iterator.cc
new file mode 100644
index 0000000..2a589c6
--- /dev/null
+++ b/icing/jni/reverse-jni-break-iterator.cc
@@ -0,0 +1,187 @@
+// Copyright (C) 2019 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "icing/jni/reverse-jni-break-iterator.h"
+
+#include <math.h>
+
+#include <cassert>
+#include <cctype>
+#include <map>
+
+#include "icing/jni/jni-cache.h"
+#include "icing/text_classifier/lib3/utils/base/statusor.h"
+#include "icing/text_classifier/lib3/utils/java/jni-base.h"
+#include "icing/text_classifier/lib3/utils/java/jni-helper.h"
+#include "icing/absl_ports/canonical_errors.h"
+#include "icing/util/status-macros.h"
+#include <jni.h>
+
+namespace icing {
+namespace lib {
+
+namespace {
+// Chosen based on results in go/reverse-jni-benchmarks
+static constexpr int kBatchSize = 100;
+} // namespace
+
+// -----------------------------------------------------------------------------
+// Implementations that call out to JVM. Behold the beauty.
+// -----------------------------------------------------------------------------
+libtextclassifier3::StatusOr<std::unique_ptr<ReverseJniBreakIterator>>
+ReverseJniBreakIterator::Create(const JniCache* jni_cache,
+ std::string_view text,
+ std::string_view locale) {
+ if (jni_cache == nullptr) {
+ return absl_ports::InvalidArgumentError(
+ "Create must be called with a valid JniCache pointer!");
+ }
+
+ ICING_ASSIGN_OR_RETURN(
+ libtextclassifier3::ScopedLocalRef<jstring> java_text,
+ jni_cache->ConvertToJavaString(text.data(), text.length()));
+ if (java_text.get() == nullptr) {
+ return absl_ports::AbortedError("Failed to create Java String from input.");
+ }
+
+ ICING_ASSIGN_OR_RETURN(
+ libtextclassifier3::ScopedLocalRef<jstring> java_locale_string,
+ jni_cache->ConvertToJavaString(locale.data(), locale.length()));
+ if (java_locale_string.get() == nullptr) {
+ return absl_ports::AbortedError(
+ "Failed to create Java String from locale.");
+ }
+
+ JNIEnv* jenv = jni_cache->GetEnv();
+ ICING_ASSIGN_OR_RETURN(
+ libtextclassifier3::ScopedLocalRef<jobject> java_locale,
+ libtextclassifier3::JniHelper::NewObject(
+ jenv, jni_cache->locale_class.get(), jni_cache->locale_constructor,
+ java_locale_string.get()));
+ if (java_locale.get() == nullptr) {
+ return absl_ports::AbortedError(
+ "Failed to create Java Locale from locale.");
+ }
+
+ ICING_ASSIGN_OR_RETURN(
+ libtextclassifier3::ScopedLocalRef<jobject> local_iterator_batcher,
+ libtextclassifier3::JniHelper::NewObject(
+ jenv, jni_cache->breakiterator_class.get(),
+ jni_cache->breakiterator_constructor, java_locale.get()));
+ libtextclassifier3::ScopedGlobalRef<jobject> iterator_batcher =
+ libtextclassifier3::MakeGlobalRef(local_iterator_batcher.get(), jenv,
+ jni_cache->jvm);
+ if (iterator_batcher.get() == nullptr) {
+ return absl_ports::AbortedError(
+ "Failed to create Java BreakIteratorBatcher.");
+ }
+
+ ICING_RETURN_IF_ERROR(libtextclassifier3::JniHelper::CallVoidMethod(
+ jenv, iterator_batcher.get(), jni_cache->breakiterator_settext,
+ java_text.get()));
+ return std::unique_ptr<ReverseJniBreakIterator>(
+ new ReverseJniBreakIterator(jni_cache, std::move(iterator_batcher)));
+}
+
+ReverseJniBreakIterator::ReverseJniBreakIterator(
+ const JniCache* jni_cache,
+ libtextclassifier3::ScopedGlobalRef<jobject> iterator_batcher)
+ : jni_cache_(jni_cache),
+ iterator_batcher_(std::move(iterator_batcher)),
+ is_done_(false),
+ is_almost_done_(false) {}
+
+int ReverseJniBreakIterator::Next() {
+ if (is_done_) {
+ return ReverseJniBreakIterator::kDone;
+ }
+ if (break_indices_cache_.empty()) {
+ if (FetchNextBatch() == ReverseJniBreakIterator::kDone) {
+ // Either there were no more results or an error occurred. Either way,
+ // mark ourselves as done and return.
+ is_done_ = true;
+ return ReverseJniBreakIterator::kDone;
+ }
+ is_almost_done_ = break_indices_cache_.size() < kBatchSize;
+ }
+ int break_index = break_indices_cache_.front();
+ break_indices_cache_.pop();
+ is_done_ = is_almost_done_ && break_indices_cache_.empty();
+ return break_index;
+}
+
+int ReverseJniBreakIterator::First() {
+ const int first_index = jni_cache_->GetEnv()->CallIntMethod(
+ iterator_batcher_.get(), jni_cache_->breakiterator_first);
+ if (jni_cache_->ExceptionCheckAndClear()) {
+ return ReverseJniBreakIterator::kDone;
+ }
+ ClearCache();
+ return first_index;
+}
+
+int ReverseJniBreakIterator::Preceding(int offset) {
+ const int preceding_index = jni_cache_->GetEnv()->CallIntMethod(
+ iterator_batcher_.get(), jni_cache_->breakiterator_preceding, offset);
+ if (jni_cache_->ExceptionCheckAndClear()) {
+ return ReverseJniBreakIterator::kDone;
+ }
+ ClearCache();
+ return preceding_index;
+}
+
+int ReverseJniBreakIterator::Following(int offset) {
+ const int following_index = jni_cache_->GetEnv()->CallIntMethod(
+ iterator_batcher_.get(), jni_cache_->breakiterator_following, offset);
+ if (jni_cache_->ExceptionCheckAndClear()) {
+ return ReverseJniBreakIterator::kDone;
+ }
+ ClearCache();
+ return following_index;
+}
+
+int ReverseJniBreakIterator::FetchNextBatch() {
+ ICING_ASSIGN_OR_RETURN(
+ libtextclassifier3::ScopedLocalRef<jintArray> break_indices,
+ libtextclassifier3::JniHelper::CallObjectMethod<jintArray>(
+ jni_cache_->GetEnv(), iterator_batcher_.get(),
+ jni_cache_->breakiterator_next, kBatchSize),
+ ReverseJniBreakIterator::kDone);
+ if (break_indices == nullptr || jni_cache_->ExceptionCheckAndClear()) {
+ return ReverseJniBreakIterator::kDone;
+ }
+ jint num_indices = jni_cache_->GetEnv()->GetArrayLength(break_indices.get());
+ if (num_indices == 0) {
+ return ReverseJniBreakIterator::kDone;
+ }
+ jint* break_indices_arr =
+ static_cast<jint*>(jni_cache_->GetEnv()->GetPrimitiveArrayCritical(
+ break_indices.get(), nullptr));
+ for (int i = 0; i < num_indices; ++i) {
+ break_indices_cache_.push(break_indices_arr[i]);
+ }
+ jni_cache_->GetEnv()->ReleasePrimitiveArrayCritical(break_indices.get(),
+ break_indices_arr,
+ /*mode=*/0);
+ return num_indices;
+}
+
+void ReverseJniBreakIterator::ClearCache() {
+ break_indices_cache_ = std::queue<int>();
+ is_done_ = false;
+ is_almost_done_ = false;
+}
+
+} // namespace lib
+} // namespace icing
diff --git a/icing/jni/reverse-jni-break-iterator.h b/icing/jni/reverse-jni-break-iterator.h
new file mode 100644
index 0000000..c1f05f4
--- /dev/null
+++ b/icing/jni/reverse-jni-break-iterator.h
@@ -0,0 +1,124 @@
+// Copyright (C) 2019 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef ICING_JNI_REVERSE_JNI_BREAK_ITERATOR_H_
+#define ICING_JNI_REVERSE_JNI_BREAK_ITERATOR_H_
+
+#include <jni.h>
+
+#include <queue>
+#include <string>
+
+#include "icing/jni/jni-cache.h"
+#include "icing/text_classifier/lib3/utils/java/jni-base.h"
+
+namespace icing {
+namespace lib {
+
+// A class that handles the cross-JNI interactions with BreakIteratorBatcher and
+// hides the batching element to provide an interface akin to
+// java.text.BreakIterator.
+//
+// Example:
+// std::string text = "我每天走路去上班。";
+// ASSERT_THAT(text, SizeIs(27));
+// std::unique_ptr<ReverseJniBreakIterator> itr =
+// ReverseJniBreakIterator::Create(jni_cache, text, locale);
+// std::vector<int> nexts;
+// int next = itr->Next();
+// while (next != ReverseJniBreakIterator::kDone) {
+// nexts.push_back(next);
+// next = itr->Next();
+// }
+// EXPECT_THAT(nexts, ElementsAre(1, 3, 5, 6, 8));
+class ReverseJniBreakIterator {
+ public:
+ static constexpr int kDone = -1;
+
+ // Creates a ReverseJniBreakIterator with the given text and locale.
+ //
+ // Returns:
+ // A ReverseJniBreakIterator on success
+ // INVALID_ARGUMENT if jni_cache isn't a valid JniCache pointer
+ // ABORTED or INTERNAL if unable to create any of the required Java objects
+ static libtextclassifier3::StatusOr<std::unique_ptr<ReverseJniBreakIterator>>
+ Create(const JniCache* jni_cache, std::string_view text,
+ std::string_view locale);
+
+ // Returns the UTF-16 boundary following the current boundary. If the current
+ // boundary is the last text boundary, it returns
+ // ReverseJniBreakIterator::kDone.
+ //
+ // NOTE: The 'boundary' refers to the UTF-16 boundary - NOT the UTF-8
+ // boundary. Callers interested in the UTF-8 boundary are required to maintain
+ // whatever state is necessary to translate from UTF-16 to UTF-8 boundaries.
+ int Next();
+
+ // Returns the first UTF-16 boundary. The iterator's current position is set
+ // to the first text boundary and any cached data is cleared.
+ int First();
+
+ // Returns the position of the first UTF-16 boundary preceding the UTF-16
+ // offset. If there is no boundary preceding the specified offset, then
+ // ReverseJniBreakIterator::kDone is returned.
+ //
+ // The iterator's current position is set to the segment whose boundary was
+ // returned and any cached data is cleared.
+ int Preceding(int offset);
+
+ // Returns the position of the first UTF-16 boundary following the UTF-16
+ // offset. If there is no boundary following the specified offset, then
+ // ReverseJniBreakIterator::kDone is returned.
+ //
+ // The iterator's current position is set to the segment whose boundary
+ // was returned and any cached data is cleared.
+ int Following(int offset);
+
+ private:
+ ReverseJniBreakIterator(
+ const JniCache* jni_cache,
+ libtextclassifier3::ScopedGlobalRef<jobject> iterator_batcher);
+
+ // Fetches the results of up to kBatchSize next calls and stores them in
+ // break_indices_cache_. Returns the number of results or kDone if no more
+ // results could be fetched.
+ int FetchNextBatch();
+
+ // Empties the cache and sets is_done_ and is_almost_done_ to false.
+ void ClearCache();
+
+ // Keeps track of references to Java classes and methods. Does NOT own.
+ const JniCache* jni_cache_;
+
+ // The reference to the actual instance of BreakIteratorBatcher that
+ // this class interacts with.
+ libtextclassifier3::ScopedGlobalRef<jobject> iterator_batcher_;
+
+ // The cache holding the most recent batch of return values from
+ // BreakIteratorBatcher#next.
+ std::queue<int> break_indices_cache_;
+
+ bool is_done_;
+
+ // The last batch was incomplete (< kBatchSize results were returned). The
+ // next call to BreakIteratorBatcher#next is guaranteed to return an
+ // empty array. Once the results from the last batch are evicted from
+ // break_indices_cache_, ReverseJniBreakIterator will transition to is_done_.
+ bool is_almost_done_;
+};
+
+} // namespace lib
+} // namespace icing
+
+#endif // ICING_JNI_REVERSE_JNI_BREAK_ITERATOR_H_
diff --git a/icing/legacy/index/icing-dynamic-trie.cc b/icing/legacy/index/icing-dynamic-trie.cc
index 960d003..ee3d3a2 100644
--- a/icing/legacy/index/icing-dynamic-trie.cc
+++ b/icing/legacy/index/icing-dynamic-trie.cc
@@ -11,9 +11,6 @@
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
-
-// Copyright 2011 Google Inc. All Rights Reserved.
-// Author: ulas@google.com (Ulas Kirazci)
//
// We store the trie in three areas: nodes, nexts and suffixes.
//
@@ -84,7 +81,7 @@
#include "icing/legacy/index/icing-filesystem.h"
#include "icing/legacy/index/icing-flash-bitmap.h"
#include "icing/legacy/index/icing-mmapper.h"
-#include "icing/util/icu-i18n-utils.h"
+#include "icing/util/i18n-utils.h"
#include "icing/util/logging.h"
#include "icing/util/math-util.h"
@@ -250,6 +247,11 @@ class IcingDynamicTrie::IcingDynamicTrieStorage {
const IcingFilesystem &filesystem);
bool Sync();
uint64_t GetDiskUsage() const;
+
+ // Returns the size of the elements held in the trie. This excludes the size
+ // of any internal metadata of the trie, e.g. the trie's header.
+ uint64_t GetElementsFileSize() const;
+
void Warm();
void Clear();
@@ -696,6 +698,18 @@ uint64_t IcingDynamicTrie::IcingDynamicTrieStorage::GetDiskUsage() const {
return total;
}
+uint64_t IcingDynamicTrie::IcingDynamicTrieStorage::GetElementsFileSize()
+ const {
+ // Trie files themselves, exclude size of the header. These arrays are dense,
+ // not sparse, so use file size for more accurate numbers.
+ uint64_t total = 0;
+ for (int i = 0; i < NUM_ARRAY_TYPES; i++) {
+ IcingFilesystem::IncrementByOrSetInvalid(
+ filesystem_->GetFileSize(array_fds_[i].get()), &total);
+ }
+ return total;
+}
+
IcingDynamicTrie::Node *IcingDynamicTrie::IcingDynamicTrieStorage::AllocNode() {
if (nodes_left() == 0) {
ICING_LOG(FATAL) << "No allocated nodes left";
@@ -1154,6 +1168,30 @@ uint64_t IcingDynamicTrie::GetDiskUsage() const {
return total;
}
+uint64_t IcingDynamicTrie::GetElementsSize() const {
+ uint64_t total = 0;
+
+ // Bitmaps are sparsely populated, so disk usage is more accurate for those.
+ // Property bitmaps.
+ IcingFilesystem::IncrementByOrSetInvalid(deleted_bitmap_->GetDiskUsage(),
+ &total);
+ // The deleted bitmap is always initially grown to kGrowSize, whether there
+ // are elements or not. So even if there are no elements in the trie, we'll
+ // still have the bitmap of size kGrowSize, so subtract that from the size of
+ // the trie's elements.
+ total -= IcingFlashBitmap::kGrowSize;
+
+ for (auto &bitmap : property_bitmaps_) {
+ if (bitmap == nullptr) continue;
+ IcingFilesystem::IncrementByOrSetInvalid(bitmap->GetDiskUsage(), &total);
+ }
+
+ // Storage. We can use file size here since the storage files aren't sparse.
+ IcingFilesystem::IncrementByOrSetInvalid(storage_->GetElementsFileSize(),
+ &total);
+ return total;
+}
+
std::unique_ptr<IcingFlashBitmap> IcingDynamicTrie::OpenAndInitBitmap(
const std::string &filename, bool verify,
const IcingFilesystem *filesystem) {
@@ -1868,7 +1906,7 @@ void IcingDynamicTrie::Utf8Iterator::LeftBranchToUtf8End() {
// If we start with non-ascii, take all left branches while there is
// a continuation byte.
- if (!icu_i18n_utils::IsAscii(cur_[cur_len_ - 1])) {
+ if (!i18n_utils::IsAscii(cur_[cur_len_ - 1])) {
while (!node->is_leaf()) {
if (cur_len_ >= U8_MAX_LENGTH) break;
@@ -1877,8 +1915,8 @@ void IcingDynamicTrie::Utf8Iterator::LeftBranchToUtf8End() {
if (branch_end_->child->val() == 0) {
// Check if we already have a valid cur_.
cur_[cur_len_] = 0;
- UChar32 uchar32 = icu_i18n_utils::GetUChar32At(cur_, cur_len_, 0);
- if (uchar32 == icu_i18n_utils::kInvalidUChar32 &&
+ UChar32 uchar32 = i18n_utils::GetUChar32At(cur_, cur_len_, 0);
+ if (uchar32 == i18n_utils::kInvalidUChar32 &&
node->log2_num_children() > 0) {
branch_end_->child++;
} else {
diff --git a/icing/legacy/index/icing-dynamic-trie.h b/icing/legacy/index/icing-dynamic-trie.h
index 6b39c56..7136ef8 100644
--- a/icing/legacy/index/icing-dynamic-trie.h
+++ b/icing/legacy/index/icing-dynamic-trie.h
@@ -48,7 +48,8 @@
#include "icing/legacy/index/icing-mmapper.h"
#include "icing/legacy/index/icing-storage.h"
#include "icing/legacy/index/proto/icing-dynamic-trie-header.pb.h"
-#include "icing/util/icu-i18n-utils.h"
+#include "icing/util/i18n-utils.h"
+#include "unicode/utf8.h"
namespace icing {
namespace lib {
@@ -265,6 +266,10 @@ class IcingDynamicTrie : public IIcingStorage {
bool Remove() override;
uint64_t GetDiskUsage() const override;
+ // Returns the size of the elements held in the trie. This excludes the size
+ // of any internal metadata of the trie, e.g. the trie's header.
+ uint64_t GetElementsSize() const;
+
// REQUIRED: For all functions below is_initialized() == true.
// Number of keys in trie.
diff --git a/icing/legacy/index/icing-flash-bitmap.h b/icing/legacy/index/icing-flash-bitmap.h
index 9abd369..3b3521a 100644
--- a/icing/legacy/index/icing-flash-bitmap.h
+++ b/icing/legacy/index/icing-flash-bitmap.h
@@ -11,9 +11,6 @@
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
-
-// Copyright 2012 Google Inc. All Rights Reserved.
-// Author: ulas@google.com (Ulas Kirazci)
//
// A disk-backed bitmap.
//
diff --git a/icing/proto/document.proto b/icing/proto/document.proto
index 0a8b6f8..bed33b0 100644
--- a/icing/proto/document.proto
+++ b/icing/proto/document.proto
@@ -20,6 +20,7 @@ import "icing/proto/status.proto";
option java_package = "com.google.android.icing.proto";
option java_multiple_files = true;
+option objc_class_prefix = "ICNG";
// Defines a unit of data understood by the IcingSearchEngine.
// Next tag: 9
@@ -108,6 +109,7 @@ message PutResultProto {
message GetResultProto {
// Status code can be one of:
// OK
+ // FAILED_PRECONDITION
// NOT_FOUND
// INTERNAL
//
@@ -127,6 +129,7 @@ message GetResultProto {
message DeleteResultProto {
// Status code can be one of:
// OK
+ // FAILED_PRECONDITION
// NOT_FOUND
// INTERNAL
//
@@ -142,6 +145,7 @@ message DeleteResultProto {
message DeleteByNamespaceResultProto {
// Status code can be one of:
// OK
+ // FAILED_PRECONDITION
// NOT_FOUND
// INTERNAL
//
@@ -157,6 +161,7 @@ message DeleteByNamespaceResultProto {
message DeleteBySchemaTypeResultProto {
// Status code can be one of:
// OK
+ // FAILED_PRECONDITION
// NOT_FOUND
// INTERNAL
//
diff --git a/icing/proto/document_wrapper.proto b/icing/proto/document_wrapper.proto
index 0666e72..e8eb992 100644
--- a/icing/proto/document_wrapper.proto
+++ b/icing/proto/document_wrapper.proto
@@ -21,6 +21,8 @@ import "icing/proto/document.proto";
option java_package = "com.google.android.icing.proto";
option java_multiple_files = true;
+option objc_class_prefix = "ICNG";
+
// DocumentWrapper as a wrapper of the user-facing DocumentProto is meant to
// be used by icing team internally. It stores the original document provided
// by library users and metadata of the document which shouldn't be exposed to
diff --git a/icing/proto/initialize.proto b/icing/proto/initialize.proto
index 813cdb5..eac88e6 100644
--- a/icing/proto/initialize.proto
+++ b/icing/proto/initialize.proto
@@ -21,6 +21,8 @@ import "icing/proto/status.proto";
option java_package = "com.google.android.icing.proto";
option java_multiple_files = true;
+option objc_class_prefix = "ICNG";
+
// Next tag: 5
message IcingSearchEngineOptions {
// Directory to persist files for Icing. Required.
diff --git a/icing/proto/optimize.proto b/icing/proto/optimize.proto
index 2bf28e8..1baa64c 100644
--- a/icing/proto/optimize.proto
+++ b/icing/proto/optimize.proto
@@ -20,12 +20,14 @@ import "icing/proto/status.proto";
option java_package = "com.google.android.icing.proto";
option java_multiple_files = true;
+option objc_class_prefix = "ICNG";
// Result of a call to IcingSearchEngine.Optimize
// Next tag: 2
message OptimizeResultProto {
// Status code can be one of:
// OK
+ // FAILED_PRECONDITION
// WARNING_DATA_LOSS
// ABORTED
// INTERNAL
@@ -36,3 +38,23 @@ message OptimizeResultProto {
// TODO(b/147699081): Add a field to indicate lost_schema and lost_documents.
// go/icing-library-apis.
}
+
+// Result of a call to IcingSearchEngine.GetOptimizeInfo
+// Next tag: 4
+message GetOptimizeInfoResultProto {
+ // Status code can be one of:
+ // OK
+ // FAILED_PRECONDITION
+ // INTERNAL
+ //
+ // See status.proto for more details.
+ optional StatusProto status = 1;
+
+ // Documents that have expired or been deleted, but are still taking up space
+ // in IcingSearchEngine.
+ optional int64 optimizable_docs = 2;
+
+ // Estimated bytes that could be recovered. The exact size per document isn't
+ // tracked, so this is based off an average document size.
+ optional int64 estimated_optimizable_bytes = 3;
+}
diff --git a/icing/proto/persist.proto b/icing/proto/persist.proto
index 5b5a737..77cf987 100644
--- a/icing/proto/persist.proto
+++ b/icing/proto/persist.proto
@@ -20,12 +20,14 @@ import "icing/proto/status.proto";
option java_package = "com.google.android.icing.proto";
option java_multiple_files = true;
+option objc_class_prefix = "ICNG";
// Result of a call to IcingSearchEngine.Persist
// Next tag: 2
message PersistToDiskResultProto {
// Status code can be one of:
// OK
+ // FAILED_PRECONDITION
// INTERNAL
//
// See status.proto for more details.
diff --git a/icing/proto/reset.proto b/icing/proto/reset.proto
index 9a7fa9a..5e8b9f5 100644
--- a/icing/proto/reset.proto
+++ b/icing/proto/reset.proto
@@ -21,6 +21,8 @@ import "icing/proto/status.proto";
option java_package = "com.google.android.icing.proto";
option java_multiple_files = true;
+option objc_class_prefix = "ICNG";
+
// Result of a call to IcingSearchEngine.Reset
// Next tag: 2
message ResetResultProto {
diff --git a/icing/proto/schema.proto b/icing/proto/schema.proto
index cabccaa..3a7ee5d 100644
--- a/icing/proto/schema.proto
+++ b/icing/proto/schema.proto
@@ -21,6 +21,7 @@ import "icing/proto/term.proto";
option java_package = "com.google.android.icing.proto";
option java_multiple_files = true;
+option objc_class_prefix = "ICNG";
// Defines the schema that every Document of a specific "type" should adhere
// to. These can be considered as definitions of rich structured types for
@@ -204,6 +205,7 @@ message SetSchemaResultProto {
message GetSchemaResultProto {
// Status code can be one of:
// OK
+ // FAILED_PRECONDITION
// NOT_FOUND
// INTERNAL
//
diff --git a/icing/proto/scoring.proto b/icing/proto/scoring.proto
index ad536b4..667ff4f 100644
--- a/icing/proto/scoring.proto
+++ b/icing/proto/scoring.proto
@@ -19,6 +19,8 @@ package icing.lib;
option java_package = "com.google.android.icing.proto";
option java_multiple_files = true;
+option objc_class_prefix = "ICNG";
+
// Encapsulates the configurations on how Icing should score and rank the search
// results.
// Next tag: 3
@@ -26,9 +28,8 @@ message ScoringSpecProto {
// OPTIONAL: Indicates how the search results will be ranked.
message RankingStrategy {
enum Code {
- // No ranking strategy specified, documents will be returned in the
- // default order that the most recent document inserted into Icing comes
- // first.
+ // No ranking strategy specified, documents may be returned in an
+ // arbitrary order.
NONE = 0;
// Ranked by user-provided document scores.
diff --git a/icing/proto/search.proto b/icing/proto/search.proto
index 085575a..8ea5036 100644
--- a/icing/proto/search.proto
+++ b/icing/proto/search.proto
@@ -22,6 +22,7 @@ import "icing/proto/term.proto";
option java_package = "com.google.android.icing.proto";
option java_multiple_files = true;
+option objc_class_prefix = "ICNG";
// Client-supplied specifications on what documents to retrieve.
// Next tag: 5
@@ -148,6 +149,7 @@ message SnippetProto {
message SearchResultProto {
// Status code can be one of:
// OK
+ // FAILED_PRECONDITION
// INVALID_ARGUMENT
// ABORTED
// INTERNAL
diff --git a/icing/proto/status.proto b/icing/proto/status.proto
index 418b2e8..2733a15 100644
--- a/icing/proto/status.proto
+++ b/icing/proto/status.proto
@@ -19,6 +19,8 @@ package icing.lib;
option java_package = "com.google.android.icing.proto";
option java_multiple_files = true;
+option objc_class_prefix = "ICNG";
+
// Canonical status to indicate the results of API calls.
// Next tag: 3
message StatusProto {
diff --git a/icing/proto/term.proto b/icing/proto/term.proto
index 30cd1bc..adf2ad6 100644
--- a/icing/proto/term.proto
+++ b/icing/proto/term.proto
@@ -19,6 +19,8 @@ package icing.lib;
option java_package = "com.google.android.icing.proto";
option java_multiple_files = true;
+option objc_class_prefix = "ICNG";
+
// Encapsulates the configurations on how Icing should query/index these terms.
// Next tag: 0
message TermMatchType {
diff --git a/icing/query/query-processor_benchmark.cc b/icing/query/query-processor_benchmark.cc
index 5775e83..000bf3a 100644
--- a/icing/query/query-processor_benchmark.cc
+++ b/icing/query/query-processor_benchmark.cc
@@ -16,7 +16,7 @@
#include "gmock/gmock.h"
#include "third_party/absl/flags/flag.h"
#include "icing/document-builder.h"
-#include "icing/icu-data-file-helper.h"
+#include "icing/helpers/icu/icu-data-file-helper.h"
#include "icing/index/index.h"
#include "icing/proto/term.pb.h"
#include "icing/query/query-processor.h"
@@ -80,7 +80,7 @@ std::unique_ptr<Index> CreateIndex(const IcingFilesystem& filesystem,
std::unique_ptr<Normalizer> CreateNormalizer() {
return normalizer_factory::Create(
- normalizer_factory::NormalizerType::ICU4C,
+
/*max_term_byte_size=*/std::numeric_limits<int>::max())
.ValueOrDie();
}
@@ -108,8 +108,7 @@ void BM_QueryOneTerm(benchmark::State& state) {
std::unique_ptr<Index> index = CreateIndex(icing_filesystem, index_dir);
std::unique_ptr<LanguageSegmenter> language_segmenter =
- language_segmenter_factory::Create(language_segmenter_factory::ICU4C)
- .ValueOrDie();
+ language_segmenter_factory::Create().ValueOrDie();
std::unique_ptr<Normalizer> normalizer = CreateNormalizer();
FakeClock fake_clock;
@@ -221,8 +220,7 @@ void BM_QueryFiveTerms(benchmark::State& state) {
std::unique_ptr<Index> index = CreateIndex(icing_filesystem, index_dir);
std::unique_ptr<LanguageSegmenter> language_segmenter =
- language_segmenter_factory::Create(language_segmenter_factory::ICU4C)
- .ValueOrDie();
+ language_segmenter_factory::Create().ValueOrDie();
std::unique_ptr<Normalizer> normalizer = CreateNormalizer();
FakeClock fake_clock;
@@ -352,8 +350,7 @@ void BM_QueryDiacriticTerm(benchmark::State& state) {
std::unique_ptr<Index> index = CreateIndex(icing_filesystem, index_dir);
std::unique_ptr<LanguageSegmenter> language_segmenter =
- language_segmenter_factory::Create(language_segmenter_factory::ICU4C)
- .ValueOrDie();
+ language_segmenter_factory::Create().ValueOrDie();
std::unique_ptr<Normalizer> normalizer = CreateNormalizer();
FakeClock fake_clock;
@@ -468,8 +465,7 @@ void BM_QueryHiragana(benchmark::State& state) {
std::unique_ptr<Index> index = CreateIndex(icing_filesystem, index_dir);
std::unique_ptr<LanguageSegmenter> language_segmenter =
- language_segmenter_factory::Create(language_segmenter_factory::ICU4C)
- .ValueOrDie();
+ language_segmenter_factory::Create().ValueOrDie();
std::unique_ptr<Normalizer> normalizer = CreateNormalizer();
FakeClock fake_clock;
diff --git a/icing/query/query-processor_test.cc b/icing/query/query-processor_test.cc
index 99a552e..7dfc326 100644
--- a/icing/query/query-processor_test.cc
+++ b/icing/query/query-processor_test.cc
@@ -22,7 +22,7 @@
#include "gtest/gtest.h"
#include "icing/document-builder.h"
#include "icing/file/filesystem.h"
-#include "icing/icu-data-file-helper.h"
+#include "icing/helpers/icu/icu-data-file-helper.h"
#include "icing/index/hit/doc-hit-info.h"
#include "icing/index/index.h"
#include "icing/index/iterator/doc-hit-info-iterator-test-util.h"
@@ -102,14 +102,11 @@ class QueryProcessorTest : public Test {
ICING_ASSERT_OK_AND_ASSIGN(index_,
Index::Create(options, &icing_filesystem_));
- ICING_ASSERT_OK_AND_ASSIGN(
- language_segmenter_,
- language_segmenter_factory::Create(language_segmenter_factory::ICU4C));
+ ICING_ASSERT_OK_AND_ASSIGN(language_segmenter_,
+ language_segmenter_factory::Create());
- ICING_ASSERT_OK_AND_ASSIGN(
- normalizer_,
- normalizer_factory::Create(normalizer_factory::NormalizerType::ICU4C,
- /*max_term_byte_size=*/1000));
+ ICING_ASSERT_OK_AND_ASSIGN(normalizer_, normalizer_factory::Create(
+ /*max_term_byte_size=*/1000));
}
libtextclassifier3::Status AddTokenToIndex(
diff --git a/icing/result/result-retriever_test.cc b/icing/result/result-retriever_test.cc
index cfce6e2..36dbfd9 100644
--- a/icing/result/result-retriever_test.cc
+++ b/icing/result/result-retriever_test.cc
@@ -20,7 +20,7 @@
#include "gtest/gtest.h"
#include "icing/document-builder.h"
#include "icing/file/mock-filesystem.h"
-#include "icing/icu-data-file-helper.h"
+#include "icing/helpers/icu/icu-data-file-helper.h"
#include "icing/portable/equals-proto.h"
#include "icing/proto/document.pb.h"
#include "icing/proto/schema.pb.h"
@@ -59,16 +59,13 @@ class ResultRetrieverTest : public testing::Test {
// File generated via icu_data_file rule in //icing/BUILD.
icu_data_file_helper::SetUpICUDataFile(
GetTestFilePath("icing/icu.dat")));
- ICING_ASSERT_OK_AND_ASSIGN(
- language_segmenter_,
- language_segmenter_factory::Create(language_segmenter_factory::ICU4C));
+ ICING_ASSERT_OK_AND_ASSIGN(language_segmenter_,
+ language_segmenter_factory::Create());
ICING_ASSERT_OK_AND_ASSIGN(schema_store_,
SchemaStore::Create(&filesystem_, test_dir_));
- ICING_ASSERT_OK_AND_ASSIGN(
- normalizer_,
- normalizer_factory::Create(normalizer_factory::NormalizerType::ICU4C,
- /*max_term_byte_size=*/10000));
+ ICING_ASSERT_OK_AND_ASSIGN(normalizer_, normalizer_factory::Create(
+ /*max_term_byte_size=*/10000));
SchemaProto schema;
auto type_config = schema.add_types();
diff --git a/icing/result/snippet-retriever.cc b/icing/result/snippet-retriever.cc
index faf9e18..09d0f7a 100644
--- a/icing/result/snippet-retriever.cc
+++ b/icing/result/snippet-retriever.cc
@@ -35,7 +35,7 @@
#include "icing/tokenization/tokenizer-factory.h"
#include "icing/tokenization/tokenizer.h"
#include "icing/transform/normalizer.h"
-#include "icing/util/icu-i18n-utils.h"
+#include "icing/util/i18n-utils.h"
#include "icing/util/status-macros.h"
namespace icing {
@@ -126,19 +126,18 @@ libtextclassifier3::StatusOr<std::unique_ptr<TokenMatcher>> CreateTokenMatcher(
// Returns true if token matches any of the terms in query terms according to
// the provided match type.
-
+//
// Returns:
// the position of the window start if successful
// INTERNAL_ERROR - if a tokenizer error is encountered
libtextclassifier3::StatusOr<int> DetermineWindowStart(
const ResultSpecProto::SnippetSpecProto& snippet_spec,
std::string_view value, int match_mid, Tokenizer::Iterator* iterator) {
- int window_start_min =
- std::max((match_mid - snippet_spec.max_window_bytes() / 2), 0);
- if (window_start_min == 0) {
+ int window_start_min = (match_mid - snippet_spec.max_window_bytes() / 2) - 1;
+ if (window_start_min < 0) {
return 0;
}
- if (!iterator->ResetToTokenAfter(window_start_min - 1)) {
+ if (!iterator->ResetToTokenAfter(window_start_min)) {
return absl_ports::InternalError(
"Couldn't reset tokenizer to determine snippet window!");
}
@@ -152,8 +151,7 @@ int IncludeTrailingPunctuation(std::string_view value, int window_end_exclusive,
int window_end_max_exclusive) {
while (window_end_exclusive < window_end_max_exclusive) {
int char_len = 0;
- if (!icu_i18n_utils::IsPunctuationAt(value, window_end_exclusive,
- &char_len)) {
+ if (!i18n_utils::IsPunctuationAt(value, window_end_exclusive, &char_len)) {
break;
}
if (window_end_exclusive + char_len > window_end_max_exclusive) {
@@ -174,10 +172,9 @@ libtextclassifier3::StatusOr<int> DetermineWindowEnd(
const ResultSpecProto::SnippetSpecProto& snippet_spec,
std::string_view value, int match_mid, Tokenizer::Iterator* iterator) {
int window_end_max_exclusive =
- std::min((match_mid + snippet_spec.max_window_bytes() / 2),
- static_cast<int>(value.length()));
- if (window_end_max_exclusive == value.length()) {
- return window_end_max_exclusive;
+ match_mid + snippet_spec.max_window_bytes() / 2;
+ if (window_end_max_exclusive >= value.length()) {
+ return value.length();
}
if (!iterator->ResetToTokenBefore(window_end_max_exclusive)) {
return absl_ports::InternalError(
@@ -228,8 +225,11 @@ libtextclassifier3::StatusOr<SnippetMatchProto> RetrieveMatch(
iterator));
snippet_match.set_window_bytes(window_end_exclusive - window_start);
- // Reset the iterator back to the original position.
- if (!iterator->ResetToTokenAfter(match_pos - 1)) {
+ // DetermineWindowStart/End may change the position of the iterator. So,
+ // reset the iterator back to the original position.
+ bool success = (match_pos > 0) ? iterator->ResetToTokenAfter(match_pos - 1)
+ : iterator->ResetToStart();
+ if (!success) {
return absl_ports::InternalError(
"Couldn't reset tokenizer to determine snippet window!");
}
diff --git a/icing/result/snippet-retriever_test.cc b/icing/result/snippet-retriever_test.cc
index 7037ede..3b3bf61 100644
--- a/icing/result/snippet-retriever_test.cc
+++ b/icing/result/snippet-retriever_test.cc
@@ -22,7 +22,7 @@
#include "gtest/gtest.h"
#include "icing/document-builder.h"
#include "icing/file/mock-filesystem.h"
-#include "icing/icu-data-file-helper.h"
+#include "icing/helpers/icu/icu-data-file-helper.h"
#include "icing/portable/equals-proto.h"
#include "icing/proto/document.pb.h"
#include "icing/proto/schema.pb.h"
@@ -60,9 +60,8 @@ class SnippetRetrieverTest : public testing::Test {
// File generated via icu_data_file rule in //icing/BUILD.
icu_data_file_helper::SetUpICUDataFile(
GetTestFilePath("icing/icu.dat")));
- ICING_ASSERT_OK_AND_ASSIGN(
- language_segmenter_,
- language_segmenter_factory::Create(language_segmenter_factory::ICU4C));
+ ICING_ASSERT_OK_AND_ASSIGN(language_segmenter_,
+ language_segmenter_factory::Create());
// Setup the schema
ICING_ASSERT_OK_AND_ASSIGN(schema_store_,
@@ -88,10 +87,8 @@ class SnippetRetrieverTest : public testing::Test {
IndexingConfig::TokenizerType::PLAIN);
ICING_ASSERT_OK(schema_store_->SetSchema(schema));
- ICING_ASSERT_OK_AND_ASSIGN(
- normalizer_,
- normalizer_factory::Create(normalizer_factory::NormalizerType::ICU4C,
- /*max_term_byte_size=*/10000));
+ ICING_ASSERT_OK_AND_ASSIGN(normalizer_, normalizer_factory::Create(
+ /*max_term_byte_size=*/10000));
ICING_ASSERT_OK_AND_ASSIGN(
snippet_retriever_,
SnippetRetriever::Create(schema_store_.get(), language_segmenter_.get(),
diff --git a/icing/schema/schema-util.cc b/icing/schema/schema-util.cc
index df5a820..7413d73 100644
--- a/icing/schema/schema-util.cc
+++ b/icing/schema/schema-util.cc
@@ -256,23 +256,29 @@ void SchemaUtil::BuildTypeConfigMap(
}
}
-void SchemaUtil::BuildPropertyConfigMap(
- const SchemaTypeConfigProto& type_config,
- std::unordered_map<std::string_view, const PropertyConfigProto*>*
- property_config_map,
- int32_t* num_required_properties) {
+SchemaUtil::ParsedPropertyConfigs SchemaUtil::ParsePropertyConfigs(
+ const SchemaTypeConfigProto& type_config) {
+ ParsedPropertyConfigs parsed_property_configs;
+
// TODO(samzheng): consider caching property_config_map for some properties,
// e.g. using LRU cache. Or changing schema.proto to use go/protomap.
- *num_required_properties = 0;
- property_config_map->clear();
for (const PropertyConfigProto& property_config : type_config.properties()) {
- property_config_map->emplace(property_config.property_name(),
- &property_config);
+ parsed_property_configs.property_config_map.emplace(
+ property_config.property_name(), &property_config);
if (property_config.cardinality() ==
PropertyConfigProto::Cardinality::REQUIRED) {
- (*num_required_properties)++;
+ parsed_property_configs.num_required_properties++;
+ }
+
+ // A non-default term_match_type indicates that this property is meant to be
+ // indexed.
+ if (property_config.indexing_config().term_match_type() !=
+ TermMatchType::UNKNOWN) {
+ parsed_property_configs.num_indexed_properties++;
}
}
+
+ return parsed_property_configs;
}
const SchemaUtil::SchemaDelta SchemaUtil::ComputeCompatibilityDelta(
@@ -298,22 +304,21 @@ const SchemaUtil::SchemaDelta SchemaUtil::ComputeCompatibilityDelta(
continue;
}
- std::unordered_map<std::string_view, const PropertyConfigProto*>
- new_property_map;
- int32_t new_required_properties = 0;
- BuildPropertyConfigMap(new_schema_type_and_config->second,
- &new_property_map, &new_required_properties);
+ ParsedPropertyConfigs new_parsed_property_configs =
+ ParsePropertyConfigs(new_schema_type_and_config->second);
// We only need to check the old, existing properties to see if they're
// compatible since we'll have old data that may be invalidated or need to
- // be reindexed. New properties don't have any data that would be
- // invalidated or incompatible, so we blanket accept all new properties.
+ // be reindexed.
int32_t old_required_properties = 0;
+ int32_t old_indexed_properties = 0;
for (const auto& old_property_config : old_type_config.properties()) {
auto new_property_name_and_config =
- new_property_map.find(old_property_config.property_name());
+ new_parsed_property_configs.property_config_map.find(
+ old_property_config.property_name());
- if (new_property_name_and_config == new_property_map.end()) {
+ if (new_property_name_and_config ==
+ new_parsed_property_configs.property_config_map.end()) {
// Didn't find the old property
ICING_VLOG(1) << absl_ports::StrCat("Previously defined property type ",
old_type_config.schema_type(), ".",
@@ -340,6 +345,13 @@ const SchemaUtil::SchemaDelta SchemaUtil::ComputeCompatibilityDelta(
++old_required_properties;
}
+ // A non-default term_match_type indicates that this property is meant to
+ // be indexed.
+ if (old_property_config.indexing_config().term_match_type() !=
+ TermMatchType::UNKNOWN) {
+ ++old_indexed_properties;
+ }
+
// Any change in the indexed property requires a reindexing
if (!IsTermMatchTypeCompatible(old_property_config.indexing_config(),
new_property_config->indexing_config())) {
@@ -352,7 +364,8 @@ const SchemaUtil::SchemaDelta SchemaUtil::ComputeCompatibilityDelta(
// guaranteed from our previous checks that all the old properties are also
// present in the new property config, so we can do a simple int comparison
// here to detect new required properties.
- if (new_required_properties > old_required_properties) {
+ if (new_parsed_property_configs.num_required_properties >
+ old_required_properties) {
ICING_VLOG(1) << absl_ports::StrCat(
"New schema ", old_type_config.schema_type(),
" has REQUIRED properties that are not "
@@ -360,6 +373,18 @@ const SchemaUtil::SchemaDelta SchemaUtil::ComputeCompatibilityDelta(
schema_delta.schema_types_incompatible.insert(
old_type_config.schema_type());
}
+
+ // If we've gained any new indexed properties, then the section ids may
+ // change. Since the section ids are stored in the index, we'll need to
+ // reindex everything.
+ if (new_parsed_property_configs.num_indexed_properties >
+ old_indexed_properties) {
+ ICING_VLOG(1) << absl_ports::StrCat(
+ "Set of indexed properties in schema type '",
+ old_type_config.schema_type(),
+ "' has changed, required reindexing.");
+ schema_delta.index_incompatible = true;
+ }
}
return schema_delta;
diff --git a/icing/schema/schema-util.h b/icing/schema/schema-util.h
index c547ad2..d65dd10 100644
--- a/icing/schema/schema-util.h
+++ b/icing/schema/schema-util.h
@@ -54,6 +54,18 @@ class SchemaUtil {
}
};
+ struct ParsedPropertyConfigs {
+ // Mapping of property name to PropertyConfigProto
+ std::unordered_map<std::string_view, const PropertyConfigProto*>
+ property_config_map;
+
+ // Total number of properties that have an indexing config
+ int32_t num_indexed_properties = 0;
+
+ // Total number of properties that were REQUIRED
+ int32_t num_required_properties = 0;
+ };
+
// This function validates:
// 1. SchemaTypeConfigProto.schema_type's must be unique
// 2. Properties within one SchemaTypeConfigProto must be unique
@@ -81,14 +93,10 @@ class SchemaUtil {
static void BuildTypeConfigMap(const SchemaProto& schema,
TypeConfigMap* type_config_map);
- // Calculate and return a hash map of (property name -> property config)
- // from the given type config. The number of required properties will be
- // assigned to output param num_required_properties.
- static void BuildPropertyConfigMap(
- const SchemaTypeConfigProto& type_config,
- std::unordered_map<std::string_view, const PropertyConfigProto*>*
- property_config_map,
- int32_t* num_required_properties);
+ // Parses the given type_config and returns a struct of easily-parseable
+ // information about the properties.
+ static ParsedPropertyConfigs ParsePropertyConfigs(
+ const SchemaTypeConfigProto& type_config);
// Computes the delta between the old and new schema. There are a few
// differences that'll be reported:
diff --git a/icing/schema/schema-util_test.cc b/icing/schema/schema-util_test.cc
index 64473b8..a3ab96f 100644
--- a/icing/schema/schema-util_test.cc
+++ b/icing/schema/schema-util_test.cc
@@ -502,6 +502,40 @@ TEST_F(SchemaUtilTest, ChangingIndexedPropertiesMakesIndexIncompatible) {
Eq(schema_delta));
}
+TEST_F(SchemaUtilTest, AddingNewIndexedPropertyMakesIndexIncompatible) {
+ // Configure old schema
+ SchemaProto old_schema;
+ auto old_type = old_schema.add_types();
+ *old_type = CreateSchemaTypeConfig(kEmailType, kPersonType);
+
+ auto old_property = old_type->add_properties();
+ old_property->set_property_name("Property");
+ old_property->set_data_type(PropertyConfigProto::DataType::STRING);
+ old_property->set_cardinality(PropertyConfigProto::Cardinality::OPTIONAL);
+
+ // Configure new schema
+ SchemaProto new_schema;
+ auto new_type = new_schema.add_types();
+ *new_type = CreateSchemaTypeConfig(kEmailType, kPersonType);
+
+ auto new_property = new_type->add_properties();
+ new_property->set_property_name("Property");
+ new_property->set_data_type(PropertyConfigProto::DataType::STRING);
+ new_property->set_cardinality(PropertyConfigProto::Cardinality::OPTIONAL);
+
+ new_property = new_type->add_properties();
+ new_property->set_property_name("NewIndexedProperty");
+ new_property->set_data_type(PropertyConfigProto::DataType::STRING);
+ new_property->set_cardinality(PropertyConfigProto::Cardinality::OPTIONAL);
+ new_property->mutable_indexing_config()->set_term_match_type(
+ TermMatchType::EXACT_ONLY);
+
+ SchemaUtil::SchemaDelta schema_delta;
+ schema_delta.index_incompatible = true;
+ EXPECT_THAT(SchemaUtil::ComputeCompatibilityDelta(old_schema, new_schema),
+ Eq(schema_delta));
+}
+
TEST_F(SchemaUtilTest, AddingTypeIsCompatible) {
// Can add a new type, existing data isn't incompatible, since none of them
// are of this new schema type
diff --git a/icing/store/document-store.cc b/icing/store/document-store.cc
index e2457d0..ae8360b 100644
--- a/icing/store/document-store.cc
+++ b/icing/store/document-store.cc
@@ -1235,6 +1235,59 @@ libtextclassifier3::Status DocumentStore::OptimizeInto(
return libtextclassifier3::Status::OK;
}
+libtextclassifier3::StatusOr<DocumentStore::OptimizeInfo>
+DocumentStore::GetOptimizeInfo() const {
+ OptimizeInfo optimize_info;
+
+ // Figure out our ratio of optimizable/total docs.
+ int32_t num_documents = document_id_mapper_->num_elements();
+ for (DocumentId document_id = kMinDocumentId; document_id < num_documents;
+ ++document_id) {
+ if (!DoesDocumentExist(document_id)) {
+ ++optimize_info.optimizable_docs;
+ }
+
+ ++optimize_info.total_docs;
+ }
+
+ if (optimize_info.total_docs == 0) {
+ // Can exit early since there's nothing to calculate.
+ return optimize_info;
+ }
+
+ // Get the total element size.
+ //
+ // We use file size instead of disk usage here because the files are not
+ // sparse, so it's more accurate. Disk usage rounds up to the nearest block
+ // size.
+ ICING_ASSIGN_OR_RETURN(const int64_t document_log_file_size,
+ document_log_->GetElementsFileSize());
+ ICING_ASSIGN_OR_RETURN(const int64_t document_id_mapper_file_size,
+ document_id_mapper_->GetElementsFileSize());
+ ICING_ASSIGN_OR_RETURN(const int64_t score_cache_file_size,
+ score_cache_->GetElementsFileSize());
+ ICING_ASSIGN_OR_RETURN(const int64_t filter_cache_file_size,
+ filter_cache_->GetElementsFileSize());
+
+ // We use a combined disk usage and file size for the KeyMapper because it's
+ // backed by a trie, which has some sparse property bitmaps.
+ ICING_ASSIGN_OR_RETURN(const int64_t document_key_mapper_size,
+ document_key_mapper_->GetElementsSize());
+
+ // We don't include the namespace mapper because it's not clear if we could
+ // recover any space even if Optimize were called. Deleting 100s of documents
+ // could still leave a few documents of a namespace, and then there would be
+ // no change.
+
+ int64_t total_size = document_log_file_size + document_key_mapper_size +
+ document_id_mapper_file_size + score_cache_file_size +
+ filter_cache_file_size;
+
+ optimize_info.estimated_optimizable_bytes =
+ total_size * optimize_info.optimizable_docs / optimize_info.total_docs;
+ return optimize_info;
+}
+
libtextclassifier3::Status DocumentStore::UpdateDocumentAssociatedScoreCache(
DocumentId document_id, const DocumentAssociatedScoreData& score_data) {
return score_cache_->Set(document_id, score_data);
diff --git a/icing/store/document-store.h b/icing/store/document-store.h
index 891b199..3547214 100644
--- a/icing/store/document-store.h
+++ b/icing/store/document-store.h
@@ -54,6 +54,20 @@ class DocumentStore {
uint32_t checksum;
};
+ struct OptimizeInfo {
+ // The estimated size in bytes of the optimizable docs. We don't track the
+ // size of each document, so we estimate by taking the size of the entire
+ // DocumentStore and dividing that by the total number of documents we have.
+ // So we end up with an average document size.
+ int64_t estimated_optimizable_bytes = 0;
+
+ // Number of total documents the DocumentStore tracks.
+ int32_t total_docs = 0;
+
+ // Number of optimizable (deleted + expired) docs the DocumentStore tracks.
+ int32_t optimizable_docs = 0;
+ };
+
// Not copyable
DocumentStore(const DocumentStore&) = delete;
DocumentStore& operator=(const DocumentStore&) = delete;
@@ -208,7 +222,8 @@ class DocumentStore {
// INTERNAL on I/O error
libtextclassifier3::Status PersistToDisk();
- // Calculates and returns the disk usage in bytes.
+ // Calculates and returns the disk usage in bytes. Rounds up to the nearest
+ // block size.
//
// Returns:
// Disk usage on success
@@ -273,6 +288,15 @@ class DocumentStore {
// INTERNAL_ERROR on IO error
libtextclassifier3::Status OptimizeInto(const std::string& new_directory);
+ // Calculates status for a potential Optimize call. Includes how many docs
+ // there are vs how many would be optimized away. And also includes an
+ // estimated size gains, in bytes, if Optimize were called.
+ //
+ // Returns:
+ // OptimizeInfo on success
+ // INTERNAL_ERROR on IO error
+ libtextclassifier3::StatusOr<OptimizeInfo> GetOptimizeInfo() const;
+
// Computes the combined checksum of the document store - includes the ground
// truth and all derived files.
//
diff --git a/icing/store/document-store_test.cc b/icing/store/document-store_test.cc
index 5ec062f..f59d2e2 100644
--- a/icing/store/document-store_test.cc
+++ b/icing/store/document-store_test.cc
@@ -1966,5 +1966,53 @@ TEST_F(DocumentStoreTest,
IsOkAndHolds(EqualsProto(message_document)));
}
+TEST_F(DocumentStoreTest, GetOptimizeInfo) {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<DocumentStore> document_store,
+ DocumentStore::Create(&filesystem_, document_store_dir_, &fake_clock_,
+ schema_store_.get()));
+
+ // Nothing should be optimizable yet
+ ICING_ASSERT_OK_AND_ASSIGN(DocumentStore::OptimizeInfo optimize_info,
+ document_store->GetOptimizeInfo());
+ EXPECT_THAT(optimize_info.total_docs, Eq(0));
+ EXPECT_THAT(optimize_info.optimizable_docs, Eq(0));
+ EXPECT_THAT(optimize_info.estimated_optimizable_bytes, Eq(0));
+
+ ICING_EXPECT_OK(document_store->Put(DocumentProto(test_document1_)));
+
+ // Adding a document, still nothing is optimizable
+ ICING_ASSERT_OK_AND_ASSIGN(optimize_info, document_store->GetOptimizeInfo());
+ EXPECT_THAT(optimize_info.total_docs, Eq(1));
+ EXPECT_THAT(optimize_info.optimizable_docs, Eq(0));
+ EXPECT_THAT(optimize_info.estimated_optimizable_bytes, Eq(0));
+
+ // Delete a document. Now something is optimizable
+ ICING_EXPECT_OK(document_store->Delete(test_document1_.namespace_(),
+ test_document1_.uri()));
+ ICING_ASSERT_OK_AND_ASSIGN(optimize_info, document_store->GetOptimizeInfo());
+ EXPECT_THAT(optimize_info.total_docs, Eq(1));
+ EXPECT_THAT(optimize_info.optimizable_docs, Eq(1));
+ EXPECT_THAT(optimize_info.estimated_optimizable_bytes, Gt(0));
+
+ // Optimize it into a different directory, should bring us back to nothing
+ // since all documents were optimized away.
+ std::string optimized_dir = document_store_dir_ + "_optimize";
+ EXPECT_TRUE(filesystem_.DeleteDirectoryRecursively(optimized_dir.c_str()));
+ EXPECT_TRUE(filesystem_.CreateDirectoryRecursively(optimized_dir.c_str()));
+ ICING_ASSERT_OK(document_store->OptimizeInto(optimized_dir));
+ document_store.reset();
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<DocumentStore> optimized_document_store,
+ DocumentStore::Create(&filesystem_, optimized_dir, &fake_clock_,
+ schema_store_.get()));
+
+ ICING_ASSERT_OK_AND_ASSIGN(optimize_info,
+ optimized_document_store->GetOptimizeInfo());
+ EXPECT_THAT(optimize_info.total_docs, Eq(0));
+ EXPECT_THAT(optimize_info.optimizable_docs, Eq(0));
+ EXPECT_THAT(optimize_info.estimated_optimizable_bytes, Eq(0));
+}
+
} // namespace lib
} // namespace icing
diff --git a/icing/store/key-mapper.h b/icing/store/key-mapper.h
index b01a8f1..a85b00d 100644
--- a/icing/store/key-mapper.h
+++ b/icing/store/key-mapper.h
@@ -99,13 +99,23 @@ class KeyMapper {
// INTERNAL on I/O error
libtextclassifier3::Status PersistToDisk();
- // Calculates and returns the disk usage in bytes.
+ // Calculates and returns the disk usage in bytes. Rounds up to the nearest
+ // block size.
//
// Returns:
// Disk usage on success
// INTERNAL_ERROR on IO error
libtextclassifier3::StatusOr<int64_t> GetDiskUsage() const;
+ // Returns the size of the elements held in the key mapper. This excludes the
+ // size of any internal metadata of the key mapper, e.g. the key mapper's
+ // header.
+ //
+ // Returns:
+ // File size on success
+ // INTERNAL_ERROR on IO error
+ libtextclassifier3::StatusOr<int64_t> GetElementsSize() const;
+
// Computes and returns the checksum of the header and contents.
Crc32 ComputeChecksum();
@@ -261,6 +271,16 @@ libtextclassifier3::StatusOr<int64_t> KeyMapper<T>::GetDiskUsage() const {
}
template <typename T>
+libtextclassifier3::StatusOr<int64_t> KeyMapper<T>::GetElementsSize() const {
+ int64_t size = trie_.GetElementsSize();
+ if (size == IcingFilesystem::kBadFileSize || size < 0) {
+ return absl_ports::InternalError(
+ "Failed to get disk usage of elements in the key mapper");
+ }
+ return size;
+}
+
+template <typename T>
Crc32 KeyMapper<T>::ComputeChecksum() {
return Crc32(trie_.UpdateCrc());
}
diff --git a/icing/testing/logging-event-listener.cc b/icing/testing/logging-event-listener.cc
new file mode 100644
index 0000000..4b42825
--- /dev/null
+++ b/icing/testing/logging-event-listener.cc
@@ -0,0 +1,121 @@
+// Copyright (C) 2019 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "icing/testing/logging-event-listener.h"
+
+#include "icing/legacy/core/icing-string-util.h"
+#include "icing/util/logging.h"
+
+namespace icing {
+namespace lib {
+
+void LoggingEventListener::OnTestProgramStart(
+ const testing::UnitTest& /* unit_test */) {}
+
+void LoggingEventListener::OnTestIterationStart(
+ const testing::UnitTest& unit_test, int iteration) {
+ ICING_LOG(INFO) << "[==========] Running " << unit_test.test_to_run_count()
+ << " test(s) from " << unit_test.test_case_to_run_count()
+ << " test case(s)";
+}
+
+void LoggingEventListener::OnEnvironmentsSetUpStart(
+ const testing::UnitTest& unit_test) {
+ ICING_LOG(INFO) << "[----------] Global test environment set-up.";
+}
+
+void LoggingEventListener::OnEnvironmentsSetUpEnd(
+ const testing::UnitTest& /* unit_test */) {}
+
+void LoggingEventListener::OnTestCaseStart(const testing::TestCase& test_case) {
+ std::string param_text;
+ if (test_case.type_param()) {
+ param_text = IcingStringUtil::StringPrintf(", where TypeParam = %s",
+ test_case.type_param());
+ }
+ ICING_LOG(INFO) << "[----------] " << test_case.test_to_run_count()
+ << " test(s) from " << test_case.name() << param_text;
+}
+
+void LoggingEventListener::OnTestStart(const testing::TestInfo& test_info) {
+ ICING_LOG(INFO) << "[ RUN ] " << test_info.test_case_name() << "."
+ << test_info.name();
+}
+
+void LoggingEventListener::OnTestPartResult(
+ const testing::TestPartResult& test_part_result) {
+ if (test_part_result.type() != testing::TestPartResult::kSuccess) {
+ ICING_LOG(ERROR) << test_part_result.file_name() << ":"
+ << test_part_result.line_number() << ": Failure "
+ << test_part_result.message();
+ }
+}
+
+void LoggingEventListener::OnTestEnd(const testing::TestInfo& test_info) {
+ if (test_info.result()->Passed()) {
+ ICING_LOG(INFO) << "[ OK ] " << test_info.test_case_name() << "."
+ << test_info.name();
+ } else {
+ ICING_LOG(ERROR) << "[ FAILED ] " << test_info.test_case_name() << "."
+ << test_info.name();
+ }
+}
+
+void LoggingEventListener::OnTestCaseEnd(const testing::TestCase& test_case) {
+ ICING_LOG(INFO) << "[----------] " << test_case.test_to_run_count()
+ << " test(s) from " << test_case.name() << " ("
+ << test_case.elapsed_time() << " ms total)";
+}
+
+void LoggingEventListener::OnEnvironmentsTearDownStart(
+ const testing::UnitTest& unit_test) {
+ ICING_LOG(INFO) << "[----------] Global test environment tear-down.";
+}
+
+void LoggingEventListener::OnEnvironmentsTearDownEnd(
+ const testing::UnitTest& /* unit_test */) {}
+
+void LoggingEventListener::OnTestIterationEnd(
+ const testing::UnitTest& unit_test, int iteration) {
+ ICING_LOG(INFO) << "[==========] " << unit_test.test_to_run_count()
+ << " test(s) from " << unit_test.test_case_to_run_count()
+ << " test case(s) ran. (" << unit_test.elapsed_time()
+ << " ms total)";
+ ICING_LOG(INFO) << "[ PASSED ] " << unit_test.successful_test_count()
+ << " test(s)";
+ if (!unit_test.Passed()) {
+ ICING_LOG(ERROR) << "[ FAILED ] " << unit_test.failed_test_count()
+ << " test(s), listed below:";
+ for (int i = 0; i < unit_test.total_test_case_count(); ++i) {
+ const testing::TestCase& test_case = *unit_test.GetTestCase(i);
+ if (!test_case.should_run() || (test_case.failed_test_count() == 0)) {
+ continue;
+ }
+ for (int j = 0; j < test_case.total_test_count(); ++j) {
+ const testing::TestInfo& test_info = *test_case.GetTestInfo(j);
+ if (!test_info.should_run() || test_info.result()->Passed()) {
+ continue;
+ }
+ ICING_LOG(ERROR) << "[ FAILED ] " << test_case.name() << "."
+ << test_info.name();
+ }
+ }
+ }
+}
+
+void LoggingEventListener::OnTestProgramEnd(
+ const testing::UnitTest& /* unit_test */) {}
+
+} // namespace lib
+} // namespace icing
diff --git a/icing/testing/logging-event-listener.h b/icing/testing/logging-event-listener.h
new file mode 100644
index 0000000..8024222
--- /dev/null
+++ b/icing/testing/logging-event-listener.h
@@ -0,0 +1,62 @@
+// Copyright (C) 2019 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef ICING_TESTING_LOGGING_EVENT_LISTENER_H_
+#define ICING_TESTING_LOGGING_EVENT_LISTENER_H_
+
+#include "gtest/gtest.h"
+
+namespace icing {
+namespace lib {
+
+// TestEventListener that writes test results to the log so that they will be
+// visible in the logcat output in Sponge.
+// The formatting of the output is patterned after the output produced by the
+// standard PrettyUnitTestResultPrinter.
+class LoggingEventListener : public ::testing::TestEventListener {
+ public:
+ void OnTestProgramStart(const testing::UnitTest& unit_test) override;
+
+ void OnTestIterationStart(const testing::UnitTest& unit_test,
+ int iteration) override;
+
+ void OnEnvironmentsSetUpStart(const testing::UnitTest& unit_test) override;
+
+ void OnEnvironmentsSetUpEnd(const testing::UnitTest& unit_test) override;
+
+ void OnTestCaseStart(const testing::TestCase& test_case) override;
+
+ void OnTestStart(const testing::TestInfo& test_info) override;
+
+ void OnTestPartResult(
+ const testing::TestPartResult& test_part_result) override;
+
+ void OnTestEnd(const testing::TestInfo& test_info) override;
+
+ void OnTestCaseEnd(const testing::TestCase& test_case) override;
+
+ void OnEnvironmentsTearDownStart(const testing::UnitTest& unit_test) override;
+
+ void OnEnvironmentsTearDownEnd(const testing::UnitTest& unit_test) override;
+
+ void OnTestIterationEnd(const testing::UnitTest& unit_test,
+ int iteration) override;
+
+ void OnTestProgramEnd(const testing::UnitTest& unit_test) override;
+};
+
+} // namespace lib
+} // namespace icing
+
+#endif // ICING_TESTING_LOGGING_EVENT_LISTENER_H_
diff --git a/icing/text_classifier/lib3/utils/java/jni-base.cc b/icing/text_classifier/lib3/utils/java/jni-base.cc
new file mode 100644
index 0000000..3b6d09e
--- /dev/null
+++ b/icing/text_classifier/lib3/utils/java/jni-base.cc
@@ -0,0 +1,44 @@
+// Copyright (C) 2019 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "icing/text_classifier/lib3/utils/java/jni-base.h"
+
+#include "icing/text_classifier/lib3/utils/base/status.h"
+#include "icing/text_classifier/lib3/utils/java/string_utils.h"
+
+namespace libtextclassifier3 {
+
+bool EnsureLocalCapacity(JNIEnv* env, int capacity) {
+ return env->EnsureLocalCapacity(capacity) == JNI_OK;
+}
+
+bool JniExceptionCheckAndClear(JNIEnv* env) {
+ TC3_CHECK(env != nullptr);
+ const bool result = env->ExceptionCheck();
+ if (result) {
+ env->ExceptionDescribe();
+ env->ExceptionClear();
+ }
+ return result;
+}
+
+StatusOr<std::string> ToStlString(JNIEnv* env, const jstring& str) {
+ std::string result;
+ if (!JStringToUtf8String(env, str, &result)) {
+ return {Status::UNKNOWN};
+ }
+ return result;
+}
+
+} // namespace libtextclassifier3
diff --git a/icing/text_classifier/lib3/utils/java/jni-base.h b/icing/text_classifier/lib3/utils/java/jni-base.h
new file mode 100644
index 0000000..7fd612a
--- /dev/null
+++ b/icing/text_classifier/lib3/utils/java/jni-base.h
@@ -0,0 +1,217 @@
+// Copyright (C) 2019 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef ICING_TEXT_CLASSIFIER_LIB3_UTILS_JAVA_JNI_BASE_H_
+#define ICING_TEXT_CLASSIFIER_LIB3_UTILS_JAVA_JNI_BASE_H_
+
+#include <jni.h>
+
+#include <string>
+
+#include "icing/text_classifier/lib3/utils/base/statusor.h"
+
+// When we use a macro as an argument for a macro, an additional level of
+// indirection is needed, if the macro argument is used with # or ##.
+#define TC3_ADD_QUOTES_HELPER(TOKEN) #TOKEN
+#define TC3_ADD_QUOTES(TOKEN) TC3_ADD_QUOTES_HELPER(TOKEN)
+
+#ifndef TC3_PACKAGE_NAME
+#define TC3_PACKAGE_NAME com_google_knowledge_cerebra_sense_textclassifier_lib3
+#endif
+
+#ifndef TC3_PACKAGE_PATH
+#define TC3_PACKAGE_PATH \
+ "com/google/knowledge/cerebra/sense/textclassifier/lib3/"
+#endif
+
+#define TC3_JNI_METHOD_NAME_INTERNAL(package_name, class_name, method_name) \
+ Java_##package_name##_##class_name##_##method_name
+
+#define TC3_JNI_METHOD_PRIMITIVE(return_type, package_name, class_name, \
+ method_name) \
+ JNIEXPORT return_type JNICALL TC3_JNI_METHOD_NAME_INTERNAL( \
+ package_name, class_name, method_name)
+
+// The indirection is needed to correctly expand the TC3_PACKAGE_NAME macro.
+// See the explanation near TC3_ADD_QUOTES macro.
+#define TC3_JNI_METHOD2(return_type, package_name, class_name, method_name) \
+ TC3_JNI_METHOD_PRIMITIVE(return_type, package_name, class_name, method_name)
+
+#define TC3_JNI_METHOD(return_type, class_name, method_name) \
+ TC3_JNI_METHOD2(return_type, TC3_PACKAGE_NAME, class_name, method_name)
+
+#define TC3_JNI_METHOD_NAME2(package_name, class_name, method_name) \
+ TC3_JNI_METHOD_NAME_INTERNAL(package_name, class_name, method_name)
+
+#define TC3_JNI_METHOD_NAME(class_name, method_name) \
+ TC3_JNI_METHOD_NAME2(TC3_PACKAGE_NAME, class_name, method_name)
+
+namespace libtextclassifier3 {
+
+// Returns true if the requested capacity is available.
+bool EnsureLocalCapacity(JNIEnv* env, int capacity);
+
+// Returns true if there was an exception. Also it clears the exception.
+bool JniExceptionCheckAndClear(JNIEnv* env);
+
+StatusOr<std::string> ToStlString(JNIEnv* env, const jstring& str);
+
+// A deleter to be used with std::unique_ptr to delete JNI global references.
+class GlobalRefDeleter {
+ public:
+ explicit GlobalRefDeleter(JavaVM* jvm) : jvm_(jvm) {}
+
+ GlobalRefDeleter(const GlobalRefDeleter& orig) = default;
+
+ // Copy assignment to allow move semantics in ScopedGlobalRef.
+ GlobalRefDeleter& operator=(const GlobalRefDeleter& rhs) {
+ TC3_CHECK_EQ(jvm_, rhs.jvm_);
+ return *this;
+ }
+
+ // The delete operator.
+ void operator()(jobject object) const {
+ JNIEnv* env;
+ if (object != nullptr && jvm_ != nullptr &&
+ JNI_OK ==
+ jvm_->GetEnv(reinterpret_cast<void**>(&env), JNI_VERSION_1_4)) {
+ env->DeleteGlobalRef(object);
+ }
+ }
+
+ private:
+ // The jvm_ stashed to use for deletion.
+ JavaVM* const jvm_;
+};
+
+// A deleter to be used with std::unique_ptr to delete JNI local references.
+class LocalRefDeleter {
+ public:
+ explicit LocalRefDeleter(JNIEnv* env)
+ : env_(env) {} // NOLINT(runtime/explicit)
+
+ LocalRefDeleter(const LocalRefDeleter& orig) = default;
+
+ // Copy assignment to allow move semantics in ScopedLocalRef.
+ LocalRefDeleter& operator=(const LocalRefDeleter& rhs) {
+ env_ = rhs.env_;
+ return *this;
+ }
+
+ // The delete operator.
+ void operator()(jobject object) const {
+ if (env_) {
+ env_->DeleteLocalRef(object);
+ }
+ }
+
+ private:
+ // The env_ stashed to use for deletion. Thread-local, don't share!
+ JNIEnv* env_;
+};
+
+// A smart pointer that deletes a reference when it goes out of scope.
+//
+// Note that this class is not thread-safe since it caches JNIEnv in
+// the deleter. Do not use the same jobject across different threads.
+template <typename T, typename Env, typename Deleter>
+class ScopedRef {
+ public:
+ ScopedRef() : ptr_(nullptr, Deleter(nullptr)) {}
+ ScopedRef(T value, Env* env) : ptr_(value, Deleter(env)) {}
+
+ T get() const { return ptr_.get(); }
+
+ T release() { return ptr_.release(); }
+
+ bool operator!() const { return !ptr_; }
+
+ bool operator==(void* value) const { return ptr_.get() == value; }
+
+ explicit operator bool() const { return ptr_ != nullptr; }
+
+ void reset(T value, Env* env) {
+ ptr_.reset(value);
+ ptr_.get_deleter() = Deleter(env);
+ }
+
+ private:
+ std::unique_ptr<typename std::remove_pointer<T>::type, Deleter> ptr_;
+};
+
+template <typename T, typename U, typename Env, typename Deleter>
+inline bool operator==(const ScopedRef<T, Env, Deleter>& x,
+ const ScopedRef<U, Env, Deleter>& y) {
+ return x.get() == y.get();
+}
+
+template <typename T, typename Env, typename Deleter>
+inline bool operator==(const ScopedRef<T, Env, Deleter>& x, std::nullptr_t) {
+ return x.get() == nullptr;
+}
+
+template <typename T, typename Env, typename Deleter>
+inline bool operator==(std::nullptr_t, const ScopedRef<T, Env, Deleter>& x) {
+ return nullptr == x.get();
+}
+
+template <typename T, typename U, typename Env, typename Deleter>
+inline bool operator!=(const ScopedRef<T, Env, Deleter>& x,
+ const ScopedRef<U, Env, Deleter>& y) {
+ return x.get() != y.get();
+}
+
+template <typename T, typename Env, typename Deleter>
+inline bool operator!=(const ScopedRef<T, Env, Deleter>& x, std::nullptr_t) {
+ return x.get() != nullptr;
+}
+
+template <typename T, typename Env, typename Deleter>
+inline bool operator!=(std::nullptr_t, const ScopedRef<T, Env, Deleter>& x) {
+ return nullptr != x.get();
+}
+
+template <typename T, typename U, typename Env, typename Deleter>
+inline bool operator<(const ScopedRef<T, Env, Deleter>& x,
+ const ScopedRef<U, Env, Deleter>& y) {
+ return x.get() < y.get();
+}
+
+template <typename T, typename U, typename Env, typename Deleter>
+inline bool operator>(const ScopedRef<T, Env, Deleter>& x,
+ const ScopedRef<U, Env, Deleter>& y) {
+ return x.get() > y.get();
+}
+
+// A smart pointer that deletes a JNI global reference when it goes out
+// of scope. Usage is:
+// ScopedGlobalRef<jobject> scoped_global(env->JniFunction(), jvm);
+template <typename T>
+using ScopedGlobalRef = ScopedRef<T, JavaVM, GlobalRefDeleter>;
+
+// Ditto, but usage is:
+// ScopedLocalRef<jobject> scoped_local(env->JniFunction(), env);
+template <typename T>
+using ScopedLocalRef = ScopedRef<T, JNIEnv, LocalRefDeleter>;
+
+// A helper to create global references.
+template <typename T>
+ScopedGlobalRef<T> MakeGlobalRef(T object, JNIEnv* env, JavaVM* jvm) {
+ const jobject global_object = env->NewGlobalRef(object);
+ return ScopedGlobalRef<T>(reinterpret_cast<T>(global_object), jvm);
+}
+
+} // namespace libtextclassifier3
+
+#endif // ICING_TEXT_CLASSIFIER_LIB3_UTILS_JAVA_JNI_BASE_H_
diff --git a/icing/text_classifier/lib3/utils/java/jni-helper.cc b/icing/text_classifier/lib3/utils/java/jni-helper.cc
new file mode 100644
index 0000000..0d9b0a0
--- /dev/null
+++ b/icing/text_classifier/lib3/utils/java/jni-helper.cc
@@ -0,0 +1,175 @@
+// Copyright (C) 2019 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "icing/text_classifier/lib3/utils/java/jni-helper.h"
+
+namespace libtextclassifier3 {
+
+StatusOr<ScopedLocalRef<jclass>> JniHelper::FindClass(JNIEnv* env,
+ const char* class_name) {
+ TC3_ENSURE_LOCAL_CAPACITY_OR_RETURN;
+ ScopedLocalRef<jclass> result(env->FindClass(class_name), env);
+ TC3_NO_EXCEPTION_OR_RETURN;
+ TC3_NOT_NULL_OR_RETURN;
+ return result;
+}
+
+StatusOr<jmethodID> JniHelper::GetMethodID(JNIEnv* env, jclass clazz,
+ const char* method_name,
+ const char* return_type) {
+ jmethodID result = env->GetMethodID(clazz, method_name, return_type);
+ TC3_NO_EXCEPTION_OR_RETURN;
+ TC3_NOT_NULL_OR_RETURN;
+ return result;
+}
+
+StatusOr<ScopedLocalRef<jobject>> JniHelper::GetStaticObjectField(
+ JNIEnv* env, jclass class_name, jfieldID field_id) {
+ TC3_ENSURE_LOCAL_CAPACITY_OR_RETURN;
+ ScopedLocalRef<jobject> result(
+ env->GetStaticObjectField(class_name, field_id), env);
+ TC3_NO_EXCEPTION_OR_RETURN;
+ TC3_NOT_NULL_OR_RETURN;
+ return result;
+}
+
+StatusOr<ScopedLocalRef<jbyteArray>> JniHelper::NewByteArray(JNIEnv* env,
+ jsize length) {
+ TC3_ENSURE_LOCAL_CAPACITY_OR_RETURN;
+ ScopedLocalRef<jbyteArray> result(env->NewByteArray(length), env);
+ TC3_NO_EXCEPTION_OR_RETURN;
+ TC3_NOT_NULL_OR_RETURN;
+ return result;
+}
+
+Status JniHelper::CallVoidMethod(JNIEnv* env, jobject object,
+ jmethodID method_id, ...) {
+ va_list args;
+ va_start(args, method_id);
+ env->CallVoidMethodV(object, method_id, args);
+ va_end(args);
+
+ TC3_NO_EXCEPTION_OR_RETURN;
+ return Status::OK;
+}
+
+StatusOr<bool> JniHelper::CallBooleanMethod(JNIEnv* env, jobject object,
+ jmethodID method_id, ...) {
+ va_list args;
+ va_start(args, method_id);
+ bool result = env->CallBooleanMethodV(object, method_id, args);
+ va_end(args);
+
+ TC3_NO_EXCEPTION_OR_RETURN;
+ return result;
+}
+
+StatusOr<int32> JniHelper::CallIntMethod(JNIEnv* env, jobject object,
+ jmethodID method_id, ...) {
+ va_list args;
+ va_start(args, method_id);
+ jint result = env->CallIntMethodV(object, method_id, args);
+ va_end(args);
+
+ TC3_NO_EXCEPTION_OR_RETURN;
+ return result;
+}
+
+StatusOr<int64> JniHelper::CallLongMethod(JNIEnv* env, jobject object,
+ jmethodID method_id, ...) {
+ va_list args;
+ va_start(args, method_id);
+ jlong result = env->CallLongMethodV(object, method_id, args);
+ va_end(args);
+
+ TC3_NO_EXCEPTION_OR_RETURN;
+ return result;
+}
+
+StatusOr<float> JniHelper::CallFloatMethod(JNIEnv* env, jobject object,
+ jmethodID method_id, ...) {
+ va_list args;
+ va_start(args, method_id);
+ jfloat result = env->CallFloatMethodV(object, method_id, args);
+ va_end(args);
+
+ TC3_NO_EXCEPTION_OR_RETURN;
+ return result;
+}
+
+StatusOr<double> JniHelper::CallDoubleMethod(JNIEnv* env, jobject object,
+ jmethodID method_id, ...) {
+ va_list args;
+ va_start(args, method_id);
+ jdouble result = env->CallDoubleMethodV(object, method_id, args);
+ va_end(args);
+
+ TC3_NO_EXCEPTION_OR_RETURN;
+ return result;
+}
+
+StatusOr<ScopedLocalRef<jintArray>> JniHelper::NewIntArray(JNIEnv* env,
+ jsize length) {
+ TC3_ENSURE_LOCAL_CAPACITY_OR_RETURN;
+ ScopedLocalRef<jintArray> result(env->NewIntArray(length), env);
+ TC3_NO_EXCEPTION_OR_RETURN;
+ TC3_NOT_NULL_OR_RETURN;
+ return result;
+}
+
+StatusOr<ScopedLocalRef<jfloatArray>> JniHelper::NewFloatArray(JNIEnv* env,
+ jsize length) {
+ TC3_ENSURE_LOCAL_CAPACITY_OR_RETURN;
+ ScopedLocalRef<jfloatArray> result(env->NewFloatArray(length), env);
+ TC3_NO_EXCEPTION_OR_RETURN;
+ TC3_NOT_NULL_OR_RETURN;
+ return result;
+}
+
+Status JniHelper::SetObjectArrayElement(JNIEnv* env, jobjectArray array,
+ jsize index, jobject val) {
+ TC3_ENSURE_LOCAL_CAPACITY_OR_RETURN;
+ env->SetObjectArrayElement(array, index, val);
+ TC3_NO_EXCEPTION_OR_RETURN;
+ return Status::OK;
+}
+
+StatusOr<ScopedLocalRef<jobjectArray>> JniHelper::NewObjectArray(
+ JNIEnv* env, jsize length, jclass element_class, jobject initial_element) {
+ TC3_ENSURE_LOCAL_CAPACITY_OR_RETURN;
+ ScopedLocalRef<jobjectArray> result(
+ env->NewObjectArray(length, element_class, initial_element), env);
+ TC3_NO_EXCEPTION_OR_RETURN;
+ TC3_NOT_NULL_OR_RETURN;
+ return result;
+}
+
+StatusOr<jsize> JniHelper::GetArrayLength(JNIEnv* env,
+ jarray jinput_fragments) {
+ TC3_ENSURE_LOCAL_CAPACITY_OR_RETURN;
+ jsize result = env->GetArrayLength(jinput_fragments);
+ TC3_NO_EXCEPTION_OR_RETURN;
+ return result;
+}
+
+StatusOr<ScopedLocalRef<jstring>> JniHelper::NewStringUTF(JNIEnv* env,
+ const char* bytes) {
+ TC3_ENSURE_LOCAL_CAPACITY_OR_RETURN;
+ ScopedLocalRef<jstring> result(env->NewStringUTF(bytes), env);
+ TC3_NO_EXCEPTION_OR_RETURN;
+ TC3_NOT_NULL_OR_RETURN;
+ return result;
+}
+
+} // namespace libtextclassifier3
diff --git a/icing/text_classifier/lib3/utils/java/jni-helper.h b/icing/text_classifier/lib3/utils/java/jni-helper.h
new file mode 100644
index 0000000..ea4ba3b
--- /dev/null
+++ b/icing/text_classifier/lib3/utils/java/jni-helper.h
@@ -0,0 +1,156 @@
+// Copyright (C) 2019 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Utility class that provides calls similar to JNIEnv's, but performs
+// additional checks on them, so that it's harder to use them incorrectly.
+
+#ifndef ICING_TEXT_CLASSIFIER_LIB3_UTILS_JAVA_JNI_HELPER_H_
+#define ICING_TEXT_CLASSIFIER_LIB3_UTILS_JAVA_JNI_HELPER_H_
+
+#include <jni.h>
+
+#include <string>
+
+#include "icing/text_classifier/lib3/utils/base/status.h"
+#include "icing/text_classifier/lib3/utils/base/statusor.h"
+#include "icing/text_classifier/lib3/utils/java/jni-base.h"
+
+#define TC3_ENSURE_LOCAL_CAPACITY_OR_RETURN \
+ if (!EnsureLocalCapacity(env, 1)) { \
+ TC3_LOG(ERROR) << "EnsureLocalCapacity(1) failed."; \
+ return {Status::UNKNOWN}; \
+ }
+
+#define TC3_NO_EXCEPTION_OR_RETURN \
+ if (JniExceptionCheckAndClear(env)) { \
+ return {Status::UNKNOWN}; \
+ }
+
+#define TC3_NOT_NULL_OR_RETURN \
+ if (result == nullptr) { \
+ return {Status::UNKNOWN}; \
+ }
+
+#define TC3_DEFINE_VARIADIC_SCOPED_LOCAL_REF_ENV_METHOD( \
+ METHOD_NAME, RETURN_TYPE, INPUT_TYPE, POST_CHECK) \
+ template <typename T = RETURN_TYPE> \
+ static StatusOr<ScopedLocalRef<T>> METHOD_NAME( \
+ JNIEnv* env, INPUT_TYPE object, jmethodID method_id, ...) { \
+ TC3_ENSURE_LOCAL_CAPACITY_OR_RETURN; \
+ \
+ va_list args; \
+ va_start(args, method_id); \
+ ScopedLocalRef<T> result( \
+ reinterpret_cast<T>(env->METHOD_NAME##V(object, method_id, args)), \
+ env); \
+ POST_CHECK \
+ va_end(args); \
+ \
+ TC3_NO_EXCEPTION_OR_RETURN; \
+ return result; \
+ }
+
+#define TC3_JNI_NO_CHECK \
+ {}
+
+namespace libtextclassifier3 {
+
+class JniHelper {
+ public:
+ // Misc methods.
+ static StatusOr<ScopedLocalRef<jclass>> FindClass(JNIEnv* env,
+ const char* class_name);
+
+ template <typename T = jobject>
+ static StatusOr<ScopedLocalRef<T>> GetObjectArrayElement(JNIEnv* env,
+ jobjectArray array,
+ jsize index);
+ static StatusOr<jmethodID> GetMethodID(JNIEnv* env, jclass clazz,
+ const char* method_name,
+ const char* return_type);
+
+ static StatusOr<ScopedLocalRef<jobject>> GetStaticObjectField(
+ JNIEnv* env, jclass class_name, jfieldID field_id);
+
+ // New* methods.
+ TC3_DEFINE_VARIADIC_SCOPED_LOCAL_REF_ENV_METHOD(NewObject, jobject, jclass,
+ TC3_NOT_NULL_OR_RETURN);
+ static StatusOr<ScopedLocalRef<jobjectArray>> NewObjectArray(
+ JNIEnv* env, jsize length, jclass element_class,
+ jobject initial_element = nullptr);
+ static StatusOr<ScopedLocalRef<jbyteArray>> NewByteArray(JNIEnv* env,
+ jsize length);
+ static StatusOr<ScopedLocalRef<jintArray>> NewIntArray(JNIEnv* env,
+ jsize length);
+ static StatusOr<ScopedLocalRef<jstring>> NewStringUTF(JNIEnv* env,
+ const char* bytes);
+ static StatusOr<ScopedLocalRef<jfloatArray>> NewFloatArray(JNIEnv* env,
+ jsize length);
+
+ static StatusOr<jsize> GetArrayLength(JNIEnv* env, jarray jinput_fragments);
+
+ static Status SetObjectArrayElement(JNIEnv* env, jobjectArray array,
+ jsize index, jobject val);
+
+ // Call* methods.
+ TC3_DEFINE_VARIADIC_SCOPED_LOCAL_REF_ENV_METHOD(CallObjectMethod, jobject,
+ jobject, TC3_JNI_NO_CHECK);
+ TC3_DEFINE_VARIADIC_SCOPED_LOCAL_REF_ENV_METHOD(CallStaticObjectMethod,
+ jobject, jclass,
+ TC3_JNI_NO_CHECK);
+ static Status CallVoidMethod(JNIEnv* env, jobject object, jmethodID method_id,
+ ...);
+ static StatusOr<bool> CallBooleanMethod(JNIEnv* env, jobject object,
+ jmethodID method_id, ...);
+ static StatusOr<int32> CallIntMethod(JNIEnv* env, jobject object,
+ jmethodID method_id, ...);
+ static StatusOr<int64> CallLongMethod(JNIEnv* env, jobject object,
+ jmethodID method_id, ...);
+ static StatusOr<float> CallFloatMethod(JNIEnv* env, jobject object,
+ jmethodID method_id, ...);
+ static StatusOr<double> CallDoubleMethod(JNIEnv* env, jobject object,
+ jmethodID method_id, ...);
+
+ template <class T>
+ static StatusOr<T> CallStaticIntMethod(JNIEnv* env, jclass clazz,
+ jmethodID method_id, ...);
+};
+
+template <typename T>
+StatusOr<ScopedLocalRef<T>> JniHelper::GetObjectArrayElement(JNIEnv* env,
+ jobjectArray array,
+ jsize index) {
+ TC3_ENSURE_LOCAL_CAPACITY_OR_RETURN;
+ ScopedLocalRef<T> result(
+ reinterpret_cast<T>(env->GetObjectArrayElement(array, index)), env);
+
+ TC3_NO_EXCEPTION_OR_RETURN;
+ return result;
+}
+
+template <class T>
+StatusOr<T> JniHelper::CallStaticIntMethod(JNIEnv* env, jclass clazz,
+ jmethodID method_id, ...) {
+ va_list args;
+ va_start(args, method_id);
+ jint result = env->CallStaticIntMethodV(clazz, method_id, args);
+ va_end(args);
+
+ TC3_NO_EXCEPTION_OR_RETURN;
+ return result;
+}
+
+} // namespace libtextclassifier3
+
+#endif // ICING_TEXT_CLASSIFIER_LIB3_UTILS_JAVA_JNI_HELPER_H_
diff --git a/icing/text_classifier/lib3/utils/java/string_utils.cc b/icing/text_classifier/lib3/utils/java/string_utils.cc
new file mode 100644
index 0000000..2384ba4
--- /dev/null
+++ b/icing/text_classifier/lib3/utils/java/string_utils.cc
@@ -0,0 +1,73 @@
+// Copyright (C) 2019 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "icing/text_classifier/lib3/utils/java/string_utils.h"
+
+#include "icing/text_classifier/lib3/utils/base/logging.h"
+
+namespace libtextclassifier3 {
+
+bool JByteArrayToString(JNIEnv* env, const jbyteArray& array,
+ std::string* result) {
+ jbyte* const array_bytes = env->GetByteArrayElements(array, JNI_FALSE);
+ if (array_bytes == nullptr) {
+ return false;
+ }
+
+ const int array_length = env->GetArrayLength(array);
+ *result = std::string(reinterpret_cast<char*>(array_bytes), array_length);
+
+ env->ReleaseByteArrayElements(array, array_bytes, JNI_ABORT);
+
+ return true;
+}
+
+bool JStringToUtf8String(JNIEnv* env, const jstring& jstr,
+ std::string* result) {
+ if (jstr == nullptr) {
+ *result = std::string();
+ return true;
+ }
+
+ jclass string_class = env->FindClass("java/lang/String");
+ if (!string_class) {
+ TC3_LOG(ERROR) << "Can't find String class";
+ return false;
+ }
+
+ jmethodID get_bytes_id =
+ env->GetMethodID(string_class, "getBytes", "(Ljava/lang/String;)[B");
+
+ jstring encoding = env->NewStringUTF("UTF-8");
+
+ jbyteArray array = reinterpret_cast<jbyteArray>(
+ env->CallObjectMethod(jstr, get_bytes_id, encoding));
+
+ JByteArrayToString(env, array, result);
+
+ // Release the array.
+ env->DeleteLocalRef(array);
+ env->DeleteLocalRef(string_class);
+ env->DeleteLocalRef(encoding);
+
+ return true;
+}
+
+ScopedStringChars GetScopedStringChars(JNIEnv* env, jstring string,
+ jboolean* is_copy) {
+ return ScopedStringChars(env->GetStringUTFChars(string, is_copy),
+ StringCharsReleaser(env, string));
+}
+
+} // namespace libtextclassifier3
diff --git a/icing/text_classifier/lib3/utils/java/string_utils.h b/icing/text_classifier/lib3/utils/java/string_utils.h
new file mode 100644
index 0000000..dddef57
--- /dev/null
+++ b/icing/text_classifier/lib3/utils/java/string_utils.h
@@ -0,0 +1,74 @@
+// Copyright (C) 2019 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef ICING_TEXT_CLASSIFIER_LIB3_UTILS_JAVA_STRING_UTILS_H_
+#define ICING_TEXT_CLASSIFIER_LIB3_UTILS_JAVA_STRING_UTILS_H_
+
+#include <jni.h>
+#include <memory>
+#include <string>
+
+#include "icing/text_classifier/lib3/utils/base/logging.h"
+
+namespace libtextclassifier3 {
+
+bool JByteArrayToString(JNIEnv* env, const jbyteArray& array,
+ std::string* result);
+bool JStringToUtf8String(JNIEnv* env, const jstring& jstr, std::string* result);
+
+// A deleter to be used with std::unique_ptr to release Java string chars.
+class StringCharsReleaser {
+ public:
+ StringCharsReleaser() : env_(nullptr) {}
+
+ StringCharsReleaser(JNIEnv* env, jstring jstr) : env_(env), jstr_(jstr) {}
+
+ StringCharsReleaser(const StringCharsReleaser& orig) = default;
+
+ // Copy assignment to allow move semantics in StringCharsReleaser.
+ StringCharsReleaser& operator=(const StringCharsReleaser& rhs) {
+ // As the releaser and its state are thread-local, it's enough to only
+ // ensure the envs are consistent but do nothing.
+ TC3_CHECK_EQ(env_, rhs.env_);
+ return *this;
+ }
+
+ // The delete operator.
+ void operator()(const char* chars) const {
+ if (env_ != nullptr) {
+ env_->ReleaseStringUTFChars(jstr_, chars);
+ }
+ }
+
+ private:
+ // The env_ stashed to use for deletion. Thread-local, don't share!
+ JNIEnv* const env_;
+
+ // The referenced jstring.
+ jstring jstr_;
+};
+
+// A smart pointer that releases string chars when it goes out
+// of scope.
+// Note that this class is not thread-safe since it caches JNIEnv in
+// the deleter. Do not use the same jobject across different threads.
+using ScopedStringChars = std::unique_ptr<const char, StringCharsReleaser>;
+
+// Returns a scoped pointer to the array of Unicode characters of a string.
+ScopedStringChars GetScopedStringChars(JNIEnv* env, jstring string,
+ jboolean* is_copy = nullptr);
+
+} // namespace libtextclassifier3
+
+#endif // ICING_TEXT_CLASSIFIER_LIB3_UTILS_JAVA_STRING_UTILS_H_
diff --git a/icing/tokenization/icu-language-segmenter_test.cc b/icing/tokenization/icu-language-segmenter_test.cc
deleted file mode 100644
index fd4755a..0000000
--- a/icing/tokenization/icu-language-segmenter_test.cc
+++ /dev/null
@@ -1,374 +0,0 @@
-// Copyright (C) 2019 Google LLC
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "gmock/gmock.h"
-#include "gtest/gtest.h"
-#include "icing/absl_ports/str_cat.h"
-#include "icing/icu-data-file-helper.h"
-#include "icing/testing/common-matchers.h"
-#include "icing/testing/icu-i18n-test-utils.h"
-#include "icing/testing/test-data.h"
-#include "icing/tokenization/language-segmenter-factory.h"
-#include "icing/tokenization/language-segmenter.h"
-#include "unicode/uloc.h"
-
-namespace icing {
-namespace lib {
-namespace {
-using ::testing::ElementsAre;
-using ::testing::Eq;
-using ::testing::IsEmpty;
-
-class IcuLanguageSegmenterAllLocalesTest
- : public testing::TestWithParam<const char*> {
- protected:
- void SetUp() override {
- ICING_ASSERT_OK(
- // File generated via icu_data_file rule in //icing/BUILD.
- icu_data_file_helper::SetUpICUDataFile(
- GetTestFilePath("icing/icu.dat")));
- }
-
- static std::string GetLocale() { return GetParam(); }
-};
-
-TEST_P(IcuLanguageSegmenterAllLocalesTest, EmptyText) {
- ICING_ASSERT_OK_AND_ASSIGN(
- auto language_segmenter,
- language_segmenter_factory::Create(language_segmenter_factory::ICU4C,
- GetLocale()));
- EXPECT_THAT(language_segmenter->GetAllTerms(""), IsOkAndHolds(IsEmpty()));
-}
-
-TEST_P(IcuLanguageSegmenterAllLocalesTest, SimpleText) {
- ICING_ASSERT_OK_AND_ASSIGN(
- auto language_segmenter,
- language_segmenter_factory::Create(language_segmenter_factory::ICU4C,
- GetLocale()));
- EXPECT_THAT(language_segmenter->GetAllTerms("Hello World"),
- IsOkAndHolds(ElementsAre("Hello", " ", "World")));
-}
-
-TEST_P(IcuLanguageSegmenterAllLocalesTest, ASCII_Punctuation) {
- ICING_ASSERT_OK_AND_ASSIGN(
- auto language_segmenter,
- language_segmenter_factory::Create(language_segmenter_factory::ICU4C,
- GetLocale()));
- // ASCII punctuation marks are kept
- EXPECT_THAT(
- language_segmenter->GetAllTerms("Hello, World!!!"),
- IsOkAndHolds(ElementsAre("Hello", ",", " ", "World", "!", "!", "!")));
- EXPECT_THAT(language_segmenter->GetAllTerms("Open-source project"),
- IsOkAndHolds(ElementsAre("Open", "-", "source", " ", "project")));
- EXPECT_THAT(language_segmenter->GetAllTerms("100%"),
- IsOkAndHolds(ElementsAre("100", "%")));
- EXPECT_THAT(language_segmenter->GetAllTerms("A&B"),
- IsOkAndHolds(ElementsAre("A", "&", "B")));
-}
-
-TEST_P(IcuLanguageSegmenterAllLocalesTest, ASCII_SpecialCharacter) {
- ICING_ASSERT_OK_AND_ASSIGN(
- auto language_segmenter,
- language_segmenter_factory::Create(language_segmenter_factory::ICU4C,
- GetLocale()));
- // ASCII special characters are kept
- EXPECT_THAT(language_segmenter->GetAllTerms("Pay $1000"),
- IsOkAndHolds(ElementsAre("Pay", " ", "$", "1000")));
- EXPECT_THAT(language_segmenter->GetAllTerms("A+B"),
- IsOkAndHolds(ElementsAre("A", "+", "B")));
- // 0x0009 is the unicode for tab (within ASCII range).
- std::string text_with_tab = absl_ports::StrCat(
- "Hello", UCharToString(0x0009), UCharToString(0x0009), "World");
- EXPECT_THAT(language_segmenter->GetAllTerms(text_with_tab),
- IsOkAndHolds(ElementsAre("Hello", UCharToString(0x0009),
- UCharToString(0x0009), "World")));
-}
-
-TEST_P(IcuLanguageSegmenterAllLocalesTest, Non_ASCII_Non_Alphabetic) {
- ICING_ASSERT_OK_AND_ASSIGN(
- auto language_segmenter,
- language_segmenter_factory::Create(language_segmenter_factory::ICU4C,
- GetLocale()));
- // Full-width (non-ASCII) punctuation marks and special characters are left
- // out.
- EXPECT_THAT(language_segmenter->GetAllTerms("。?·Hello!×"),
- IsOkAndHolds(ElementsAre("Hello")));
-}
-
-TEST_P(IcuLanguageSegmenterAllLocalesTest, Acronym) {
- ICING_ASSERT_OK_AND_ASSIGN(
- auto language_segmenter,
- language_segmenter_factory::Create(language_segmenter_factory::ICU4C,
- GetLocale()));
- EXPECT_THAT(language_segmenter->GetAllTerms("U.S. Bank"),
- IsOkAndHolds(ElementsAre("U.S", ".", " ", "Bank")));
- EXPECT_THAT(language_segmenter->GetAllTerms("I.B.M."),
- IsOkAndHolds(ElementsAre("I.B.M", ".")));
- EXPECT_THAT(language_segmenter->GetAllTerms("I,B,M"),
- IsOkAndHolds(ElementsAre("I", ",", "B", ",", "M")));
- EXPECT_THAT(language_segmenter->GetAllTerms("I B M"),
- IsOkAndHolds(ElementsAre("I", " ", "B", " ", "M")));
-}
-
-TEST_P(IcuLanguageSegmenterAllLocalesTest, WordConnector) {
- ICING_ASSERT_OK_AND_ASSIGN(
- auto language_segmenter,
- language_segmenter_factory::Create(language_segmenter_factory::ICU4C,
- GetLocale()));
- // According to unicode word break rules
- // WB6(https://unicode.org/reports/tr29/#WB6),
- // WB7(https://unicode.org/reports/tr29/#WB7), and a few others, some
- // punctuation characters are used as word connecters. That is, words don't
- // break before and after them. Here we just test some that we care about.
-
- // Word connecters
- EXPECT_THAT(language_segmenter->GetAllTerms("com.google.android"),
- IsOkAndHolds(ElementsAre("com.google.android")));
- EXPECT_THAT(language_segmenter->GetAllTerms("com:google:android"),
- IsOkAndHolds(ElementsAre("com:google:android")));
- EXPECT_THAT(language_segmenter->GetAllTerms("com'google'android"),
- IsOkAndHolds(ElementsAre("com'google'android")));
- EXPECT_THAT(language_segmenter->GetAllTerms("com_google_android"),
- IsOkAndHolds(ElementsAre("com_google_android")));
-
- // Word connecters can be mixed
- EXPECT_THAT(language_segmenter->GetAllTerms("com.google.android:icing"),
- IsOkAndHolds(ElementsAre("com.google.android:icing")));
-
- // Any heading and trailing characters are not connecters
- EXPECT_THAT(language_segmenter->GetAllTerms(".com.google.android."),
- IsOkAndHolds(ElementsAre(".", "com.google.android", ".")));
-
- // Not word connecters
- EXPECT_THAT(language_segmenter->GetAllTerms("com,google,android"),
- IsOkAndHolds(ElementsAre("com", ",", "google", ",", "android")));
- EXPECT_THAT(language_segmenter->GetAllTerms("com-google-android"),
- IsOkAndHolds(ElementsAre("com", "-", "google", "-", "android")));
- EXPECT_THAT(language_segmenter->GetAllTerms("com+google+android"),
- IsOkAndHolds(ElementsAre("com", "+", "google", "+", "android")));
- EXPECT_THAT(language_segmenter->GetAllTerms("com*google*android"),
- IsOkAndHolds(ElementsAre("com", "*", "google", "*", "android")));
- EXPECT_THAT(language_segmenter->GetAllTerms("com@google@android"),
- IsOkAndHolds(ElementsAre("com", "@", "google", "@", "android")));
- EXPECT_THAT(language_segmenter->GetAllTerms("com^google^android"),
- IsOkAndHolds(ElementsAre("com", "^", "google", "^", "android")));
- EXPECT_THAT(language_segmenter->GetAllTerms("com&google&android"),
- IsOkAndHolds(ElementsAre("com", "&", "google", "&", "android")));
- EXPECT_THAT(language_segmenter->GetAllTerms("com|google|android"),
- IsOkAndHolds(ElementsAre("com", "|", "google", "|", "android")));
- EXPECT_THAT(language_segmenter->GetAllTerms("com/google/android"),
- IsOkAndHolds(ElementsAre("com", "/", "google", "/", "android")));
- EXPECT_THAT(language_segmenter->GetAllTerms("com;google;android"),
- IsOkAndHolds(ElementsAre("com", ";", "google", ";", "android")));
- EXPECT_THAT(
- language_segmenter->GetAllTerms("com\"google\"android"),
- IsOkAndHolds(ElementsAre("com", "\"", "google", "\"", "android")));
-}
-
-TEST_P(IcuLanguageSegmenterAllLocalesTest, Apostrophes) {
- ICING_ASSERT_OK_AND_ASSIGN(
- auto language_segmenter,
- language_segmenter_factory::Create(language_segmenter_factory::ICU4C,
- GetLocale()));
- EXPECT_THAT(language_segmenter->GetAllTerms("It's ok."),
- IsOkAndHolds(ElementsAre("It's", " ", "ok", ".")));
- EXPECT_THAT(language_segmenter->GetAllTerms("He'll be back."),
- IsOkAndHolds(ElementsAre("He'll", " ", "be", " ", "back", ".")));
- EXPECT_THAT(language_segmenter->GetAllTerms("'Hello 'World."),
- IsOkAndHolds(ElementsAre("'", "Hello", " ", "'", "World", ".")));
- EXPECT_THAT(language_segmenter->GetAllTerms("The dogs' bone"),
- IsOkAndHolds(ElementsAre("The", " ", "dogs", "'", " ", "bone")));
- // 0x2019 is the single right quote, should be treated the same as "'"
- std::string token_with_quote =
- absl_ports::StrCat("He", UCharToString(0x2019), "ll");
- std::string text_with_quote =
- absl_ports::StrCat(token_with_quote, " be back.");
- EXPECT_THAT(
- language_segmenter->GetAllTerms(text_with_quote),
- IsOkAndHolds(ElementsAre(token_with_quote, " ", "be", " ", "back", ".")));
-}
-
-TEST_P(IcuLanguageSegmenterAllLocalesTest, Parentheses) {
- ICING_ASSERT_OK_AND_ASSIGN(
- auto language_segmenter,
- language_segmenter_factory::Create(language_segmenter_factory::ICU4C,
- GetLocale()));
-
- EXPECT_THAT(language_segmenter->GetAllTerms("(Hello)"),
- IsOkAndHolds(ElementsAre("(", "Hello", ")")));
-
- EXPECT_THAT(language_segmenter->GetAllTerms(")Hello("),
- IsOkAndHolds(ElementsAre(")", "Hello", "(")));
-}
-
-TEST_P(IcuLanguageSegmenterAllLocalesTest, Quotes) {
- ICING_ASSERT_OK_AND_ASSIGN(
- auto language_segmenter,
- language_segmenter_factory::Create(language_segmenter_factory::ICU4C,
- GetLocale()));
-
- EXPECT_THAT(language_segmenter->GetAllTerms("\"Hello\""),
- IsOkAndHolds(ElementsAre("\"", "Hello", "\"")));
-
- EXPECT_THAT(language_segmenter->GetAllTerms("'Hello'"),
- IsOkAndHolds(ElementsAre("'", "Hello", "'")));
-}
-
-TEST_P(IcuLanguageSegmenterAllLocalesTest, Alphanumeric) {
- ICING_ASSERT_OK_AND_ASSIGN(
- auto language_segmenter,
- language_segmenter_factory::Create(language_segmenter_factory::ICU4C,
- GetLocale()));
-
- // Alphanumeric terms are allowed
- EXPECT_THAT(language_segmenter->GetAllTerms("Se7en A4 3a"),
- IsOkAndHolds(ElementsAre("Se7en", " ", "A4", " ", "3a")));
-}
-
-TEST_P(IcuLanguageSegmenterAllLocalesTest, Number) {
- ICING_ASSERT_OK_AND_ASSIGN(
- auto language_segmenter,
- language_segmenter_factory::Create(language_segmenter_factory::ICU4C,
- GetLocale()));
-
- // Alphanumeric terms are allowed
- EXPECT_THAT(
- language_segmenter->GetAllTerms("3.141592653589793238462643383279"),
- IsOkAndHolds(ElementsAre("3.141592653589793238462643383279")));
-
- EXPECT_THAT(language_segmenter->GetAllTerms("3,456.789"),
- IsOkAndHolds(ElementsAre("3,456.789")));
-
- EXPECT_THAT(language_segmenter->GetAllTerms("-123"),
- IsOkAndHolds(ElementsAre("-", "123")));
-}
-
-TEST_P(IcuLanguageSegmenterAllLocalesTest, ContinuousWhitespaces) {
- ICING_ASSERT_OK_AND_ASSIGN(
- auto language_segmenter,
- language_segmenter_factory::Create(language_segmenter_factory::ICU4C,
- GetLocale()));
- // Multiple continuous whitespaces are treated as one.
- const int kNumSeparators = 256;
- const std::string text_with_spaces =
- absl_ports::StrCat("Hello", std::string(kNumSeparators, ' '), "World");
- EXPECT_THAT(language_segmenter->GetAllTerms(text_with_spaces),
- IsOkAndHolds(ElementsAre("Hello", " ", "World")));
-}
-
-TEST_P(IcuLanguageSegmenterAllLocalesTest, CJKT) {
- ICING_ASSERT_OK_AND_ASSIGN(
- auto language_segmenter,
- language_segmenter_factory::Create(language_segmenter_factory::ICU4C,
- GetLocale()));
- // CJKT (Chinese, Japanese, Khmer, Thai) are the 4 main languages that don't
- // have whitespaces as word delimiter.
-
- // Chinese
- EXPECT_THAT(language_segmenter->GetAllTerms("我每天走路去上班。"),
- IsOkAndHolds(ElementsAre("我", "每天", "走路", "去", "上班")));
- // Japanese
- EXPECT_THAT(language_segmenter->GetAllTerms("私は毎日仕事に歩いています。"),
- IsOkAndHolds(ElementsAre("私", "は", "毎日", "仕事", "に", "歩",
- "い", "てい", "ます")));
- // Khmer
- EXPECT_THAT(language_segmenter->GetAllTerms("ញុំដើរទៅធ្វើការរាល់ថ្ងៃ។"),
- IsOkAndHolds(ElementsAre("ញុំ", "ដើរទៅ", "ធ្វើការ", "រាល់ថ្ងៃ")));
- // Thai
- EXPECT_THAT(
- language_segmenter->GetAllTerms("ฉันเดินไปทำงานทุกวัน"),
- IsOkAndHolds(ElementsAre("ฉัน", "เดิน", "ไป", "ทำงาน", "ทุก", "วัน")));
-}
-
-TEST_P(IcuLanguageSegmenterAllLocalesTest, LatinLettersWithAccents) {
- ICING_ASSERT_OK_AND_ASSIGN(
- auto language_segmenter,
- language_segmenter_factory::Create(language_segmenter_factory::ICU4C,
- GetLocale()));
- EXPECT_THAT(language_segmenter->GetAllTerms("āăąḃḅḇčćç"),
- IsOkAndHolds(ElementsAre("āăąḃḅḇčćç")));
-}
-
-// TODO(samzheng): test cases for more languages (e.g. top 20 in the world)
-TEST_P(IcuLanguageSegmenterAllLocalesTest, WhitespaceSplitLanguages) {
- ICING_ASSERT_OK_AND_ASSIGN(
- auto language_segmenter,
- language_segmenter_factory::Create(language_segmenter_factory::ICU4C,
- GetLocale()));
- // Turkish
- EXPECT_THAT(language_segmenter->GetAllTerms("merhaba dünya"),
- IsOkAndHolds(ElementsAre("merhaba", " ", "dünya")));
- // Korean
- EXPECT_THAT(
- language_segmenter->GetAllTerms("나는 매일 출근합니다."),
- IsOkAndHolds(ElementsAre("나는", " ", "매일", " ", "출근합니다", ".")));
-}
-
-// TODO(samzheng): more mixed languages test cases
-TEST_P(IcuLanguageSegmenterAllLocalesTest, MixedLanguages) {
- ICING_ASSERT_OK_AND_ASSIGN(
- auto language_segmenter,
- language_segmenter_factory::Create(language_segmenter_factory::ICU4C,
- GetLocale()));
- EXPECT_THAT(language_segmenter->GetAllTerms("How are you你好吗お元気ですか"),
- IsOkAndHolds(ElementsAre("How", " ", "are", " ", "you", "你好",
- "吗", "お", "元気", "です", "か")));
-
- EXPECT_THAT(
- language_segmenter->GetAllTerms("나는 California에 산다"),
- IsOkAndHolds(ElementsAre("나는", " ", "California", "에", " ", "산다")));
-}
-
-TEST_P(IcuLanguageSegmenterAllLocalesTest, NotCopyStrings) {
- ICING_ASSERT_OK_AND_ASSIGN(
- auto language_segmenter,
- language_segmenter_factory::Create(language_segmenter_factory::ICU4C,
- GetLocale()));
- // Validates that the input strings are not copied
- const std::string text = "Hello World";
- const char* word1_address = text.c_str();
- const char* word2_address = text.c_str() + 6;
- ICING_ASSERT_OK_AND_ASSIGN(std::vector<std::string_view> terms,
- language_segmenter->GetAllTerms(text));
- ASSERT_THAT(terms, ElementsAre("Hello", " ", "World"));
- const char* word1_result_address = terms.at(0).data();
- const char* word2_result_address = terms.at(2).data();
-
- // The underlying char* should be the same
- EXPECT_THAT(word1_address, Eq(word1_result_address));
- EXPECT_THAT(word2_address, Eq(word2_result_address));
-}
-
-INSTANTIATE_TEST_SUITE_P(
- LocaleName, IcuLanguageSegmenterAllLocalesTest,
- testing::Values(ULOC_US, ULOC_UK, ULOC_CANADA, ULOC_CANADA_FRENCH,
- ULOC_FRANCE, ULOC_GERMANY, ULOC_ITALY, ULOC_JAPAN,
- ULOC_KOREA, ULOC_SIMPLIFIED_CHINESE,
- ULOC_TRADITIONAL_CHINESE,
- "es_ES", // Spanish
- "hi_IN", // Hindi
- "th_TH", // Thai
- "lo_LA", // Lao
- "km_KH", // Khmer
- "ar_DZ", // Arabic
- "ru_RU", // Russian
- "pt_PT", // Portuguese
- "en_US_POSIX" // American English (Computer)
- "wrong_locale" // Will fall back to ICU default locale
- "" // Will fall back to ICU default locale
- ));
-
-} // namespace
-} // namespace lib
-} // namespace icing
diff --git a/icing/tokenization/language-segmenter-factory.cc b/icing/tokenization/icu/icu-language-segmenter-factory.cc
index 92d06fe..0ef1824 100644
--- a/icing/tokenization/language-segmenter-factory.cc
+++ b/icing/tokenization/icu/icu-language-segmenter-factory.cc
@@ -12,10 +12,8 @@
// See the License for the specific language governing permissions and
// limitations under the License.
+#include "icing/tokenization/icu/icu-language-segmenter.h"
#include "icing/tokenization/language-segmenter-factory.h"
-
-#include "icing/tokenization/icu-language-segmenter.h"
-#include "icing/tokenization/space-language-segmenter.h"
#include "icing/util/logging.h"
namespace icing {
@@ -37,23 +35,18 @@ constexpr std::string_view kLocaleAmericanEnglishComputer = "en_US_POSIX";
// users. Right now illegal locale strings will be ignored by ICU. ICU
// components will be created with its default locale.
libtextclassifier3::StatusOr<std::unique_ptr<LanguageSegmenter>> Create(
- SegmenterType type, std::string locale) {
+ SegmenterOptions options) {
// Word connector rules for "en_US_POSIX" (American English (Computer)) are
// different from other locales. E.g. "email.subject" will be split into 3
// terms in "en_US_POSIX": "email", ".", and "subject", while it's just one
// term in other locales. Our current LanguageSegmenter doesn't handle this
// special rule, so we replace it with "en_US".
- if (locale == kLocaleAmericanEnglishComputer) {
+ if (options.locale == kLocaleAmericanEnglishComputer) {
ICING_LOG(WARNING) << "Locale " << kLocaleAmericanEnglishComputer
<< " not supported. Converting to locale " << ULOC_US;
- locale = ULOC_US;
- }
- switch (type) {
- case ICU4C:
- return std::make_unique<IcuLanguageSegmenter>(std::move(locale));
- case SPACE:
- return std::make_unique<SpaceLanguageSegmenter>();
+ options.locale = ULOC_US;
}
+ return std::make_unique<IcuLanguageSegmenter>(std::move(options.locale));
}
} // namespace language_segmenter_factory
diff --git a/icing/tokenization/icu-language-segmenter.cc b/icing/tokenization/icu/icu-language-segmenter.cc
index 8d6aa76..d43a78d 100644
--- a/icing/tokenization/icu-language-segmenter.cc
+++ b/icing/tokenization/icu/icu-language-segmenter.cc
@@ -12,7 +12,7 @@
// See the License for the specific language governing permissions and
// limitations under the License.
-#include "icing/tokenization/icu-language-segmenter.h"
+#include "icing/tokenization/icu/icu-language-segmenter.h"
#include <cstdint>
#include <memory>
@@ -24,7 +24,8 @@
#include "icing/text_classifier/lib3/utils/base/status.h"
#include "icing/text_classifier/lib3/utils/base/statusor.h"
#include "icing/absl_ports/canonical_errors.h"
-#include "icing/util/icu-i18n-utils.h"
+#include "icing/legacy/core/icing-string-util.h"
+#include "icing/util/i18n-utils.h"
#include "icing/util/status-macros.h"
#include "unicode/ubrk.h"
#include "unicode/uchar.h"
@@ -61,7 +62,7 @@ class IcuLanguageSegmenterIterator : public LanguageSegmenter::Iterator {
}
// Advances to the next term. Returns false if it has reached the end.
- bool Advance() {
+ bool Advance() override {
// Prerequisite check
if (term_end_index_exclusive_ == UBRK_DONE) {
return false;
@@ -77,52 +78,66 @@ class IcuLanguageSegmenterIterator : public LanguageSegmenter::Iterator {
// Reached the end
if (term_end_index_exclusive_ == UBRK_DONE) {
+ MarkAsDone();
return false;
}
- // Rule 1: all ASCII terms will be returned.
- // We know it's a ASCII term by checking the first char.
- if (icu_i18n_utils::IsAscii(text_[term_start_index_])) {
- return true;
- }
-
- UChar32 uchar32 = icu_i18n_utils::GetUChar32At(text_.data(), text_.length(),
- term_start_index_);
- // Rule 2: for non-ASCII terms, only the alphabetic terms are returned.
- // We know it's an alphabetic term by checking the first unicode character.
- if (u_isUAlphabetic(uchar32)) {
- return true;
- } else {
+ if (!IsValidSegment()) {
return Advance();
}
+ return true;
}
// Returns the current term. It can be called only when Advance() returns
// true.
- std::string_view GetTerm() const {
- if (text_[term_start_index_] == kASCIISpace) {
+ std::string_view GetTerm() const override {
+ int term_length = term_end_index_exclusive_ - term_start_index_;
+ if (term_end_index_exclusive_ == UBRK_DONE) {
+ term_length = 0;
+ } else if (text_[term_start_index_] == kASCIISpace) {
// Rule 3: multiple continuous whitespaces are treated as one.
- return std::string_view(&text_[term_start_index_], 1);
+ term_length = 1;
}
- return text_.substr(term_start_index_,
- term_end_index_exclusive_ - term_start_index_);
+ return text_.substr(term_start_index_, term_length);
}
libtextclassifier3::StatusOr<int32_t> ResetToTermStartingAfter(
- int32_t offset) {
+ int32_t offset) override {
+ if (offset < 0 || offset >= text_.length()) {
+ return absl_ports::InvalidArgumentError(IcingStringUtil::StringPrintf(
+ "Illegal offset provided! Offset %d is not within bounds of string "
+ "of length %zu",
+ offset, text_.length()));
+ }
term_start_index_ = ubrk_following(break_iterator_, offset);
if (term_start_index_ == UBRK_DONE) {
- return absl_ports::NotFoundError("");
+ MarkAsDone();
+ return absl_ports::NotFoundError(IcingStringUtil::StringPrintf(
+ "No segments begin after provided offset %d.", offset));
}
term_end_index_exclusive_ = ubrk_next(break_iterator_);
if (term_end_index_exclusive_ == UBRK_DONE) {
- return absl_ports::NotFoundError("");
+ MarkAsDone();
+ return absl_ports::NotFoundError(IcingStringUtil::StringPrintf(
+ "No segments begin after provided offset %d.", offset));
+ }
+ if (!IsValidSegment()) {
+ if (!Advance()) {
+ return absl_ports::NotFoundError(IcingStringUtil::StringPrintf(
+ "No segments begin after provided offset %d.", offset));
+ }
}
return term_start_index_;
}
libtextclassifier3::StatusOr<int32_t> ResetToTermEndingBefore(
- int32_t offset) {
+ int32_t offset) override {
+ if (offset < 0 || offset >= text_.length()) {
+ return absl_ports::InvalidArgumentError(IcingStringUtil::StringPrintf(
+ "Illegal offset provided! Offset %d is not within bounds of string "
+ "of length %zu",
+ offset, text_.length()));
+ }
ICING_RETURN_IF_ERROR(ResetToTermStartingBefore(offset));
if (term_end_index_exclusive_ > offset) {
// This term ends after offset. So we need to get the term just before
@@ -132,6 +147,15 @@ class IcuLanguageSegmenterIterator : public LanguageSegmenter::Iterator {
return term_start_index_;
}
+ libtextclassifier3::StatusOr<int32_t> ResetToStart() override {
+ term_start_index_ = 0;
+ term_end_index_exclusive_ = 0;
+ if (!Advance()) {
+ return absl_ports::NotFoundError("");
+ }
+ return term_start_index_;
+ }
+
private:
explicit IcuLanguageSegmenterIterator(std::string_view text,
std::string_view locale)
@@ -155,15 +179,43 @@ class IcuLanguageSegmenterIterator : public LanguageSegmenter::Iterator {
libtextclassifier3::Status ResetToTermStartingBefore(int32_t offset) {
term_start_index_ = ubrk_preceding(break_iterator_, offset);
if (term_start_index_ == UBRK_DONE) {
+ MarkAsDone();
return absl_ports::NotFoundError("");
}
term_end_index_exclusive_ = ubrk_next(break_iterator_);
if (term_end_index_exclusive_ == UBRK_DONE) {
+ MarkAsDone();
return absl_ports::NotFoundError("");
}
return libtextclassifier3::Status::OK;
}
+ // Ensures that all members are consistent with the 'Done' state.
+ // In the 'Done' state, term_start_index_ will point to the first character
+  // and term_end_index_exclusive_ will be marked with the UBRK_DONE value.
+ // break_iterator_ may be in any state.
+ void MarkAsDone() {
+ term_end_index_exclusive_ = UBRK_DONE;
+ term_start_index_ = 0;
+ }
+
+ bool IsValidSegment() const {
+ // Rule 1: all ASCII terms will be returned.
+    // We know it's an ASCII term by checking the first char.
+ if (i18n_utils::IsAscii(text_[term_start_index_])) {
+ return true;
+ }
+
+ UChar32 uchar32 = i18n_utils::GetUChar32At(text_.data(), text_.length(),
+ term_start_index_);
+ // Rule 2: for non-ASCII terms, only the alphabetic terms are returned.
+ // We know it's an alphabetic term by checking the first unicode character.
+ if (u_isUAlphabetic(uchar32)) {
+ return true;
+ }
+ return false;
+ }
+
// The underlying class that does the segmentation, ubrk_close() must be
// called after using.
UBreakIterator* break_iterator_;
diff --git a/icing/tokenization/icu-language-segmenter.h b/icing/tokenization/icu/icu-language-segmenter.h
index b3d1acc..4115461 100644
--- a/icing/tokenization/icu-language-segmenter.h
+++ b/icing/tokenization/icu/icu-language-segmenter.h
@@ -12,8 +12,8 @@
// See the License for the specific language governing permissions and
// limitations under the License.
-#ifndef ICING_TOKENIZATION_ICU_LANGUAGE_SEGMENTER_H_
-#define ICING_TOKENIZATION_ICU_LANGUAGE_SEGMENTER_H_
+#ifndef ICING_TOKENIZATION_ICU_ICU_LANGUAGE_SEGMENTER_H_
+#define ICING_TOKENIZATION_ICU_ICU_LANGUAGE_SEGMENTER_H_
#include <cstdint>
#include <memory>
@@ -76,4 +76,4 @@ class IcuLanguageSegmenter : public LanguageSegmenter {
} // namespace lib
} // namespace icing
-#endif // ICING_TOKENIZATION_ICU_LANGUAGE_SEGMENTER_H_
+#endif // ICING_TOKENIZATION_ICU_ICU_LANGUAGE_SEGMENTER_H_
diff --git a/icing/tokenization/icu/icu-language-segmenter_test.cc b/icing/tokenization/icu/icu-language-segmenter_test.cc
new file mode 100644
index 0000000..31c2726
--- /dev/null
+++ b/icing/tokenization/icu/icu-language-segmenter_test.cc
@@ -0,0 +1,1016 @@
+// Copyright (C) 2019 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "gmock/gmock.h"
+#include "gtest/gtest.h"
+#include "icing/absl_ports/str_cat.h"
+#include "icing/helpers/icu/icu-data-file-helper.h"
+#include "icing/testing/common-matchers.h"
+#include "icing/testing/icu-i18n-test-utils.h"
+#include "icing/testing/test-data.h"
+#include "icing/tokenization/language-segmenter-factory.h"
+#include "icing/tokenization/language-segmenter.h"
+#include "unicode/uloc.h"
+
+namespace icing {
+namespace lib {
+namespace {
+using ::testing::ElementsAre;
+using ::testing::Eq;
+using ::testing::IsEmpty;
+
+// Returns a vector containing all terms retrieved by Advancing on the iterator.
+std::vector<std::string_view> GetAllTermsAdvance(
+ LanguageSegmenter::Iterator* itr) {
+ std::vector<std::string_view> terms;
+ while (itr->Advance()) {
+ terms.push_back(itr->GetTerm());
+ }
+ return terms;
+}
+
+// Returns a vector containing all terms retrieved by calling
+// ResetToStart/ResetAfter with the current position to simulate Advancing on
+// the iterator.
+std::vector<std::string_view> GetAllTermsResetAfter(
+ LanguageSegmenter::Iterator* itr) {
+ std::vector<std::string_view> terms;
+ if (!itr->ResetToStart().ok()) {
+ return terms;
+ }
+ terms.push_back(itr->GetTerm());
+ const char* text_begin = itr->GetTerm().data();
+ // Calling ResetToTermStartingAfter with the current position should get the
+ // very next term in the sequence.
+ for (int current_pos = 0; itr->ResetToTermStartingAfter(current_pos).ok();
+ current_pos = itr->GetTerm().data() - text_begin) {
+ terms.push_back(itr->GetTerm());
+ }
+ return terms;
+}
+
+// Returns a vector containing all terms retrieved by alternating calls to
+// Advance and calls to ResetAfter with the current position to simulate
+// Advancing.
+std::vector<std::string_view> GetAllTermsAdvanceAndResetAfter(
+ LanguageSegmenter::Iterator* itr) {
+ const char* text_begin = itr->GetTerm().data();
+ std::vector<std::string_view> terms;
+
+ bool is_ok = true;
+ int current_pos = 0;
+ while (is_ok) {
+ // Alternate between using Advance and ResetToTermAfter.
+ if (terms.size() % 2 == 0) {
+ is_ok = itr->Advance();
+ } else {
+ // Calling ResetToTermStartingAfter with the current position should get
+ // the very next term in the sequence.
+ current_pos = itr->GetTerm().data() - text_begin;
+ is_ok = itr->ResetToTermStartingAfter(current_pos).ok();
+ }
+ if (is_ok) {
+ terms.push_back(itr->GetTerm());
+ }
+ }
+ return terms;
+}
+
+// Returns a vector containing all terms retrieved by calling ResetBefore with
+// the current position, starting at the end of the text. This vector should be
+// in reverse order of GetAllTerms and missing the last term.
+std::vector<std::string_view> GetAllTermsResetBefore(
+ LanguageSegmenter::Iterator* itr) {
+ const char* text_begin = itr->GetTerm().data();
+ int last_pos = 0;
+ while (itr->Advance()) {
+ last_pos = itr->GetTerm().data() - text_begin;
+ }
+ std::vector<std::string_view> terms;
+ // Calling ResetToTermEndingBefore with the current position should get the
+ // previous term in the sequence.
+ for (int current_pos = last_pos;
+ itr->ResetToTermEndingBefore(current_pos).ok();
+ current_pos = itr->GetTerm().data() - text_begin) {
+ terms.push_back(itr->GetTerm());
+ }
+ return terms;
+}
+
+class IcuLanguageSegmenterAllLocalesTest
+ : public testing::TestWithParam<const char*> {
+ protected:
+ void SetUp() override {
+ ICING_ASSERT_OK(
+ // File generated via icu_data_file rule in //icing/BUILD.
+ icu_data_file_helper::SetUpICUDataFile(
+ GetTestFilePath("icing/icu.dat")));
+ }
+
+ static std::string GetLocale() { return GetParam(); }
+ static language_segmenter_factory::SegmenterOptions GetOptions() {
+ return language_segmenter_factory::SegmenterOptions(GetLocale());
+ }
+};
+
+TEST_P(IcuLanguageSegmenterAllLocalesTest, EmptyText) {
+ ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter,
+ language_segmenter_factory::Create(GetOptions()));
+ EXPECT_THAT(language_segmenter->GetAllTerms(""), IsOkAndHolds(IsEmpty()));
+}
+
+TEST_P(IcuLanguageSegmenterAllLocalesTest, SimpleText) {
+ ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter,
+ language_segmenter_factory::Create(GetOptions()));
+ EXPECT_THAT(language_segmenter->GetAllTerms("Hello World"),
+ IsOkAndHolds(ElementsAre("Hello", " ", "World")));
+}
+
+TEST_P(IcuLanguageSegmenterAllLocalesTest, ASCII_Punctuation) {
+ ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter,
+ language_segmenter_factory::Create(GetOptions()));
+ // ASCII punctuation marks are kept
+ EXPECT_THAT(
+ language_segmenter->GetAllTerms("Hello, World!!!"),
+ IsOkAndHolds(ElementsAre("Hello", ",", " ", "World", "!", "!", "!")));
+ EXPECT_THAT(language_segmenter->GetAllTerms("Open-source project"),
+ IsOkAndHolds(ElementsAre("Open", "-", "source", " ", "project")));
+ EXPECT_THAT(language_segmenter->GetAllTerms("100%"),
+ IsOkAndHolds(ElementsAre("100", "%")));
+ EXPECT_THAT(language_segmenter->GetAllTerms("A&B"),
+ IsOkAndHolds(ElementsAre("A", "&", "B")));
+}
+
+TEST_P(IcuLanguageSegmenterAllLocalesTest, ASCII_SpecialCharacter) {
+ ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter,
+ language_segmenter_factory::Create(GetOptions()));
+ // ASCII special characters are kept
+ EXPECT_THAT(language_segmenter->GetAllTerms("Pay $1000"),
+ IsOkAndHolds(ElementsAre("Pay", " ", "$", "1000")));
+ EXPECT_THAT(language_segmenter->GetAllTerms("A+B"),
+ IsOkAndHolds(ElementsAre("A", "+", "B")));
+ // 0x0009 is the unicode for tab (within ASCII range).
+ std::string text_with_tab = absl_ports::StrCat(
+ "Hello", UCharToString(0x0009), UCharToString(0x0009), "World");
+ EXPECT_THAT(language_segmenter->GetAllTerms(text_with_tab),
+ IsOkAndHolds(ElementsAre("Hello", UCharToString(0x0009),
+ UCharToString(0x0009), "World")));
+}
+
+TEST_P(IcuLanguageSegmenterAllLocalesTest, Non_ASCII_Non_Alphabetic) {
+ ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter,
+ language_segmenter_factory::Create(GetOptions()));
+ // Full-width (non-ASCII) punctuation marks and special characters are left
+ // out.
+ EXPECT_THAT(language_segmenter->GetAllTerms("。?·Hello!×"),
+ IsOkAndHolds(ElementsAre("Hello")));
+}
+
+TEST_P(IcuLanguageSegmenterAllLocalesTest, Acronym) {
+ ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter,
+ language_segmenter_factory::Create(GetOptions()));
+ EXPECT_THAT(language_segmenter->GetAllTerms("U.S. Bank"),
+ IsOkAndHolds(ElementsAre("U.S", ".", " ", "Bank")));
+ EXPECT_THAT(language_segmenter->GetAllTerms("I.B.M."),
+ IsOkAndHolds(ElementsAre("I.B.M", ".")));
+ EXPECT_THAT(language_segmenter->GetAllTerms("I,B,M"),
+ IsOkAndHolds(ElementsAre("I", ",", "B", ",", "M")));
+ EXPECT_THAT(language_segmenter->GetAllTerms("I B M"),
+ IsOkAndHolds(ElementsAre("I", " ", "B", " ", "M")));
+}
+
+TEST_P(IcuLanguageSegmenterAllLocalesTest, WordConnector) {
+ ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter,
+ language_segmenter_factory::Create(GetOptions()));
+ // According to unicode word break rules
+ // WB6(https://unicode.org/reports/tr29/#WB6),
+ // WB7(https://unicode.org/reports/tr29/#WB7), and a few others, some
+  // punctuation characters are used as word connectors. That is, words don't
+ // break before and after them. Here we just test some that we care about.
+
+  // Word connectors
+ EXPECT_THAT(language_segmenter->GetAllTerms("com.google.android"),
+ IsOkAndHolds(ElementsAre("com.google.android")));
+ EXPECT_THAT(language_segmenter->GetAllTerms("com:google:android"),
+ IsOkAndHolds(ElementsAre("com:google:android")));
+ EXPECT_THAT(language_segmenter->GetAllTerms("com'google'android"),
+ IsOkAndHolds(ElementsAre("com'google'android")));
+ EXPECT_THAT(language_segmenter->GetAllTerms("com_google_android"),
+ IsOkAndHolds(ElementsAre("com_google_android")));
+
+  // Word connectors can be mixed
+ EXPECT_THAT(language_segmenter->GetAllTerms("com.google.android:icing"),
+ IsOkAndHolds(ElementsAre("com.google.android:icing")));
+
+  // Any leading and trailing characters are not connectors
+ EXPECT_THAT(language_segmenter->GetAllTerms(".com.google.android."),
+ IsOkAndHolds(ElementsAre(".", "com.google.android", ".")));
+
+  // Not word connectors
+ EXPECT_THAT(language_segmenter->GetAllTerms("com,google,android"),
+ IsOkAndHolds(ElementsAre("com", ",", "google", ",", "android")));
+ EXPECT_THAT(language_segmenter->GetAllTerms("com-google-android"),
+ IsOkAndHolds(ElementsAre("com", "-", "google", "-", "android")));
+ EXPECT_THAT(language_segmenter->GetAllTerms("com+google+android"),
+ IsOkAndHolds(ElementsAre("com", "+", "google", "+", "android")));
+ EXPECT_THAT(language_segmenter->GetAllTerms("com*google*android"),
+ IsOkAndHolds(ElementsAre("com", "*", "google", "*", "android")));
+ EXPECT_THAT(language_segmenter->GetAllTerms("com@google@android"),
+ IsOkAndHolds(ElementsAre("com", "@", "google", "@", "android")));
+ EXPECT_THAT(language_segmenter->GetAllTerms("com^google^android"),
+ IsOkAndHolds(ElementsAre("com", "^", "google", "^", "android")));
+ EXPECT_THAT(language_segmenter->GetAllTerms("com&google&android"),
+ IsOkAndHolds(ElementsAre("com", "&", "google", "&", "android")));
+ EXPECT_THAT(language_segmenter->GetAllTerms("com|google|android"),
+ IsOkAndHolds(ElementsAre("com", "|", "google", "|", "android")));
+ EXPECT_THAT(language_segmenter->GetAllTerms("com/google/android"),
+ IsOkAndHolds(ElementsAre("com", "/", "google", "/", "android")));
+ EXPECT_THAT(language_segmenter->GetAllTerms("com;google;android"),
+ IsOkAndHolds(ElementsAre("com", ";", "google", ";", "android")));
+ EXPECT_THAT(
+ language_segmenter->GetAllTerms("com\"google\"android"),
+ IsOkAndHolds(ElementsAre("com", "\"", "google", "\"", "android")));
+}
+
+TEST_P(IcuLanguageSegmenterAllLocalesTest, Apostrophes) {
+ ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter,
+ language_segmenter_factory::Create(GetOptions()));
+ EXPECT_THAT(language_segmenter->GetAllTerms("It's ok."),
+ IsOkAndHolds(ElementsAre("It's", " ", "ok", ".")));
+ EXPECT_THAT(language_segmenter->GetAllTerms("He'll be back."),
+ IsOkAndHolds(ElementsAre("He'll", " ", "be", " ", "back", ".")));
+ EXPECT_THAT(language_segmenter->GetAllTerms("'Hello 'World."),
+ IsOkAndHolds(ElementsAre("'", "Hello", " ", "'", "World", ".")));
+ EXPECT_THAT(language_segmenter->GetAllTerms("The dogs' bone"),
+ IsOkAndHolds(ElementsAre("The", " ", "dogs", "'", " ", "bone")));
+ // 0x2019 is the single right quote, should be treated the same as "'"
+ std::string token_with_quote =
+ absl_ports::StrCat("He", UCharToString(0x2019), "ll");
+ std::string text_with_quote =
+ absl_ports::StrCat(token_with_quote, " be back.");
+ EXPECT_THAT(
+ language_segmenter->GetAllTerms(text_with_quote),
+ IsOkAndHolds(ElementsAre(token_with_quote, " ", "be", " ", "back", ".")));
+}
+
+TEST_P(IcuLanguageSegmenterAllLocalesTest, Parentheses) {
+ ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter,
+ language_segmenter_factory::Create(GetOptions()));
+
+ EXPECT_THAT(language_segmenter->GetAllTerms("(Hello)"),
+ IsOkAndHolds(ElementsAre("(", "Hello", ")")));
+
+ EXPECT_THAT(language_segmenter->GetAllTerms(")Hello("),
+ IsOkAndHolds(ElementsAre(")", "Hello", "(")));
+}
+
+TEST_P(IcuLanguageSegmenterAllLocalesTest, Quotes) {
+ ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter,
+ language_segmenter_factory::Create(GetOptions()));
+
+ EXPECT_THAT(language_segmenter->GetAllTerms("\"Hello\""),
+ IsOkAndHolds(ElementsAre("\"", "Hello", "\"")));
+
+ EXPECT_THAT(language_segmenter->GetAllTerms("'Hello'"),
+ IsOkAndHolds(ElementsAre("'", "Hello", "'")));
+}
+
+TEST_P(IcuLanguageSegmenterAllLocalesTest, Alphanumeric) {
+ ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter,
+ language_segmenter_factory::Create(GetOptions()));
+
+ // Alphanumeric terms are allowed
+ EXPECT_THAT(language_segmenter->GetAllTerms("Se7en A4 3a"),
+ IsOkAndHolds(ElementsAre("Se7en", " ", "A4", " ", "3a")));
+}
+
+TEST_P(IcuLanguageSegmenterAllLocalesTest, Number) {
+ ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter,
+ language_segmenter_factory::Create(GetOptions()));
+
+ // Alphanumeric terms are allowed
+ EXPECT_THAT(
+ language_segmenter->GetAllTerms("3.141592653589793238462643383279"),
+ IsOkAndHolds(ElementsAre("3.141592653589793238462643383279")));
+
+ EXPECT_THAT(language_segmenter->GetAllTerms("3,456.789"),
+ IsOkAndHolds(ElementsAre("3,456.789")));
+
+ EXPECT_THAT(language_segmenter->GetAllTerms("-123"),
+ IsOkAndHolds(ElementsAre("-", "123")));
+}
+
+TEST_P(IcuLanguageSegmenterAllLocalesTest, ContinuousWhitespaces) {
+ ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter,
+ language_segmenter_factory::Create(GetOptions()));
+ // Multiple continuous whitespaces are treated as one.
+ const int kNumSeparators = 256;
+ std::string text_with_spaces =
+ absl_ports::StrCat("Hello", std::string(kNumSeparators, ' '), "World");
+ EXPECT_THAT(language_segmenter->GetAllTerms(text_with_spaces),
+ IsOkAndHolds(ElementsAre("Hello", " ", "World")));
+
+ // Multiple continuous whitespaces are treated as one. Whitespace at the
+ // beginning of the text doesn't affect the results of GetTerm() after the
+ // iterator is done.
+ text_with_spaces = absl_ports::StrCat(std::string(kNumSeparators, ' '),
+ "Hello", " ", "World");
+ ICING_ASSERT_OK_AND_ASSIGN(auto itr,
+ language_segmenter->Segment(text_with_spaces));
+ std::vector<std::string_view> terms;
+ while (itr->Advance()) {
+ terms.push_back(itr->GetTerm());
+ }
+ EXPECT_THAT(terms, ElementsAre(" ", "Hello", " ", "World"));
+ EXPECT_THAT(itr->GetTerm(), IsEmpty());
+}
+
+TEST_P(IcuLanguageSegmenterAllLocalesTest, CJKT) {
+ ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter,
+ language_segmenter_factory::Create(GetOptions()));
+ // CJKT (Chinese, Japanese, Khmer, Thai) are the 4 main languages that don't
+ // have whitespaces as word delimiter.
+
+ // Chinese
+ EXPECT_THAT(language_segmenter->GetAllTerms("我每天走路去上班。"),
+ IsOkAndHolds(ElementsAre("我", "每天", "走路", "去", "上班")));
+ // Japanese
+ EXPECT_THAT(language_segmenter->GetAllTerms("私は毎日仕事に歩いています。"),
+ IsOkAndHolds(ElementsAre("私", "は", "毎日", "仕事", "に", "歩",
+ "い", "てい", "ます")));
+ // Khmer
+ EXPECT_THAT(language_segmenter->GetAllTerms("ញុំដើរទៅធ្វើការរាល់ថ្ងៃ។"),
+ IsOkAndHolds(ElementsAre("ញុំ", "ដើរទៅ", "ធ្វើការ", "រាល់ថ្ងៃ")));
+ // Thai
+ EXPECT_THAT(
+ language_segmenter->GetAllTerms("ฉันเดินไปทำงานทุกวัน"),
+ IsOkAndHolds(ElementsAre("ฉัน", "เดิน", "ไป", "ทำงาน", "ทุก", "วัน")));
+}
+
+TEST_P(IcuLanguageSegmenterAllLocalesTest, LatinLettersWithAccents) {
+ ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter,
+ language_segmenter_factory::Create(GetOptions()));
+ EXPECT_THAT(language_segmenter->GetAllTerms("āăąḃḅḇčćç"),
+ IsOkAndHolds(ElementsAre("āăąḃḅḇčćç")));
+}
+
+// TODO(samzheng): test cases for more languages (e.g. top 20 in the world)
+TEST_P(IcuLanguageSegmenterAllLocalesTest, WhitespaceSplitLanguages) {
+ ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter,
+ language_segmenter_factory::Create(GetOptions()));
+ // Turkish
+ EXPECT_THAT(language_segmenter->GetAllTerms("merhaba dünya"),
+ IsOkAndHolds(ElementsAre("merhaba", " ", "dünya")));
+ // Korean
+ EXPECT_THAT(
+ language_segmenter->GetAllTerms("나는 매일 출근합니다."),
+ IsOkAndHolds(ElementsAre("나는", " ", "매일", " ", "출근합니다", ".")));
+}
+
+// TODO(samzheng): more mixed languages test cases
+TEST_P(IcuLanguageSegmenterAllLocalesTest, MixedLanguages) {
+ ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter,
+ language_segmenter_factory::Create(GetOptions()));
+ EXPECT_THAT(language_segmenter->GetAllTerms("How are you你好吗お元気ですか"),
+ IsOkAndHolds(ElementsAre("How", " ", "are", " ", "you", "你好",
+ "吗", "お", "元気", "です", "か")));
+
+ EXPECT_THAT(
+ language_segmenter->GetAllTerms("나는 California에 산다"),
+ IsOkAndHolds(ElementsAre("나는", " ", "California", "에", " ", "산다")));
+}
+
+TEST_P(IcuLanguageSegmenterAllLocalesTest, NotCopyStrings) {
+ ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter,
+ language_segmenter_factory::Create(GetOptions()));
+ // Validates that the input strings are not copied
+ const std::string text = "Hello World";
+ const char* word1_address = text.c_str();
+ const char* word2_address = text.c_str() + 6;
+ ICING_ASSERT_OK_AND_ASSIGN(std::vector<std::string_view> terms,
+ language_segmenter->GetAllTerms(text));
+ ASSERT_THAT(terms, ElementsAre("Hello", " ", "World"));
+ const char* word1_result_address = terms.at(0).data();
+ const char* word2_result_address = terms.at(2).data();
+
+ // The underlying char* should be the same
+ EXPECT_THAT(word1_address, Eq(word1_result_address));
+ EXPECT_THAT(word2_address, Eq(word2_result_address));
+}
+
+TEST_P(IcuLanguageSegmenterAllLocalesTest, ResetToTermAfterOutOfBounds) {
+ ICING_ASSERT_OK_AND_ASSIGN(auto segmenter,
+ language_segmenter_factory::Create(GetOptions()));
+ constexpr std::string_view kText = "How are you你好吗お元気ですか";
+ ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<LanguageSegmenter::Iterator> itr,
+ segmenter->Segment(kText));
+
+ // String: "How are you你好吗お元気ですか"
+ // ^ ^^ ^^ ^ ^ ^ ^ ^ ^
+ // Bytes: 0 3 4 7 8 11 172023 29 35
+ ASSERT_THAT(itr->ResetToTermStartingAfter(7), IsOkAndHolds(Eq(8)));
+ ASSERT_THAT(itr->GetTerm(), Eq("you"));
+
+ EXPECT_THAT(itr->ResetToTermStartingAfter(-1),
+ StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
+ EXPECT_THAT(itr->GetTerm(), Eq("you"));
+
+ EXPECT_THAT(itr->ResetToTermStartingAfter(kText.length()),
+ StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
+ EXPECT_THAT(itr->GetTerm(), Eq("you"));
+}
+
+// Tests that ResetToTermAfter and Advance produce the same output. With the
+// exception of the first term which is inaccessible via ResetToTermAfter,
+// the stream of terms produced by Advance calls should exactly match the
+// terms produced by ResetToTermAfter calls with the current position
+// provided as the argument.
+TEST_P(IcuLanguageSegmenterAllLocalesTest,
+ MixedLanguagesResetToTermAfterEquivalentToAdvance) {
+ ICING_ASSERT_OK_AND_ASSIGN(auto segmenter,
+ language_segmenter_factory::Create(GetOptions()));
+ constexpr std::string_view kText = "How are𡔖 you你好吗お元気ですか";
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<LanguageSegmenter::Iterator> advance_itr,
+ segmenter->Segment(kText));
+ std::vector<std::string_view> advance_terms =
+ GetAllTermsAdvance(advance_itr.get());
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<LanguageSegmenter::Iterator> reset_to_term_itr,
+ segmenter->Segment(kText));
+ std::vector<std::string_view> reset_terms =
+ GetAllTermsResetAfter(reset_to_term_itr.get());
+
+ EXPECT_THAT(reset_terms, testing::ElementsAreArray(advance_terms));
+ EXPECT_THAT(reset_to_term_itr->GetTerm(), Eq(advance_itr->GetTerm()));
+}
+
+TEST_P(IcuLanguageSegmenterAllLocalesTest,
+ ThaiResetToTermAfterEquivalentToAdvance) {
+ ICING_ASSERT_OK_AND_ASSIGN(auto segmenter,
+ language_segmenter_factory::Create(GetOptions()));
+ constexpr std::string_view kThai = "ฉันเดินไปทำงานทุกวัน";
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<LanguageSegmenter::Iterator> advance_itr,
+ segmenter->Segment(kThai));
+ std::vector<std::string_view> advance_terms =
+ GetAllTermsAdvance(advance_itr.get());
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<LanguageSegmenter::Iterator> reset_to_term_itr,
+ segmenter->Segment(kThai));
+ std::vector<std::string_view> reset_terms =
+ GetAllTermsResetAfter(reset_to_term_itr.get());
+
+ EXPECT_THAT(reset_terms, testing::ElementsAreArray(advance_terms));
+ EXPECT_THAT(reset_to_term_itr->GetTerm(), Eq(advance_itr->GetTerm()));
+}
+
+TEST_P(IcuLanguageSegmenterAllLocalesTest,
+ KoreanResetToTermAfterEquivalentToAdvance) {
+ ICING_ASSERT_OK_AND_ASSIGN(auto segmenter,
+ language_segmenter_factory::Create(GetOptions()));
+ constexpr std::string_view kKorean = "나는 매일 출근합니다.";
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<LanguageSegmenter::Iterator> advance_itr,
+ segmenter->Segment(kKorean));
+ std::vector<std::string_view> advance_terms =
+ GetAllTermsAdvance(advance_itr.get());
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<LanguageSegmenter::Iterator> reset_to_term_itr,
+ segmenter->Segment(kKorean));
+ std::vector<std::string_view> reset_terms =
+ GetAllTermsResetAfter(reset_to_term_itr.get());
+
+ EXPECT_THAT(reset_terms, testing::ElementsAreArray(advance_terms));
+ EXPECT_THAT(reset_to_term_itr->GetTerm(), Eq(advance_itr->GetTerm()));
+}
+
+// Tests that ResetToTermAfter and Advance can be used in conjunction. Just as
+// ResetToTermAfter(current_position) can be used to simulate Advance, users
+// should be able to mix ResetToTermAfter(current_position) calls and Advance
+// calls to mimic calling Advance.
+TEST_P(IcuLanguageSegmenterAllLocalesTest,
+ MixedLanguagesResetToTermAfterInteroperableWithAdvance) {
+ ICING_ASSERT_OK_AND_ASSIGN(auto segmenter,
+ language_segmenter_factory::Create(GetOptions()));
+ constexpr std::string_view kText = "How are𡔖 you你好吗お元気ですか";
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<LanguageSegmenter::Iterator> advance_itr,
+ segmenter->Segment(kText));
+ std::vector<std::string_view> advance_terms =
+ GetAllTermsAdvance(advance_itr.get());
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<LanguageSegmenter::Iterator> advance_and_reset_itr,
+ segmenter->Segment(kText));
+ std::vector<std::string_view> advance_and_reset_terms =
+ GetAllTermsAdvanceAndResetAfter(advance_and_reset_itr.get());
+
+ EXPECT_THAT(advance_and_reset_terms,
+ testing::ElementsAreArray(advance_terms));
+ EXPECT_THAT(advance_and_reset_itr->GetTerm(), Eq(advance_itr->GetTerm()));
+}
+
+TEST_P(IcuLanguageSegmenterAllLocalesTest,
+ ThaiResetToTermAfterInteroperableWithAdvance) {
+ ICING_ASSERT_OK_AND_ASSIGN(auto segmenter,
+ language_segmenter_factory::Create(GetOptions()));
+ constexpr std::string_view kThai = "ฉันเดินไปทำงานทุกวัน";
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<LanguageSegmenter::Iterator> advance_itr,
+ segmenter->Segment(kThai));
+ std::vector<std::string_view> advance_terms =
+ GetAllTermsAdvance(advance_itr.get());
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<LanguageSegmenter::Iterator> advance_and_reset_itr,
+ segmenter->Segment(kThai));
+ std::vector<std::string_view> advance_and_reset_terms =
+ GetAllTermsAdvanceAndResetAfter(advance_and_reset_itr.get());
+
+ EXPECT_THAT(advance_and_reset_terms,
+ testing::ElementsAreArray(advance_terms));
+ EXPECT_THAT(advance_and_reset_itr->GetTerm(), Eq(advance_itr->GetTerm()));
+}
+
+TEST_P(IcuLanguageSegmenterAllLocalesTest,
+ KoreanResetToTermAfterInteroperableWithAdvance) {
+ ICING_ASSERT_OK_AND_ASSIGN(auto segmenter,
+ language_segmenter_factory::Create(GetOptions()));
+ constexpr std::string_view kKorean = "나는 매일 출근합니다.";
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<LanguageSegmenter::Iterator> advance_itr,
+ segmenter->Segment(kKorean));
+ std::vector<std::string_view> advance_terms =
+ GetAllTermsAdvance(advance_itr.get());
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<LanguageSegmenter::Iterator> advance_and_reset_itr,
+ segmenter->Segment(kKorean));
+ std::vector<std::string_view> advance_and_reset_terms =
+ GetAllTermsAdvanceAndResetAfter(advance_and_reset_itr.get());
+
+ EXPECT_THAT(advance_and_reset_terms,
+ testing::ElementsAreArray(advance_terms));
+ EXPECT_THAT(advance_and_reset_itr->GetTerm(), Eq(advance_itr->GetTerm()));
+}
+
+TEST_P(IcuLanguageSegmenterAllLocalesTest, MixedLanguagesResetToTermAfter) {
+ ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter,
+ language_segmenter_factory::Create(GetOptions()));
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<LanguageSegmenter::Iterator> itr,
+ language_segmenter->Segment("How are you你好吗お元気ですか"));
+
+ // String: "How are you你好吗お元気ですか"
+ // ^ ^^ ^^ ^ ^ ^ ^ ^ ^
+ // Bytes: 0 3 4 7 8 11 172023 29 35
+ EXPECT_THAT(itr->ResetToTermStartingAfter(2), IsOkAndHolds(Eq(3)));
+ EXPECT_THAT(itr->GetTerm(), Eq(" "));
+
+ EXPECT_THAT(itr->ResetToTermStartingAfter(10), IsOkAndHolds(Eq(11)));
+ EXPECT_THAT(itr->GetTerm(), Eq("你好"));
+
+ EXPECT_THAT(itr->ResetToTermStartingAfter(7), IsOkAndHolds(Eq(8)));
+ EXPECT_THAT(itr->GetTerm(), Eq("you"));
+
+ EXPECT_THAT(itr->ResetToTermStartingAfter(32), IsOkAndHolds(Eq(35)));
+ EXPECT_THAT(itr->GetTerm(), Eq("か"));
+
+ EXPECT_THAT(itr->ResetToTermStartingAfter(14), IsOkAndHolds(Eq(17)));
+ EXPECT_THAT(itr->GetTerm(), Eq("吗"));
+
+ EXPECT_THAT(itr->ResetToTermStartingAfter(0), IsOkAndHolds(Eq(3)));
+ EXPECT_THAT(itr->GetTerm(), Eq(" "));
+
+ EXPECT_THAT(itr->ResetToTermStartingAfter(35),
+ StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
+ EXPECT_THAT(itr->GetTerm(), IsEmpty());
+}
+
+TEST_P(IcuLanguageSegmenterAllLocalesTest,
+ ContinuousWhitespacesResetToTermAfter) {
+ ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter,
+ language_segmenter_factory::Create(GetOptions()));
+ // Multiple continuous whitespaces are treated as one.
+ constexpr std::string_view kTextWithSpace = "Hello World";
+ ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<LanguageSegmenter::Iterator> itr,
+ language_segmenter->Segment(kTextWithSpace));
+
+ // String: "Hello World"
+ // ^ ^ ^
+ // Bytes: 0 5 15
+ EXPECT_THAT(itr->ResetToTermStartingAfter(0), IsOkAndHolds(Eq(5)));
+ EXPECT_THAT(itr->GetTerm(), Eq(" "));
+
+ EXPECT_THAT(itr->ResetToTermStartingAfter(2), IsOkAndHolds(Eq(5)));
+ EXPECT_THAT(itr->GetTerm(), Eq(" "));
+
+ EXPECT_THAT(itr->ResetToTermStartingAfter(10), IsOkAndHolds(Eq(15)));
+ EXPECT_THAT(itr->GetTerm(), Eq("World"));
+
+ EXPECT_THAT(itr->ResetToTermStartingAfter(5), IsOkAndHolds(Eq(15)));
+ EXPECT_THAT(itr->GetTerm(), Eq("World"));
+
+ EXPECT_THAT(itr->ResetToTermStartingAfter(15),
+ StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
+ EXPECT_THAT(itr->GetTerm(), IsEmpty());
+
+ EXPECT_THAT(itr->ResetToTermStartingAfter(17),
+ StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
+ EXPECT_THAT(itr->GetTerm(), IsEmpty());
+
+ EXPECT_THAT(itr->ResetToTermStartingAfter(19),
+ StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
+ EXPECT_THAT(itr->GetTerm(), IsEmpty());
+}
+
+TEST_P(IcuLanguageSegmenterAllLocalesTest, ChineseResetToTermAfter) {
+ ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter,
+ language_segmenter_factory::Create(GetOptions()));
+ // CJKT (Chinese, Japanese, Khmer, Thai) are the 4 main languages that
+ // don't have whitespaces as word delimiter. Chinese
+ constexpr std::string_view kChinese = "我每天走路去上班。";
+ ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<LanguageSegmenter::Iterator> itr,
+ language_segmenter->Segment(kChinese));
+ // String: "我每天走路去上班。"
+ // ^ ^ ^ ^^
+ // Bytes: 0 3 9 15 18
+ EXPECT_THAT(itr->ResetToTermStartingAfter(0), IsOkAndHolds(Eq(3)));
+ EXPECT_THAT(itr->GetTerm(), Eq("每天"));
+
+ EXPECT_THAT(itr->ResetToTermStartingAfter(7), IsOkAndHolds(Eq(9)));
+ EXPECT_THAT(itr->GetTerm(), Eq("走路"));
+
+ EXPECT_THAT(itr->ResetToTermStartingAfter(19),
+ StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
+ EXPECT_THAT(itr->GetTerm(), IsEmpty());
+}
+
+TEST_P(IcuLanguageSegmenterAllLocalesTest, JapaneseResetToTermAfter) {
+ ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter,
+ language_segmenter_factory::Create(GetOptions()));
+ // Japanese
+ constexpr std::string_view kJapanese = "私は毎日仕事に歩いています。";
+ ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<LanguageSegmenter::Iterator> itr,
+ language_segmenter->Segment(kJapanese));
+ // String: "私は毎日仕事に歩いています。"
+ // ^ ^ ^ ^ ^ ^ ^ ^ ^
+ // Bytes: 0 3 6 12 18212427 33
+ EXPECT_THAT(itr->ResetToTermStartingAfter(0), IsOkAndHolds(Eq(3)));
+ EXPECT_THAT(itr->GetTerm(), Eq("は"));
+
+ EXPECT_THAT(itr->ResetToTermStartingAfter(33),
+ StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
+ EXPECT_THAT(itr->GetTerm(), IsEmpty());
+
+ EXPECT_THAT(itr->ResetToTermStartingAfter(7), IsOkAndHolds(Eq(12)));
+ EXPECT_THAT(itr->GetTerm(), Eq("仕事"));
+}
+
+TEST_P(IcuLanguageSegmenterAllLocalesTest, KhmerResetToTermAfter) {
+ ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter,
+ language_segmenter_factory::Create(GetOptions()));
+ constexpr std::string_view kKhmer = "ញុំដើរទៅធ្វើការរាល់ថ្ងៃ។";
+ ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<LanguageSegmenter::Iterator> itr,
+ language_segmenter->Segment(kKhmer));
+ // String: "ញុំដើរទៅធ្វើការរាល់ថ្ងៃ។"
+ // ^ ^ ^ ^
+ // Bytes: 0 9 24 45
+ EXPECT_THAT(itr->ResetToTermStartingAfter(0), IsOkAndHolds(Eq(9)));
+ EXPECT_THAT(itr->GetTerm(), Eq("ដើរទៅ"));
+
+ EXPECT_THAT(itr->ResetToTermStartingAfter(47),
+ StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
+ EXPECT_THAT(itr->GetTerm(), IsEmpty());
+
+ EXPECT_THAT(itr->ResetToTermStartingAfter(14), IsOkAndHolds(Eq(24)));
+ EXPECT_THAT(itr->GetTerm(), Eq("ធ្វើការ"));
+}
+
+TEST_P(IcuLanguageSegmenterAllLocalesTest, ThaiResetToTermAfter) {
+ ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter,
+ language_segmenter_factory::Create(GetOptions()));
+ // Thai
+ constexpr std::string_view kThai = "ฉันเดินไปทำงานทุกวัน";
+ ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<LanguageSegmenter::Iterator> itr,
+ language_segmenter->Segment(kThai));
+ // String: "ฉันเดินไปทำงานทุกวัน"
+ // ^ ^ ^ ^ ^ ^
+ // Bytes: 0 9 21 27 42 51
+ EXPECT_THAT(itr->ResetToTermStartingAfter(0), IsOkAndHolds(Eq(9)));
+ EXPECT_THAT(itr->GetTerm(), Eq("เดิน"));
+
+ EXPECT_THAT(itr->ResetToTermStartingAfter(51),
+ StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
+ EXPECT_THAT(itr->GetTerm(), IsEmpty());
+
+ EXPECT_THAT(itr->ResetToTermStartingAfter(13), IsOkAndHolds(Eq(21)));
+ EXPECT_THAT(itr->GetTerm(), Eq("ไป"));
+
+ EXPECT_THAT(itr->ResetToTermStartingAfter(34), IsOkAndHolds(Eq(42)));
+ EXPECT_THAT(itr->GetTerm(), Eq("ทุก"));
+}
+
+TEST_P(IcuLanguageSegmenterAllLocalesTest, ResetToTermBeforeOutOfBounds) {
+ ICING_ASSERT_OK_AND_ASSIGN(auto segmenter,
+ language_segmenter_factory::Create(GetOptions()));
+ constexpr std::string_view kText = "How are you你好吗お元気ですか";
+ ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<LanguageSegmenter::Iterator> itr,
+ segmenter->Segment(kText));
+
+ // String: "How are you你好吗お元気ですか"
+ // ^ ^^ ^^ ^ ^ ^ ^ ^ ^
+ // Bytes: 0 3 4 7 8 11 172023 29 35
+ ASSERT_THAT(itr->ResetToTermEndingBefore(7), IsOkAndHolds(Eq(4)));
+ ASSERT_THAT(itr->GetTerm(), Eq("are"));
+
+ EXPECT_THAT(itr->ResetToTermEndingBefore(-1),
+ StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
+ EXPECT_THAT(itr->GetTerm(), Eq("are"));
+
+ EXPECT_THAT(itr->ResetToTermEndingBefore(kText.length()),
+ StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
+ EXPECT_THAT(itr->GetTerm(), Eq("are"));
+}
+
+// Tests that ResetToTermBefore and Advance produce the same output. With the
+// exception of the last term which is inacessible via ResetToTermBefore,
+// the stream of terms produced by Advance calls should exacly match the
+// terms produced by ResetToTermBefore calls with the current position
+// provided as the argument (after their order has been reversed).
+TEST_P(IcuLanguageSegmenterAllLocalesTest,
+ MixedLanguagesResetToTermBeforeEquivalentToAdvance) {
+ ICING_ASSERT_OK_AND_ASSIGN(auto segmenter,
+ language_segmenter_factory::Create(GetOptions()));
+ constexpr std::string_view kText = "How are𡔖 you你好吗お元気ですか";
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<LanguageSegmenter::Iterator> advance_itr,
+ segmenter->Segment(kText));
+ std::vector<std::string_view> advance_terms =
+ GetAllTermsAdvance(advance_itr.get());
+ // Can't produce the last term via calls to ResetToTermBefore. So skip
+ // past that one.
+ auto itr = advance_terms.begin();
+ std::advance(itr, advance_terms.size() - 1);
+ advance_terms.erase(itr);
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<LanguageSegmenter::Iterator> reset_to_term_itr,
+ segmenter->Segment(kText));
+ std::vector<std::string_view> reset_terms =
+ GetAllTermsResetBefore(reset_to_term_itr.get());
+ std::reverse(reset_terms.begin(), reset_terms.end());
+
+ EXPECT_THAT(reset_terms, testing::ElementsAreArray(advance_terms));
+ EXPECT_THAT(reset_to_term_itr->GetTerm(), IsEmpty());
+ EXPECT_THAT(advance_itr->GetTerm(), IsEmpty());
+}
+
+TEST_P(IcuLanguageSegmenterAllLocalesTest,
+ ThaiResetToTermBeforeEquivalentToAdvance) {
+ ICING_ASSERT_OK_AND_ASSIGN(auto segmenter,
+ language_segmenter_factory::Create(GetOptions()));
+ constexpr std::string_view kThai = "ฉันเดินไปทำงานทุกวัน";
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<LanguageSegmenter::Iterator> advance_itr,
+ segmenter->Segment(kThai));
+ std::vector<std::string_view> advance_terms =
+ GetAllTermsAdvance(advance_itr.get());
+ // Can't produce the last term via calls to ResetToTermBefore. So skip
+ // past that one.
+ auto itr = advance_terms.begin();
+ std::advance(itr, advance_terms.size() - 1);
+ advance_terms.erase(itr);
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<LanguageSegmenter::Iterator> reset_to_term_itr,
+ segmenter->Segment(kThai));
+ std::vector<std::string_view> reset_terms =
+ GetAllTermsResetBefore(reset_to_term_itr.get());
+ std::reverse(reset_terms.begin(), reset_terms.end());
+
+ EXPECT_THAT(reset_terms, testing::ElementsAreArray(advance_terms));
+ EXPECT_THAT(reset_to_term_itr->GetTerm(), Eq(advance_itr->GetTerm()));
+}
+
+TEST_P(IcuLanguageSegmenterAllLocalesTest,
+ KoreanResetToTermBeforeEquivalentToAdvance) {
+ ICING_ASSERT_OK_AND_ASSIGN(auto segmenter,
+ language_segmenter_factory::Create(GetOptions()));
+ constexpr std::string_view kKorean = "나는 매일 출근합니다.";
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<LanguageSegmenter::Iterator> advance_itr,
+ segmenter->Segment(kKorean));
+ std::vector<std::string_view> advance_terms =
+ GetAllTermsAdvance(advance_itr.get());
+ // Can't produce the last term via calls to ResetToTermBefore. So skip
+ // past that one.
+ auto itr = advance_terms.begin();
+ std::advance(itr, advance_terms.size() - 1);
+ advance_terms.erase(itr);
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<LanguageSegmenter::Iterator> reset_to_term_itr,
+ segmenter->Segment(kKorean));
+ std::vector<std::string_view> reset_terms =
+ GetAllTermsResetBefore(reset_to_term_itr.get());
+ std::reverse(reset_terms.begin(), reset_terms.end());
+
+ EXPECT_THAT(reset_terms, testing::ElementsAreArray(advance_terms));
+ EXPECT_THAT(reset_to_term_itr->GetTerm(), Eq(advance_itr->GetTerm()));
+}
+
+TEST_P(IcuLanguageSegmenterAllLocalesTest, MixedLanguagesResetToTermBefore) {
+ ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter,
+ language_segmenter_factory::Create(GetOptions()));
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<LanguageSegmenter::Iterator> itr,
+ language_segmenter->Segment("How are you你好吗お元気ですか"));
+
+ // String: "How are you你好吗お元気ですか"
+ // ^ ^^ ^^ ^ ^ ^ ^ ^ ^
+ // Bytes: 0 3 4 7 8 11 172023 29 35
+ EXPECT_THAT(itr->ResetToTermEndingBefore(2),
+ StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
+ EXPECT_THAT(itr->GetTerm(), IsEmpty());
+
+ EXPECT_THAT(itr->ResetToTermEndingBefore(10), IsOkAndHolds(Eq(7)));
+ EXPECT_THAT(itr->GetTerm(), Eq(" "));
+
+ EXPECT_THAT(itr->ResetToTermEndingBefore(7), IsOkAndHolds(Eq(4)));
+ EXPECT_THAT(itr->GetTerm(), Eq("are"));
+
+ EXPECT_THAT(itr->ResetToTermEndingBefore(32), IsOkAndHolds(Eq(23)));
+ EXPECT_THAT(itr->GetTerm(), Eq("元気"));
+
+ EXPECT_THAT(itr->ResetToTermEndingBefore(14), IsOkAndHolds(Eq(8)));
+ EXPECT_THAT(itr->GetTerm(), Eq("you"));
+
+ EXPECT_THAT(itr->ResetToTermEndingBefore(0),
+ StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
+ EXPECT_THAT(itr->GetTerm(), IsEmpty());
+
+ EXPECT_THAT(itr->ResetToTermEndingBefore(35), IsOkAndHolds(Eq(29)));
+ EXPECT_THAT(itr->GetTerm(), Eq("です"));
+}
+
+TEST_P(IcuLanguageSegmenterAllLocalesTest,
+ ContinuousWhitespacesResetToTermBefore) {
+ ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter,
+ language_segmenter_factory::Create(GetOptions()));
+ // Multiple continuous whitespaces are treated as one.
+ constexpr std::string_view kTextWithSpace = "Hello World";
+ ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<LanguageSegmenter::Iterator> itr,
+ language_segmenter->Segment(kTextWithSpace));
+
+ // String: "Hello World"
+ // ^ ^ ^
+ // Bytes: 0 5 15
+ EXPECT_THAT(itr->ResetToTermEndingBefore(0),
+ StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
+ EXPECT_THAT(itr->GetTerm(), IsEmpty());
+
+ EXPECT_THAT(itr->ResetToTermEndingBefore(2),
+ StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
+ EXPECT_THAT(itr->GetTerm(), IsEmpty());
+
+ EXPECT_THAT(itr->ResetToTermEndingBefore(10), IsOkAndHolds(Eq(0)));
+ EXPECT_THAT(itr->GetTerm(), Eq("Hello"));
+
+ EXPECT_THAT(itr->ResetToTermEndingBefore(5), IsOkAndHolds(Eq(0)));
+ EXPECT_THAT(itr->GetTerm(), Eq("Hello"));
+
+ EXPECT_THAT(itr->ResetToTermEndingBefore(15), IsOkAndHolds(Eq(5)));
+ EXPECT_THAT(itr->GetTerm(), Eq(" "));
+
+ EXPECT_THAT(itr->ResetToTermEndingBefore(17), IsOkAndHolds(Eq(5)));
+ EXPECT_THAT(itr->GetTerm(), Eq(" "));
+
+ EXPECT_THAT(itr->ResetToTermEndingBefore(19), IsOkAndHolds(Eq(5)));
+ EXPECT_THAT(itr->GetTerm(), Eq(" "));
+}
+
+TEST_P(IcuLanguageSegmenterAllLocalesTest, ChineseResetToTermBefore) {
+ ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter,
+ language_segmenter_factory::Create(GetOptions()));
+ // CJKT (Chinese, Japanese, Khmer, Thai) are the 4 main languages that
+ // don't have whitespaces as word delimiter. Chinese
+ constexpr std::string_view kChinese = "我每天走路去上班。";
+ ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<LanguageSegmenter::Iterator> itr,
+ language_segmenter->Segment(kChinese));
+ // String: "我每天走路去上班。"
+ // ^ ^ ^ ^^
+ // Bytes: 0 3 9 15 18
+ EXPECT_THAT(itr->ResetToTermEndingBefore(0),
+ StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
+ EXPECT_THAT(itr->GetTerm(), IsEmpty());
+
+ EXPECT_THAT(itr->ResetToTermEndingBefore(7), IsOkAndHolds(Eq(0)));
+ EXPECT_THAT(itr->GetTerm(), Eq("我"));
+
+ EXPECT_THAT(itr->ResetToTermEndingBefore(19), IsOkAndHolds(Eq(15)));
+ EXPECT_THAT(itr->GetTerm(), Eq("去"));
+}
+
+TEST_P(IcuLanguageSegmenterAllLocalesTest, JapaneseResetToTermBefore) {
+ ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter,
+ language_segmenter_factory::Create(GetOptions()));
+ // Japanese
+ constexpr std::string_view kJapanese = "私は毎日仕事に歩いています。";
+ ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<LanguageSegmenter::Iterator> itr,
+ language_segmenter->Segment(kJapanese));
+ // String: "私は毎日仕事に歩いています。"
+ // ^ ^ ^ ^ ^ ^ ^ ^ ^
+ // Bytes: 0 3 6 12 18212427 33
+ EXPECT_THAT(itr->ResetToTermEndingBefore(0),
+ StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
+ EXPECT_THAT(itr->GetTerm(), IsEmpty());
+
+ EXPECT_THAT(itr->ResetToTermEndingBefore(33), IsOkAndHolds(Eq(27)));
+ EXPECT_THAT(itr->GetTerm(), Eq("てい"));
+
+ EXPECT_THAT(itr->ResetToTermEndingBefore(7), IsOkAndHolds(Eq(3)));
+ EXPECT_THAT(itr->GetTerm(), Eq("は"));
+}
+
+TEST_P(IcuLanguageSegmenterAllLocalesTest, KhmerResetToTermBefore) {
+ ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter,
+ language_segmenter_factory::Create(GetOptions()));
+ constexpr std::string_view kKhmer = "ញុំដើរទៅធ្វើការរាល់ថ្ងៃ។";
+ ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<LanguageSegmenter::Iterator> itr,
+ language_segmenter->Segment(kKhmer));
+ // String: "ញុំដើរទៅធ្វើការរាល់ថ្ងៃ។"
+ // ^ ^ ^ ^
+ // Bytes: 0 9 24 45
+ EXPECT_THAT(itr->ResetToTermEndingBefore(0),
+ StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
+ EXPECT_THAT(itr->GetTerm(), IsEmpty());
+
+ EXPECT_THAT(itr->ResetToTermEndingBefore(47), IsOkAndHolds(Eq(24)));
+ EXPECT_THAT(itr->GetTerm(), Eq("ធ្វើការ"));
+
+ EXPECT_THAT(itr->ResetToTermEndingBefore(14), IsOkAndHolds(Eq(0)));
+ EXPECT_THAT(itr->GetTerm(), Eq("ញុំ"));
+}
+
+TEST_P(IcuLanguageSegmenterAllLocalesTest, ThaiResetToTermBefore) {
+ ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter,
+ language_segmenter_factory::Create(GetOptions()));
+ // Thai
+ constexpr std::string_view kThai = "ฉันเดินไปทำงานทุกวัน";
+ ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<LanguageSegmenter::Iterator> itr,
+ language_segmenter->Segment(kThai));
+ // String: "ฉันเดินไปทำงานทุกวัน"
+ // ^ ^ ^ ^ ^ ^
+ // Bytes: 0 9 21 27 42 51
+ EXPECT_THAT(itr->ResetToTermEndingBefore(0),
+ StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
+ EXPECT_THAT(itr->GetTerm(), IsEmpty());
+
+ EXPECT_THAT(itr->ResetToTermEndingBefore(51), IsOkAndHolds(Eq(42)));
+ EXPECT_THAT(itr->GetTerm(), Eq("ทุก"));
+
+ EXPECT_THAT(itr->ResetToTermEndingBefore(13), IsOkAndHolds(Eq(0)));
+ EXPECT_THAT(itr->GetTerm(), Eq("ฉัน"));
+
+ EXPECT_THAT(itr->ResetToTermEndingBefore(34), IsOkAndHolds(Eq(21)));
+ EXPECT_THAT(itr->GetTerm(), Eq("ไป"));
+}
+
+INSTANTIATE_TEST_SUITE_P(
+ LocaleName, IcuLanguageSegmenterAllLocalesTest,
+ testing::Values(ULOC_US, ULOC_UK, ULOC_CANADA, ULOC_CANADA_FRENCH,
+ ULOC_FRANCE, ULOC_GERMANY, ULOC_ITALY, ULOC_JAPAN,
+ ULOC_KOREA, ULOC_SIMPLIFIED_CHINESE,
+ ULOC_TRADITIONAL_CHINESE,
+ "es_ES", // Spanish
+ "hi_IN", // Hindi
+ "th_TH", // Thai
+ "lo_LA", // Lao
+ "km_KH", // Khmer
+ "ar_DZ", // Arabic
+ "ru_RU", // Russian
+ "pt_PT", // Portuguese
+ "en_US_POSIX" // American English (Computer)
+ "wrong_locale" // Will fall back to ICU default locale
+ "" // Will fall back to ICU default locale
+ ));
+
+} // namespace
+} // namespace lib
+} // namespace icing
diff --git a/icing/tokenization/language-segmenter-factory.h b/icing/tokenization/language-segmenter-factory.h
index 244bcd8..5a4047b 100644
--- a/icing/tokenization/language-segmenter-factory.h
+++ b/icing/tokenization/language-segmenter-factory.h
@@ -18,19 +18,24 @@
#include <memory>
#include <string_view>
+#include "icing/jni/jni-cache.h"
#include "icing/text_classifier/lib3/utils/base/statusor.h"
#include "icing/tokenization/language-segmenter.h"
-#include "icing/util/icu-i18n-utils.h"
+#include "icing/util/i18n-utils.h"
+#include "unicode/uloc.h"
namespace icing {
namespace lib {
namespace language_segmenter_factory {
-enum SegmenterType {
- ICU4C, // Uses the ICU library to segment text.
- SPACE, // Segments only on whitespace. Currently not used in production; used
- // to compile in Jetpack
+struct SegmenterOptions {
+ explicit SegmenterOptions(std::string locale = ULOC_US,
+ const JniCache* jni_cache = nullptr)
+ : locale(std::move(locale)), jni_cache(jni_cache) {}
+
+ std::string locale;
+ const JniCache* jni_cache;
};
// Creates a language segmenter with the given locale.
@@ -39,7 +44,7 @@ enum SegmenterType {
// A LanguageSegmenter on success
// INVALID_ARGUMENT if locale string is invalid
libtextclassifier3::StatusOr<std::unique_ptr<LanguageSegmenter>> Create(
- SegmenterType type, std::string locale = ULOC_US);
+ SegmenterOptions options = SegmenterOptions());
} // namespace language_segmenter_factory
diff --git a/icing/tokenization/language-segmenter-iterator_test.cc b/icing/tokenization/language-segmenter-iterator_test.cc
index 6af44e1..c7b068d 100644
--- a/icing/tokenization/language-segmenter-iterator_test.cc
+++ b/icing/tokenization/language-segmenter-iterator_test.cc
@@ -15,7 +15,7 @@
#include "gmock/gmock.h"
#include "gtest/gtest.h"
#include "icing/absl_ports/str_cat.h"
-#include "icing/icu-data-file-helper.h"
+#include "icing/helpers/icu/icu-data-file-helper.h"
#include "icing/testing/common-matchers.h"
#include "icing/testing/test-data.h"
#include "icing/tokenization/language-segmenter-factory.h"
@@ -32,8 +32,7 @@ using ::testing::Eq;
// don't need to stress test the implementation's definition of a term. These
// test that it advances and traverses through simple terms consistently between
// all the implementations.
-class LanguageSegmenterIteratorTest
- : public testing::TestWithParam<language_segmenter_factory::SegmenterType> {
+class LanguageSegmenterIteratorTest : public testing::Test {
protected:
void SetUp() override {
ICING_ASSERT_OK(
@@ -41,15 +40,11 @@ class LanguageSegmenterIteratorTest
icu_data_file_helper::SetUpICUDataFile(
GetTestFilePath("icing/icu.dat")));
}
-
- static language_segmenter_factory::SegmenterType GetType() {
- return GetParam();
- }
};
-TEST_P(LanguageSegmenterIteratorTest, AdvanceAndGetTerm) {
+TEST_F(LanguageSegmenterIteratorTest, AdvanceAndGetTerm) {
ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter,
- language_segmenter_factory::Create(GetType()));
+ language_segmenter_factory::Create());
ICING_ASSERT_OK_AND_ASSIGN(auto iterator,
language_segmenter->Segment("foo bar"));
@@ -65,10 +60,10 @@ TEST_P(LanguageSegmenterIteratorTest, AdvanceAndGetTerm) {
EXPECT_FALSE(iterator->Advance());
}
-TEST_P(LanguageSegmenterIteratorTest,
+TEST_F(LanguageSegmenterIteratorTest,
ResetToTermStartingAfterWithOffsetInText) {
ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter,
- language_segmenter_factory::Create(GetType()));
+ language_segmenter_factory::Create());
ICING_ASSERT_OK_AND_ASSIGN(auto iterator,
language_segmenter->Segment("foo bar"));
@@ -80,45 +75,48 @@ TEST_P(LanguageSegmenterIteratorTest,
StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
}
-TEST_P(LanguageSegmenterIteratorTest,
- ResetToTermStartingAfterWithNegativeOffsetOk) {
+TEST_F(LanguageSegmenterIteratorTest,
+ ResetToTermStartingAfterWithNegativeOffsetNotOk) {
ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter,
- language_segmenter_factory::Create(GetType()));
+ language_segmenter_factory::Create());
ICING_ASSERT_OK_AND_ASSIGN(auto iterator,
language_segmenter->Segment("foo bar"));
EXPECT_THAT(iterator->ResetToTermStartingAfter(/*offset=*/-1),
- IsOkAndHolds(0)); // The term "foo"
+ StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
EXPECT_THAT(iterator->ResetToTermStartingAfter(/*offset=*/-100),
- IsOkAndHolds(0)); // The term "foo"
+ StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
+
+ EXPECT_THAT(iterator->ResetToStart(), IsOkAndHolds(0));
+ EXPECT_THAT(iterator->GetTerm(), Eq("foo"));
}
-TEST_P(LanguageSegmenterIteratorTest,
- ResetToTermStartingAfterWithTextLengthOffsetNotFound) {
+TEST_F(LanguageSegmenterIteratorTest,
+ ResetToTermStartingAfterWithTextLengthOffsetInvalidArgument) {
std::string text = "foo bar";
ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter,
- language_segmenter_factory::Create(GetType()));
+ language_segmenter_factory::Create());
ICING_ASSERT_OK_AND_ASSIGN(auto iterator, language_segmenter->Segment(text));
EXPECT_THAT(iterator->ResetToTermStartingAfter(/*offset=*/text.size()),
- StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
+ StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
}
-TEST_P(LanguageSegmenterIteratorTest,
- ResetToTermStartingAfterWithOffsetPastTextLengthNotFound) {
+TEST_F(LanguageSegmenterIteratorTest,
+ ResetToTermStartingAfterWithOffsetPastTextLengthInvalidArgument) {
std::string text = "foo bar";
ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter,
- language_segmenter_factory::Create(GetType()));
+ language_segmenter_factory::Create());
ICING_ASSERT_OK_AND_ASSIGN(auto iterator, language_segmenter->Segment(text));
EXPECT_THAT(iterator->ResetToTermStartingAfter(/*offset=*/100),
- StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
+ StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
}
-TEST_P(LanguageSegmenterIteratorTest, ResetToTermEndingBeforeWithOffsetInText) {
+TEST_F(LanguageSegmenterIteratorTest, ResetToTermEndingBeforeWithOffsetInText) {
ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter,
- language_segmenter_factory::Create(GetType()));
+ language_segmenter_factory::Create());
ICING_ASSERT_OK_AND_ASSIGN(auto iterator,
language_segmenter->Segment("foo bar"));
@@ -130,50 +128,46 @@ TEST_P(LanguageSegmenterIteratorTest, ResetToTermEndingBeforeWithOffsetInText) {
StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
}
-TEST_P(LanguageSegmenterIteratorTest,
- ResetToTermEndingBeforeWithZeroOrNegativeOffsetNotFound) {
+TEST_F(LanguageSegmenterIteratorTest,
+ ResetToTermEndingBeforeWithZeroNotFound) {
ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter,
- language_segmenter_factory::Create(GetType()));
+ language_segmenter_factory::Create());
ICING_ASSERT_OK_AND_ASSIGN(auto iterator,
language_segmenter->Segment("foo bar"));
+ // Zero is a valid argument, but there aren't any terms that end before it.
EXPECT_THAT(iterator->ResetToTermEndingBefore(/*offset=*/0),
StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
+}
+
+TEST_F(LanguageSegmenterIteratorTest,
+ ResetToTermEndingBeforeWithNegativeOffsetInvalidArgument) {
+ ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter,
+ language_segmenter_factory::Create());
+ ICING_ASSERT_OK_AND_ASSIGN(auto iterator,
+ language_segmenter->Segment("foo bar"));
EXPECT_THAT(iterator->ResetToTermEndingBefore(/*offset=*/-1),
- StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
+ StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
EXPECT_THAT(iterator->ResetToTermEndingBefore(/*offset=*/-100),
- StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
+ StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
}
-TEST_P(LanguageSegmenterIteratorTest,
- ResetToTermEndingBeforeWithTextLengthOffsetOk) {
+TEST_F(LanguageSegmenterIteratorTest,
+ ResetToTermEndingBeforeWithOffsetPastTextEndInvalidArgument) {
std::string text = "foo bar";
ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter,
- language_segmenter_factory::Create(GetType()));
+ language_segmenter_factory::Create());
ICING_ASSERT_OK_AND_ASSIGN(auto iterator, language_segmenter->Segment(text));
EXPECT_THAT(iterator->ResetToTermEndingBefore(/*offset=*/text.length()),
- IsOkAndHolds(4)); // The term "bar"
-}
-
-TEST_P(LanguageSegmenterIteratorTest,
- ResetToTermEndingBeforeWithOffsetPastTextLengthNotFound) {
- std::string text = "foo bar";
- ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter,
- language_segmenter_factory::Create(GetType()));
- ICING_ASSERT_OK_AND_ASSIGN(auto iterator, language_segmenter->Segment(text));
+ StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
EXPECT_THAT(iterator->ResetToTermEndingBefore(/*offset=*/text.length() + 1),
- StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
+ StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
}
-INSTANTIATE_TEST_SUITE_P(
- SegmenterType, LanguageSegmenterIteratorTest,
- testing::Values(language_segmenter_factory::SegmenterType::ICU4C,
- language_segmenter_factory::SegmenterType::SPACE));
-
} // namespace
} // namespace lib
} // namespace icing
diff --git a/icing/tokenization/language-segmenter.h b/icing/tokenization/language-segmenter.h
index fde9ae2..fdb1846 100644
--- a/icing/tokenization/language-segmenter.h
+++ b/icing/tokenization/language-segmenter.h
@@ -64,16 +64,18 @@ class LanguageSegmenter {
// iterator.ResetToTermStartingAfter(4);
// iterator.GetTerm() // returns "baz";
//
- // Passing in a negative offset will return the offset of the first term.
- //
- // Passing in an offset that is equal to or exceeds the underlying text
- // length will return NOT_FOUND.
+ // Return types of OK and NOT_FOUND indicate that the function call was
+ // valid and the state of the iterator has changed. Return type of
+ // INVALID_ARGUMENT will leave the iterator unchanged.
//
// Returns:
// On success, the starting position of the first term that starts after
// offset.
// NOT_FOUND if an error occurred or there are no terms that start after
// offset.
+ // INVALID_ARGUMENT if offset is out of bounds for the provided text.
+ // ABORTED if an invalid unicode character is encountered while
+ // traversing the text.
virtual libtextclassifier3::StatusOr<int32_t> ResetToTermStartingAfter(
int32_t offset) = 0;
@@ -85,21 +87,22 @@ class LanguageSegmenter {
// iterator.ResetToTermEndingBefore(7);
// iterator.GetTerm() // returns "bar";
//
- // Passing in an offset equal to or less than 0 will return NOT_FOUND.
- //
- // Passing in an offset equal to the underlying text length will return the
- // offset of the last term.
- //
- // Passing in an offset that is greater than the underlying text length will
- // return NOT_FOUND.
+ // Return types of OK and NOT_FOUND indicate that the function call was
+ // valid and the state of the iterator has changed. Return type of
+ // INVALID_ARGUMENT will leave the iterator unchanged.
//
// Returns:
// On success, the starting position of the first term that ends before
// offset.
// NOT_FOUND if an error occurred or there are no terms that ends before
// offset.
+ // INVALID_ARGUMENT if offset is out of bounds for the provided text.
+ // ABORTED if an invalid unicode character is encountered while
+ // traversing the text.
virtual libtextclassifier3::StatusOr<int32_t> ResetToTermEndingBefore(
int32_t offset) = 0;
+
+ virtual libtextclassifier3::StatusOr<int32_t> ResetToStart() = 0;
};
// Segments the input text into terms.
diff --git a/icing/tokenization/language-segmenter_benchmark.cc b/icing/tokenization/language-segmenter_benchmark.cc
index 01cc938..49ddfca 100644
--- a/icing/tokenization/language-segmenter_benchmark.cc
+++ b/icing/tokenization/language-segmenter_benchmark.cc
@@ -14,7 +14,7 @@
#include "testing/base/public/benchmark.h"
#include "gmock/gmock.h"
-#include "icing/icu-data-file-helper.h"
+#include "icing/helpers/icu/icu-data-file-helper.h"
#include "icing/testing/common-matchers.h"
#include "icing/testing/test-data.h"
#include "icing/tokenization/language-segmenter-factory.h"
@@ -60,8 +60,7 @@ void BM_SegmentNoSpace(benchmark::State& state) {
}
std::unique_ptr<LanguageSegmenter> language_segmenter =
- language_segmenter_factory::Create(language_segmenter_factory::ICU4C)
- .ValueOrDie();
+ language_segmenter_factory::Create().ValueOrDie();
std::string input_string(state.range(0), 'A');
@@ -97,8 +96,7 @@ void BM_SegmentWithSpaces(benchmark::State& state) {
}
std::unique_ptr<LanguageSegmenter> language_segmenter =
- language_segmenter_factory::Create(language_segmenter_factory::ICU4C)
- .ValueOrDie();
+ language_segmenter_factory::Create().ValueOrDie();
std::string input_string(state.range(0), 'A');
for (int i = 1; i < input_string.length(); i += 2) {
@@ -137,8 +135,7 @@ void BM_SegmentCJK(benchmark::State& state) {
}
std::unique_ptr<LanguageSegmenter> language_segmenter =
- language_segmenter_factory::Create(language_segmenter_factory::ICU4C)
- .ValueOrDie();
+ language_segmenter_factory::Create().ValueOrDie();
std::string input_string;
while (input_string.length() < state.range(0)) {
diff --git a/icing/tokenization/plain-tokenizer.cc b/icing/tokenization/plain-tokenizer.cc
index 556a095..6e54af9 100644
--- a/icing/tokenization/plain-tokenizer.cc
+++ b/icing/tokenization/plain-tokenizer.cc
@@ -18,7 +18,7 @@
#include "icing/text_classifier/lib3/utils/base/statusor.h"
#include "icing/tokenization/language-segmenter.h"
-#include "icing/util/icu-i18n-utils.h"
+#include "icing/util/i18n-utils.h"
#include "icing/util/status-macros.h"
namespace icing {
@@ -39,8 +39,8 @@ bool IsValidTerm(std::string_view term) {
}
// Gets the first unicode character. We can know what the whole term is by
// checking only the first character.
- return !icu_i18n_utils::IsWhitespaceAt(term, /*position=*/0) &&
- !icu_i18n_utils::IsPunctuationAt(term, /*position=*/0);
+ return !i18n_utils::IsWhitespaceAt(term, /*position=*/0) &&
+ !i18n_utils::IsPunctuationAt(term, /*position=*/0);
}
} // namespace
@@ -96,6 +96,18 @@ class PlainTokenIterator : public Tokenizer::Iterator {
return true;
}
+ bool ResetToStart() override {
+ if (!base_iterator_->ResetToStart().ok()) {
+ return false;
+ }
+ current_term_ = base_iterator_->GetTerm();
+ if (!IsValidTerm(current_term_)) {
+ // If the current value isn't valid, advance to the next valid value.
+ return Advance();
+ }
+ return true;
+ }
+
private:
std::unique_ptr<LanguageSegmenter::Iterator> base_iterator_;
std::string_view current_term_;
diff --git a/icing/tokenization/plain-tokenizer_test.cc b/icing/tokenization/plain-tokenizer_test.cc
index e7d6e29..f2fc678 100644
--- a/icing/tokenization/plain-tokenizer_test.cc
+++ b/icing/tokenization/plain-tokenizer_test.cc
@@ -18,7 +18,7 @@
#include "gmock/gmock.h"
#include "icing/absl_ports/str_cat.h"
-#include "icing/icu-data-file-helper.h"
+#include "icing/helpers/icu/icu-data-file-helper.h"
#include "icing/testing/common-matchers.h"
#include "icing/testing/icu-i18n-test-utils.h"
#include "icing/testing/test-data.h"
@@ -49,9 +49,8 @@ TEST_F(PlainTokenizerTest, CreationWithNullPointerShouldFail) {
}
TEST_F(PlainTokenizerTest, Simple) {
- ICING_ASSERT_OK_AND_ASSIGN(
- auto language_segmenter,
- language_segmenter_factory::Create(language_segmenter_factory::ICU4C));
+ ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter,
+ language_segmenter_factory::Create());
ICING_ASSERT_OK_AND_ASSIGN(
std::unique_ptr<Tokenizer> plain_tokenizer,
tokenizer_factory::CreateIndexingTokenizer(
@@ -82,9 +81,8 @@ TEST_F(PlainTokenizerTest, Simple) {
}
TEST_F(PlainTokenizerTest, Whitespace) {
- ICING_ASSERT_OK_AND_ASSIGN(
- auto language_segmenter,
- language_segmenter_factory::Create(language_segmenter_factory::ICU4C));
+ ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter,
+ language_segmenter_factory::Create());
ICING_ASSERT_OK_AND_ASSIGN(
std::unique_ptr<Tokenizer> plain_tokenizer,
tokenizer_factory::CreateIndexingTokenizer(
@@ -109,9 +107,8 @@ TEST_F(PlainTokenizerTest, Whitespace) {
}
TEST_F(PlainTokenizerTest, Punctuation) {
- ICING_ASSERT_OK_AND_ASSIGN(
- auto language_segmenter,
- language_segmenter_factory::Create(language_segmenter_factory::ICU4C));
+ ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter,
+ language_segmenter_factory::Create());
ICING_ASSERT_OK_AND_ASSIGN(
std::unique_ptr<Tokenizer> plain_tokenizer,
tokenizer_factory::CreateIndexingTokenizer(
@@ -139,9 +136,8 @@ TEST_F(PlainTokenizerTest, Punctuation) {
}
TEST_F(PlainTokenizerTest, SpecialCharacters) {
- ICING_ASSERT_OK_AND_ASSIGN(
- auto language_segmenter,
- language_segmenter_factory::Create(language_segmenter_factory::ICU4C));
+ ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter,
+ language_segmenter_factory::Create());
ICING_ASSERT_OK_AND_ASSIGN(
std::unique_ptr<Tokenizer> plain_tokenizer,
tokenizer_factory::CreateIndexingTokenizer(
@@ -161,9 +157,8 @@ TEST_F(PlainTokenizerTest, SpecialCharacters) {
}
TEST_F(PlainTokenizerTest, CJKT) {
- ICING_ASSERT_OK_AND_ASSIGN(
- auto language_segmenter,
- language_segmenter_factory::Create(language_segmenter_factory::ICU4C));
+ ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter,
+ language_segmenter_factory::Create());
ICING_ASSERT_OK_AND_ASSIGN(
std::unique_ptr<Tokenizer> plain_tokenizer,
tokenizer_factory::CreateIndexingTokenizer(
@@ -214,9 +209,8 @@ TEST_F(PlainTokenizerTest, CJKT) {
}
TEST_F(PlainTokenizerTest, ResetToTokenAfterSimple) {
- ICING_ASSERT_OK_AND_ASSIGN(
- auto language_segmenter,
- language_segmenter_factory::Create(language_segmenter_factory::ICU4C));
+ ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter,
+ language_segmenter_factory::Create());
ICING_ASSERT_OK_AND_ASSIGN(
std::unique_ptr<Tokenizer> plain_tokenizer,
tokenizer_factory::CreateIndexingTokenizer(
@@ -232,9 +226,8 @@ TEST_F(PlainTokenizerTest, ResetToTokenAfterSimple) {
}
TEST_F(PlainTokenizerTest, ResetToTokenBeforeSimple) {
- ICING_ASSERT_OK_AND_ASSIGN(
- auto language_segmenter,
- language_segmenter_factory::Create(language_segmenter_factory::ICU4C));
+ ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter,
+ language_segmenter_factory::Create());
ICING_ASSERT_OK_AND_ASSIGN(
std::unique_ptr<Tokenizer> plain_tokenizer,
tokenizer_factory::CreateIndexingTokenizer(
@@ -250,9 +243,8 @@ TEST_F(PlainTokenizerTest, ResetToTokenBeforeSimple) {
}
TEST_F(PlainTokenizerTest, ResetToTokenAfter) {
- ICING_ASSERT_OK_AND_ASSIGN(
- auto language_segmenter,
- language_segmenter_factory::Create(language_segmenter_factory::ICU4C));
+ ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter,
+ language_segmenter_factory::Create());
ICING_ASSERT_OK_AND_ASSIGN(
std::unique_ptr<Tokenizer> plain_tokenizer,
tokenizer_factory::CreateIndexingTokenizer(
@@ -299,9 +291,8 @@ TEST_F(PlainTokenizerTest, ResetToTokenAfter) {
}
TEST_F(PlainTokenizerTest, ResetToTokenBefore) {
- ICING_ASSERT_OK_AND_ASSIGN(
- auto language_segmenter,
- language_segmenter_factory::Create(language_segmenter_factory::ICU4C));
+ ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter,
+ language_segmenter_factory::Create());
ICING_ASSERT_OK_AND_ASSIGN(
std::unique_ptr<Tokenizer> plain_tokenizer,
tokenizer_factory::CreateIndexingTokenizer(
diff --git a/icing/tokenization/raw-query-tokenizer.cc b/icing/tokenization/raw-query-tokenizer.cc
index 6819f8d..8b2edc9 100644
--- a/icing/tokenization/raw-query-tokenizer.cc
+++ b/icing/tokenization/raw-query-tokenizer.cc
@@ -29,7 +29,7 @@
#include "icing/tokenization/language-segmenter.h"
#include "icing/tokenization/token.h"
#include "icing/tokenization/tokenizer.h"
-#include "icing/util/icu-i18n-utils.h"
+#include "icing/util/i18n-utils.h"
#include "icing/util/status-macros.h"
// This file provides rules that tell the tokenizer what to do when it sees a
@@ -316,7 +316,7 @@ TermType GetTermType(std::string_view term) {
return OR_OPERATOR;
}
// Checks the first char to see if it's an ASCII term
- if (icu_i18n_utils::IsAscii(term[0])) {
+ if (i18n_utils::IsAscii(term[0])) {
if (std::isalnum(term[0])) {
return ALPHANUMERIC_TERM;
}
@@ -381,7 +381,7 @@ libtextclassifier3::Status OutputToken(State new_state,
case ALPHANUMERIC_TERM:
if (new_state == PROCESSING_PROPERTY_TERM) {
// Asserts extra rule 1: property name must be in ASCII
- if (!icu_i18n_utils::IsAscii(current_term[0])) {
+ if (!i18n_utils::IsAscii(current_term[0])) {
return absl_ports::InvalidArgumentError(
"Characters in property name must all be ASCII.");
}
diff --git a/icing/tokenization/raw-query-tokenizer_test.cc b/icing/tokenization/raw-query-tokenizer_test.cc
index dfcc09a..351f7c1 100644
--- a/icing/tokenization/raw-query-tokenizer_test.cc
+++ b/icing/tokenization/raw-query-tokenizer_test.cc
@@ -16,7 +16,7 @@
#include "gmock/gmock.h"
#include "gtest/gtest.h"
-#include "icing/icu-data-file-helper.h"
+#include "icing/helpers/icu/icu-data-file-helper.h"
#include "icing/testing/common-matchers.h"
#include "icing/testing/test-data.h"
#include "icing/tokenization/language-segmenter-factory.h"
@@ -46,9 +46,8 @@ TEST_F(RawQueryTokenizerTest, CreationWithNullPointerShouldFail) {
}
TEST_F(RawQueryTokenizerTest, Simple) {
- ICING_ASSERT_OK_AND_ASSIGN(
- auto language_segmenter,
- language_segmenter_factory::Create(language_segmenter_factory::ICU4C));
+ ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter,
+ language_segmenter_factory::Create());
ICING_ASSERT_OK_AND_ASSIGN(
std::unique_ptr<Tokenizer> raw_query_tokenizer,
tokenizer_factory::CreateQueryTokenizer(tokenizer_factory::RAW_QUERY,
@@ -60,9 +59,8 @@ TEST_F(RawQueryTokenizerTest, Simple) {
}
TEST_F(RawQueryTokenizerTest, Parentheses) {
- ICING_ASSERT_OK_AND_ASSIGN(
- auto language_segmenter,
- language_segmenter_factory::Create(language_segmenter_factory::ICU4C));
+ ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter,
+ language_segmenter_factory::Create());
ICING_ASSERT_OK_AND_ASSIGN(
std::unique_ptr<Tokenizer> raw_query_tokenizer,
tokenizer_factory::CreateQueryTokenizer(tokenizer_factory::RAW_QUERY,
@@ -161,9 +159,8 @@ TEST_F(RawQueryTokenizerTest, Parentheses) {
}
TEST_F(RawQueryTokenizerTest, Exclustion) {
- ICING_ASSERT_OK_AND_ASSIGN(
- auto language_segmenter,
- language_segmenter_factory::Create(language_segmenter_factory::ICU4C));
+ ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter,
+ language_segmenter_factory::Create());
ICING_ASSERT_OK_AND_ASSIGN(
std::unique_ptr<Tokenizer> raw_query_tokenizer,
tokenizer_factory::CreateQueryTokenizer(tokenizer_factory::RAW_QUERY,
@@ -229,9 +226,8 @@ TEST_F(RawQueryTokenizerTest, Exclustion) {
}
TEST_F(RawQueryTokenizerTest, PropertyRestriction) {
- ICING_ASSERT_OK_AND_ASSIGN(
- auto language_segmenter,
- language_segmenter_factory::Create(language_segmenter_factory::ICU4C));
+ ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter,
+ language_segmenter_factory::Create());
ICING_ASSERT_OK_AND_ASSIGN(
std::unique_ptr<Tokenizer> raw_query_tokenizer,
tokenizer_factory::CreateQueryTokenizer(tokenizer_factory::RAW_QUERY,
@@ -318,9 +314,8 @@ TEST_F(RawQueryTokenizerTest, PropertyRestriction) {
}
TEST_F(RawQueryTokenizerTest, OR) {
- ICING_ASSERT_OK_AND_ASSIGN(
- auto language_segmenter,
- language_segmenter_factory::Create(language_segmenter_factory::ICU4C));
+ ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter,
+ language_segmenter_factory::Create());
ICING_ASSERT_OK_AND_ASSIGN(
std::unique_ptr<Tokenizer> raw_query_tokenizer,
tokenizer_factory::CreateQueryTokenizer(tokenizer_factory::RAW_QUERY,
@@ -440,9 +435,8 @@ TEST_F(RawQueryTokenizerTest, OR) {
// CJKT are treated the same way by language segmenter and raw tokenizer, so
// here we test Chinese and Japanese to represent CJKT.
TEST_F(RawQueryTokenizerTest, CJKT) {
- ICING_ASSERT_OK_AND_ASSIGN(
- auto language_segmenter,
- language_segmenter_factory::Create(language_segmenter_factory::ICU4C));
+ ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter,
+ language_segmenter_factory::Create());
ICING_ASSERT_OK_AND_ASSIGN(
std::unique_ptr<Tokenizer> raw_query_tokenizer,
tokenizer_factory::CreateQueryTokenizer(tokenizer_factory::RAW_QUERY,
@@ -494,9 +488,8 @@ TEST_F(RawQueryTokenizerTest, CJKT) {
// Raw tokenizer identifies all characters that it doesn't know as OTHER type,
// so we can choose comma "," to represent all OTHER characters.
TEST_F(RawQueryTokenizerTest, OtherChars) {
- ICING_ASSERT_OK_AND_ASSIGN(
- auto language_segmenter,
- language_segmenter_factory::Create(language_segmenter_factory::ICU4C));
+ ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter,
+ language_segmenter_factory::Create());
ICING_ASSERT_OK_AND_ASSIGN(
std::unique_ptr<Tokenizer> raw_query_tokenizer,
tokenizer_factory::CreateQueryTokenizer(tokenizer_factory::RAW_QUERY,
@@ -540,9 +533,8 @@ TEST_F(RawQueryTokenizerTest, OtherChars) {
}
TEST_F(RawQueryTokenizerTest, Mix) {
- ICING_ASSERT_OK_AND_ASSIGN(
- auto language_segmenter,
- language_segmenter_factory::Create(language_segmenter_factory::ICU4C));
+ ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter,
+ language_segmenter_factory::Create());
ICING_ASSERT_OK_AND_ASSIGN(
std::unique_ptr<Tokenizer> raw_query_tokenizer,
tokenizer_factory::CreateQueryTokenizer(tokenizer_factory::RAW_QUERY,
diff --git a/icing/tokenization/reverse_jni/reverse-jni-language-segmenter-factory.cc b/icing/tokenization/reverse_jni/reverse-jni-language-segmenter-factory.cc
new file mode 100644
index 0000000..f79bc68
--- /dev/null
+++ b/icing/tokenization/reverse_jni/reverse-jni-language-segmenter-factory.cc
@@ -0,0 +1,62 @@
+// Copyright (C) 2019 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "icing/absl_ports/canonical_errors.h"
+#include "icing/tokenization/language-segmenter-factory.h"
+#include "icing/tokenization/reverse_jni/reverse-jni-language-segmenter.h"
+#include "icing/util/logging.h"
+
+namespace icing {
+namespace lib {
+
+namespace language_segmenter_factory {
+
+namespace {
+constexpr std::string_view kLocaleAmericanEnglishComputer = "en_US_POSIX";
+} // namespace
+
+// Creates a language segmenter with the given locale.
+//
+// Returns:
+// A LanguageSegmenter on success
+// INVALID_ARGUMENT if locale string is invalid
+//
+// TODO(samzheng): Figure out if we want to verify locale strings and notify
+// users. Right now illegal locale strings will be ignored by ICU. ICU
+// components will be created with its default locale.
+libtextclassifier3::StatusOr<std::unique_ptr<LanguageSegmenter>> Create(
+ SegmenterOptions options) {
+ if (options.jni_cache == nullptr) {
+ return absl_ports::InvalidArgumentError(
+ "Cannot create Reverse Jni Language Segmenter without a valid JniCache "
+ "pointer");
+ }
+ // Word connector rules for "en_US_POSIX" (American English (Computer)) are
+ // different from other locales. E.g. "email.subject" will be split into 3
+ // terms in "en_US_POSIX": "email", ".", and "subject", while it's just one
+ // term in other locales. Our current LanguageSegmenter doesn't handle this
+ // special rule, so we replace it with "en_US".
+ if (options.locale == kLocaleAmericanEnglishComputer) {
+ ICING_LOG(WARNING) << "Locale " << kLocaleAmericanEnglishComputer
+ << " not supported. Converting to locale " << ULOC_US;
+ options.locale = ULOC_US;
+ }
+ return std::make_unique<ReverseJniLanguageSegmenter>(
+ std::move(options.locale), options.jni_cache);
+}
+
+} // namespace language_segmenter_factory
+
+} // namespace lib
+} // namespace icing
diff --git a/icing/tokenization/reverse_jni/reverse-jni-language-segmenter-test-jni.cc b/icing/tokenization/reverse_jni/reverse-jni-language-segmenter-test-jni.cc
new file mode 100644
index 0000000..8392363
--- /dev/null
+++ b/icing/tokenization/reverse_jni/reverse-jni-language-segmenter-test-jni.cc
@@ -0,0 +1,37 @@
+// Copyright (C) 2019 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <jni.h>
+
+#include "gtest/gtest.h"
+#include "icing/testing/logging-event-listener.h"
+
+// Global variable used so that the test implementation can access the JNIEnv.
+JNIEnv* g_jenv = nullptr;
+
+extern "C" JNIEXPORT jboolean JNICALL
+Java_icing_tokenization_reverse_1jni_ReverseJniLanguageSegmenterTest_testsMain(
+ JNIEnv* env, jclass ignored) {
+ g_jenv = env;
+
+ std::vector<char*> my_argv;
+ char arg[] = "reverse-jni-language-segmenter-test-lib";
+ my_argv.push_back(arg);
+ int argc = 1;
+ char** argv = &(my_argv[0]);
+ testing::InitGoogleTest(&argc, argv);
+ testing::UnitTest::GetInstance()->listeners().Append(
+ new icing::lib::LoggingEventListener());
+ return RUN_ALL_TESTS() == 0;
+}
diff --git a/icing/tokenization/reverse_jni/reverse-jni-language-segmenter-test.cc b/icing/tokenization/reverse_jni/reverse-jni-language-segmenter-test.cc
new file mode 100644
index 0000000..a01d944
--- /dev/null
+++ b/icing/tokenization/reverse_jni/reverse-jni-language-segmenter-test.cc
@@ -0,0 +1,1085 @@
+// Copyright (C) 2019 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "icing/tokenization/reverse_jni/reverse-jni-language-segmenter-test.h"
+
+#include <memory>
+#include <string_view>
+
+#include "icing/text_classifier/lib3/utils/base/status.h"
+#include "icing/text_classifier/lib3/utils/base/statusor.h"
+#include "gmock/gmock.h"
+#include "icing/absl_ports/str_cat.h"
+#include "icing/testing/common-matchers.h"
+#include "icing/testing/icu-i18n-test-utils.h"
+#include "icing/tokenization/language-segmenter-factory.h"
+#include "icing/tokenization/language-segmenter.h"
+#include "unicode/uloc.h"
+
+namespace icing {
+namespace lib {
+
+namespace test_internal {
+
+using ::testing::ElementsAre;
+using ::testing::Eq;
+using ::testing::IsEmpty;
+
+namespace {
+
+language_segmenter_factory::SegmenterOptions GetSegmenterOptions(
+ const std::string& locale, const JniCache* jni_cache) {
+ return language_segmenter_factory::SegmenterOptions(locale, jni_cache);
+}
+
+// Returns a vector containing all terms retrieved by Advancing on the iterator.
+std::vector<std::string_view> GetAllTermsAdvance(
+ LanguageSegmenter::Iterator* itr) {
+ std::vector<std::string_view> terms;
+ while (itr->Advance()) {
+ terms.push_back(itr->GetTerm());
+ }
+ return terms;
+}
+
+// Returns a vector containing all terms retrieved by calling ResetAfter with
+// the current position to simulate Advancing on the iterator.
+std::vector<std::string_view> GetAllTermsResetAfter(
+ LanguageSegmenter::Iterator* itr) {
+ std::vector<std::string_view> terms;
+ if (!itr->ResetToStart().ok()) {
+ return terms;
+ }
+ terms.push_back(itr->GetTerm());
+ const char* text_begin = itr->GetTerm().data();
+ // Calling ResetToTermStartingAfter with the current position should get the
+ // very next term in the sequence.
+ for (int current_pos = 0; itr->ResetToTermStartingAfter(current_pos).ok();
+ current_pos = itr->GetTerm().data() - text_begin) {
+ terms.push_back(itr->GetTerm());
+ }
+ return terms;
+}
+
+// Returns a vector containing all terms retrieved by alternating calls to
+// Advance and calls to ResetAfter with the current position to simulate
+// Advancing.
+std::vector<std::string_view> GetAllTermsAdvanceAndResetAfter(
+ LanguageSegmenter::Iterator* itr) {
+ const char* text_begin = itr->GetTerm().data();
+ std::vector<std::string_view> terms;
+
+ bool is_ok = true;
+ int current_pos = 0;
+ while (is_ok) {
+ // Alternate between using Advance and ResetToTermAfter.
+ if (terms.size() % 2 == 0) {
+ is_ok = itr->Advance();
+ } else {
+ // Calling ResetToTermStartingAfter with the current position should get
+ // the very next term in the sequence.
+ current_pos = itr->GetTerm().data() - text_begin;
+ is_ok = itr->ResetToTermStartingAfter(current_pos).ok();
+ }
+ if (is_ok) {
+ terms.push_back(itr->GetTerm());
+ }
+ }
+ return terms;
+}
+
+// Returns a vector containing all terms retrieved by calling ResetBefore with
+// the current position, starting at the end of the text. This vector should be
+// in reverse order of GetAllTerms and missing the last term.
+std::vector<std::string_view> GetAllTermsResetBefore(
+ LanguageSegmenter::Iterator* itr) {
+ const char* text_begin = itr->GetTerm().data();
+ int last_pos = 0;
+ while (itr->Advance()) {
+ last_pos = itr->GetTerm().data() - text_begin;
+ }
+ std::vector<std::string_view> terms;
+ // Calling ResetToTermEndingBefore with the current position should get the
+ // previous term in the sequence.
+ for (int current_pos = last_pos;
+ itr->ResetToTermEndingBefore(current_pos).ok();
+ current_pos = itr->GetTerm().data() - text_begin) {
+ terms.push_back(itr->GetTerm());
+ }
+ return terms;
+}
+
+} // namespace
+
+TEST_P(ReverseJniLanguageSegmenterTest, EmptyText) {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ auto language_segmenter,
+ language_segmenter_factory::Create(
+ GetSegmenterOptions(GetLocale(), jni_cache_.get())));
+ EXPECT_THAT(language_segmenter->GetAllTerms(""), IsOkAndHolds(IsEmpty()));
+}
+
+TEST_P(ReverseJniLanguageSegmenterTest, SimpleText) {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ auto language_segmenter,
+ language_segmenter_factory::Create(
+ GetSegmenterOptions(GetLocale(), jni_cache_.get())));
+ EXPECT_THAT(language_segmenter->GetAllTerms("Hello World"),
+ IsOkAndHolds(ElementsAre("Hello", " ", "World")));
+}
+
+TEST_P(ReverseJniLanguageSegmenterTest, ASCII_Punctuation) {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ auto language_segmenter,
+ language_segmenter_factory::Create(
+ GetSegmenterOptions(GetLocale(), jni_cache_.get())));
+ // ASCII punctuation marks are kept
+ EXPECT_THAT(
+ language_segmenter->GetAllTerms("Hello, World!!!"),
+ IsOkAndHolds(ElementsAre("Hello", ",", " ", "World", "!", "!", "!")));
+ EXPECT_THAT(language_segmenter->GetAllTerms("Open-source project"),
+ IsOkAndHolds(ElementsAre("Open", "-", "source", " ", "project")));
+ EXPECT_THAT(language_segmenter->GetAllTerms("100%"),
+ IsOkAndHolds(ElementsAre("100", "%")));
+ EXPECT_THAT(language_segmenter->GetAllTerms("A&B"),
+ IsOkAndHolds(ElementsAre("A", "&", "B")));
+}
+
+TEST_P(ReverseJniLanguageSegmenterTest, ASCII_SpecialCharacter) {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ auto language_segmenter,
+ language_segmenter_factory::Create(
+ GetSegmenterOptions(GetLocale(), jni_cache_.get())));
+ // ASCII special characters are kept
+ EXPECT_THAT(language_segmenter->GetAllTerms("Pay $1000"),
+ IsOkAndHolds(ElementsAre("Pay", " ", "$", "1000")));
+ EXPECT_THAT(language_segmenter->GetAllTerms("A+B"),
+ IsOkAndHolds(ElementsAre("A", "+", "B")));
+ // 0x0009 is the unicode for tab (within ASCII range).
+ std::string text_with_tab = absl_ports::StrCat(
+ "Hello", UCharToString(0x0009), UCharToString(0x0009), "World");
+ EXPECT_THAT(language_segmenter->GetAllTerms(text_with_tab),
+ IsOkAndHolds(ElementsAre("Hello", UCharToString(0x0009),
+ UCharToString(0x0009), "World")));
+}
+
+TEST_P(ReverseJniLanguageSegmenterTest, Non_ASCII_Non_Alphabetic) {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ auto language_segmenter,
+ language_segmenter_factory::Create(
+ GetSegmenterOptions(GetLocale(), jni_cache_.get())));
+ // Full-width (non-ASCII) punctuation marks and special characters are left
+ // out.
+ EXPECT_THAT(language_segmenter->GetAllTerms("。?·Hello!×"),
+ IsOkAndHolds(ElementsAre("Hello")));
+}
+
+TEST_P(ReverseJniLanguageSegmenterTest, Acronym) {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ auto language_segmenter,
+ language_segmenter_factory::Create(
+ GetSegmenterOptions(GetLocale(), jni_cache_.get())));
+ EXPECT_THAT(language_segmenter->GetAllTerms("U.S.𡔖 Bank"),
+ IsOkAndHolds(ElementsAre("U.S", ".", "𡔖", " ", "Bank")));
+ EXPECT_THAT(language_segmenter->GetAllTerms("I.B.M."),
+ IsOkAndHolds(ElementsAre("I.B.M", ".")));
+ EXPECT_THAT(language_segmenter->GetAllTerms("I,B,M"),
+ IsOkAndHolds(ElementsAre("I", ",", "B", ",", "M")));
+ EXPECT_THAT(language_segmenter->GetAllTerms("I B M"),
+ IsOkAndHolds(ElementsAre("I", " ", "B", " ", "M")));
+}
+
+TEST_P(ReverseJniLanguageSegmenterTest, WordConnector) {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ auto language_segmenter,
+ language_segmenter_factory::Create(
+ GetSegmenterOptions(GetLocale(), jni_cache_.get())));
+ // According to unicode word break rules
+ // WB6(https://unicode.org/reports/tr29/#WB6),
+ // WB7(https://unicode.org/reports/tr29/#WB7), and a few others, some
+ // punctuation characters are used as word connecters. That is, words don't
+ // break before and after them. Here we just test some that we care about.
+
+ // Word connecters
+ EXPECT_THAT(language_segmenter->GetAllTerms("com.google.android"),
+ IsOkAndHolds(ElementsAre("com.google.android")));
+ EXPECT_THAT(language_segmenter->GetAllTerms("com:google:android"),
+ IsOkAndHolds(ElementsAre("com:google:android")));
+ EXPECT_THAT(language_segmenter->GetAllTerms("com'google'android"),
+ IsOkAndHolds(ElementsAre("com'google'android")));
+ EXPECT_THAT(language_segmenter->GetAllTerms("com_google_android"),
+ IsOkAndHolds(ElementsAre("com_google_android")));
+
+ // Word connecters can be mixed
+ EXPECT_THAT(language_segmenter->GetAllTerms("com.google.android:icing"),
+ IsOkAndHolds(ElementsAre("com.google.android:icing")));
+
+ // Any heading and trailing characters are not connecters
+ EXPECT_THAT(language_segmenter->GetAllTerms(".com.google.android."),
+ IsOkAndHolds(ElementsAre(".", "com.google.android", ".")));
+
+ // Not word connecters
+ EXPECT_THAT(language_segmenter->GetAllTerms("com,google,android"),
+ IsOkAndHolds(ElementsAre("com", ",", "google", ",", "android")));
+ EXPECT_THAT(language_segmenter->GetAllTerms("com-google-android"),
+ IsOkAndHolds(ElementsAre("com", "-", "google", "-", "android")));
+ EXPECT_THAT(language_segmenter->GetAllTerms("com+google+android"),
+ IsOkAndHolds(ElementsAre("com", "+", "google", "+", "android")));
+ EXPECT_THAT(language_segmenter->GetAllTerms("com*google*android"),
+ IsOkAndHolds(ElementsAre("com", "*", "google", "*", "android")));
+ EXPECT_THAT(language_segmenter->GetAllTerms("com@google@android"),
+ IsOkAndHolds(ElementsAre("com", "@", "google", "@", "android")));
+ EXPECT_THAT(language_segmenter->GetAllTerms("com^google^android"),
+ IsOkAndHolds(ElementsAre("com", "^", "google", "^", "android")));
+ EXPECT_THAT(language_segmenter->GetAllTerms("com&google&android"),
+ IsOkAndHolds(ElementsAre("com", "&", "google", "&", "android")));
+ EXPECT_THAT(language_segmenter->GetAllTerms("com|google|android"),
+ IsOkAndHolds(ElementsAre("com", "|", "google", "|", "android")));
+ EXPECT_THAT(language_segmenter->GetAllTerms("com/google/android"),
+ IsOkAndHolds(ElementsAre("com", "/", "google", "/", "android")));
+ EXPECT_THAT(language_segmenter->GetAllTerms("com;google;android"),
+ IsOkAndHolds(ElementsAre("com", ";", "google", ";", "android")));
+ EXPECT_THAT(
+ language_segmenter->GetAllTerms("com\"google\"android"),
+ IsOkAndHolds(ElementsAre("com", "\"", "google", "\"", "android")));
+}
+
+TEST_P(ReverseJniLanguageSegmenterTest, Apostrophes) {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ auto language_segmenter,
+ language_segmenter_factory::Create(
+ GetSegmenterOptions(GetLocale(), jni_cache_.get())));
+ EXPECT_THAT(language_segmenter->GetAllTerms("It's ok."),
+ IsOkAndHolds(ElementsAre("It's", " ", "ok", ".")));
+ EXPECT_THAT(language_segmenter->GetAllTerms("He'll be back."),
+ IsOkAndHolds(ElementsAre("He'll", " ", "be", " ", "back", ".")));
+ EXPECT_THAT(language_segmenter->GetAllTerms("'Hello 'World."),
+ IsOkAndHolds(ElementsAre("'", "Hello", " ", "'", "World", ".")));
+ EXPECT_THAT(language_segmenter->GetAllTerms("The dogs' bone"),
+ IsOkAndHolds(ElementsAre("The", " ", "dogs", "'", " ", "bone")));
+ // 0x2019 is the single right quote, should be treated the same as "'"
+ std::string token_with_quote =
+ absl_ports::StrCat("He", UCharToString(0x2019), "ll");
+ std::string text_with_quote =
+ absl_ports::StrCat(token_with_quote, " be back.");
+ EXPECT_THAT(
+ language_segmenter->GetAllTerms(text_with_quote),
+ IsOkAndHolds(ElementsAre(token_with_quote, " ", "be", " ", "back", ".")));
+}
+
+TEST_P(ReverseJniLanguageSegmenterTest, Parentheses) {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ auto language_segmenter,
+ language_segmenter_factory::Create(
+ GetSegmenterOptions(GetLocale(), jni_cache_.get())));
+
+ EXPECT_THAT(language_segmenter->GetAllTerms("(Hello)"),
+ IsOkAndHolds(ElementsAre("(", "Hello", ")")));
+
+ EXPECT_THAT(language_segmenter->GetAllTerms(")Hello("),
+ IsOkAndHolds(ElementsAre(")", "Hello", "(")));
+}
+
+TEST_P(ReverseJniLanguageSegmenterTest, Quotes) {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ auto language_segmenter,
+ language_segmenter_factory::Create(
+ GetSegmenterOptions(GetLocale(), jni_cache_.get())));
+
+ EXPECT_THAT(language_segmenter->GetAllTerms("\"Hello\""),
+ IsOkAndHolds(ElementsAre("\"", "Hello", "\"")));
+
+ EXPECT_THAT(language_segmenter->GetAllTerms("'Hello'"),
+ IsOkAndHolds(ElementsAre("'", "Hello", "'")));
+}
+
+TEST_P(ReverseJniLanguageSegmenterTest, Alphanumeric) {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ auto language_segmenter,
+ language_segmenter_factory::Create(
+ GetSegmenterOptions(GetLocale(), jni_cache_.get())));
+
+ // Alphanumeric terms are allowed
+ EXPECT_THAT(language_segmenter->GetAllTerms("Se7en A4 3a"),
+ IsOkAndHolds(ElementsAre("Se7en", " ", "A4", " ", "3a")));
+}
+
+TEST_P(ReverseJniLanguageSegmenterTest, Number) {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ auto language_segmenter,
+ language_segmenter_factory::Create(
+ GetSegmenterOptions(GetLocale(), jni_cache_.get())));
+
+ // Alphanumeric terms are allowed
+ EXPECT_THAT(
+ language_segmenter->GetAllTerms("3.141592653589793238462643383279"),
+ IsOkAndHolds(ElementsAre("3.141592653589793238462643383279")));
+
+ EXPECT_THAT(language_segmenter->GetAllTerms("3,456.789"),
+ IsOkAndHolds(ElementsAre("3,456.789")));
+
+ EXPECT_THAT(language_segmenter->GetAllTerms("-123"),
+ IsOkAndHolds(ElementsAre("-", "123")));
+}
+
+TEST_P(ReverseJniLanguageSegmenterTest, ContinuousWhitespaces) {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ auto language_segmenter,
+ language_segmenter_factory::Create(
+ GetSegmenterOptions(GetLocale(), jni_cache_.get())));
+ // Multiple continuous whitespaces are treated as one.
+ const int kNumSeparators = 256;
+ std::string text_with_spaces =
+ absl_ports::StrCat("Hello", std::string(kNumSeparators, ' '), "World");
+ EXPECT_THAT(language_segmenter->GetAllTerms(text_with_spaces),
+ IsOkAndHolds(ElementsAre("Hello", " ", "World")));
+
+ // Multiple continuous whitespaces are treated as one. Whitespace at the
+ // beginning of the text doesn't affect the results of GetTerm() after the
+ // iterator is done.
+ text_with_spaces = absl_ports::StrCat(std::string(kNumSeparators, ' '),
+ "Hello", " ", "World");
+ ICING_ASSERT_OK_AND_ASSIGN(auto itr,
+ language_segmenter->Segment(text_with_spaces));
+ std::vector<std::string_view> terms;
+ while (itr->Advance()) {
+ terms.push_back(itr->GetTerm());
+ }
+ EXPECT_THAT(terms, ElementsAre(" ", "Hello", " ", "World"));
+ EXPECT_THAT(itr->GetTerm(), IsEmpty());
+}
+
+TEST_P(ReverseJniLanguageSegmenterTest, CJKT) {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ auto language_segmenter,
+ language_segmenter_factory::Create(
+ GetSegmenterOptions(GetLocale(), jni_cache_.get())));
+ // CJKT (Chinese, Japanese, Khmer, Thai) are the 4 main languages that don't
+ // have whitespaces as word delimiter.
+
+ // Chinese
+ EXPECT_THAT(language_segmenter->GetAllTerms("我每天走路去上班。"),
+ IsOkAndHolds(ElementsAre("我", "每天", "走路", "去", "上班")));
+ // Japanese
+ EXPECT_THAT(language_segmenter->GetAllTerms("私は毎日仕事に歩いています。"),
+ IsOkAndHolds(ElementsAre("私", "は", "毎日", "仕事", "に", "歩",
+ "い", "てい", "ます")));
+ // Khmer
+ EXPECT_THAT(language_segmenter->GetAllTerms("ញុំដើរទៅធ្វើការរាល់ថ្ងៃ។"),
+ IsOkAndHolds(ElementsAre("ញុំ", "ដើរទៅ", "ធ្វើការ", "រាល់ថ្ងៃ")));
+ // Thai
+ EXPECT_THAT(
+ language_segmenter->GetAllTerms("ฉันเดินไปทำงานทุกวัน"),
+ IsOkAndHolds(ElementsAre("ฉัน", "เดิน", "ไป", "ทำงาน", "ทุก", "วัน")));
+}
+
+TEST_P(ReverseJniLanguageSegmenterTest, LatinLettersWithAccents) {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ auto language_segmenter,
+ language_segmenter_factory::Create(
+ GetSegmenterOptions(GetLocale(), jni_cache_.get())));
+ EXPECT_THAT(language_segmenter->GetAllTerms("āăąḃḅḇčćç"),
+ IsOkAndHolds(ElementsAre("āăąḃḅḇčćç")));
+}
+
+// TODO(samzheng): test cases for more languages (e.g. top 20 in the world)
+TEST_P(ReverseJniLanguageSegmenterTest, WhitespaceSplitLanguages) {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ auto language_segmenter,
+ language_segmenter_factory::Create(
+ GetSegmenterOptions(GetLocale(), jni_cache_.get())));
+ // Turkish
+ EXPECT_THAT(language_segmenter->GetAllTerms("merhaba dünya"),
+ IsOkAndHolds(ElementsAre("merhaba", " ", "dünya")));
+ // Korean
+ EXPECT_THAT(
+ language_segmenter->GetAllTerms("나는 매일 출근합니다."),
+ IsOkAndHolds(ElementsAre("나는", " ", "매일", " ", "출근합니다", ".")));
+}
+
+// TODO(samzheng): more mixed languages test cases
+TEST_P(ReverseJniLanguageSegmenterTest, MixedLanguages) {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ auto language_segmenter,
+ language_segmenter_factory::Create(
+ GetSegmenterOptions(GetLocale(), jni_cache_.get())));
+ EXPECT_THAT(language_segmenter->GetAllTerms("How are you你好吗お元気ですか"),
+ IsOkAndHolds(ElementsAre("How", " ", "are", " ", "you", "你好",
+ "吗", "お", "元気", "です", "か")));
+
+ EXPECT_THAT(
+ language_segmenter->GetAllTerms("나는 California에 산다"),
+ IsOkAndHolds(ElementsAre("나는", " ", "California", "에", " ", "산다")));
+}
+
+TEST_P(ReverseJniLanguageSegmenterTest, NotCopyStrings) {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ auto language_segmenter,
+ language_segmenter_factory::Create(
+ GetSegmenterOptions(GetLocale(), jni_cache_.get())));
+ // Validates that the input strings are not copied
+ const std::string text = "Hello World";
+ const char* word1_address = text.c_str();
+ const char* word2_address = text.c_str() + 6;
+ ICING_ASSERT_OK_AND_ASSIGN(std::vector<std::string_view> terms,
+ language_segmenter->GetAllTerms(text));
+ ASSERT_THAT(terms, ElementsAre("Hello", " ", "World"));
+ const char* word1_result_address = terms.at(0).data();
+ const char* word2_result_address = terms.at(2).data();
+
+ // The underlying char* should be the same
+ EXPECT_THAT(word1_address, Eq(word1_result_address));
+ EXPECT_THAT(word2_address, Eq(word2_result_address));
+}
+
+TEST_P(ReverseJniLanguageSegmenterTest, ResetToTermAfterOutOfBounds) {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ auto segmenter, language_segmenter_factory::Create(
+ GetSegmenterOptions(GetLocale(), jni_cache_.get())));
+ constexpr std::string_view kText = "How are you你好吗お元気ですか";
+ ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<LanguageSegmenter::Iterator> itr,
+ segmenter->Segment(kText));
+
+ // String: "How are you你好吗お元気ですか"
+ // ^ ^^ ^^ ^ ^ ^ ^ ^ ^
+ // Bytes: 0 3 4 7 8 11 172023 29 35
+ ASSERT_THAT(itr->ResetToTermStartingAfter(7), IsOkAndHolds(Eq(8)));
+ ASSERT_THAT(itr->GetTerm(), Eq("you"));
+
+ EXPECT_THAT(itr->ResetToTermStartingAfter(-1),
+ StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
+ EXPECT_THAT(itr->GetTerm(), Eq("you"));
+
+ EXPECT_THAT(itr->ResetToTermStartingAfter(kText.length()),
+ StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
+ EXPECT_THAT(itr->GetTerm(), Eq("you"));
+}
+
+// Tests that ResetToTermAfter and Advance produce the same output. With the
+// exception of the first term, which is inaccessible via ResetToTermAfter,
+// the stream of terms produced by Advance calls should exactly match the
+// terms produced by ResetToTermAfter calls with the current position
+// provided as the argument.
+TEST_P(ReverseJniLanguageSegmenterTest,
+ MixedLanguagesResetToTermAfterEquivalentToAdvance) {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ auto segmenter, language_segmenter_factory::Create(
+ GetSegmenterOptions(GetLocale(), jni_cache_.get())));
+ constexpr std::string_view kText = "How are𡔖 you你好吗お元気ですか";
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<LanguageSegmenter::Iterator> advance_itr,
+ segmenter->Segment(kText));
+ std::vector<std::string_view> advance_terms =
+ GetAllTermsAdvance(advance_itr.get());
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<LanguageSegmenter::Iterator> reset_to_term_itr,
+ segmenter->Segment(kText));
+ std::vector<std::string_view> reset_terms =
+ GetAllTermsResetAfter(reset_to_term_itr.get());
+
+ EXPECT_THAT(reset_terms, testing::ElementsAreArray(advance_terms));
+ EXPECT_THAT(reset_to_term_itr->GetTerm(), Eq(advance_itr->GetTerm()));
+}
+
+TEST_P(ReverseJniLanguageSegmenterTest,
+ ThaiResetToTermAfterEquivalentToAdvance) {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ auto segmenter, language_segmenter_factory::Create(
+ GetSegmenterOptions(GetLocale(), jni_cache_.get())));
+ constexpr std::string_view kThai = "ฉันเดินไปทำงานทุกวัน";
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<LanguageSegmenter::Iterator> advance_itr,
+ segmenter->Segment(kThai));
+ std::vector<std::string_view> advance_terms =
+ GetAllTermsAdvance(advance_itr.get());
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<LanguageSegmenter::Iterator> reset_to_term_itr,
+ segmenter->Segment(kThai));
+ std::vector<std::string_view> reset_terms =
+ GetAllTermsResetAfter(reset_to_term_itr.get());
+
+ EXPECT_THAT(reset_terms, testing::ElementsAreArray(advance_terms));
+ EXPECT_THAT(reset_to_term_itr->GetTerm(), Eq(advance_itr->GetTerm()));
+}
+
+TEST_P(ReverseJniLanguageSegmenterTest,
+ KoreanResetToTermAfterEquivalentToAdvance) {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ auto segmenter, language_segmenter_factory::Create(
+ GetSegmenterOptions(GetLocale(), jni_cache_.get())));
+ constexpr std::string_view kKorean = "나는 매일 출근합니다.";
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<LanguageSegmenter::Iterator> advance_itr,
+ segmenter->Segment(kKorean));
+ std::vector<std::string_view> advance_terms =
+ GetAllTermsAdvance(advance_itr.get());
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<LanguageSegmenter::Iterator> reset_to_term_itr,
+ segmenter->Segment(kKorean));
+ std::vector<std::string_view> reset_terms =
+ GetAllTermsResetAfter(reset_to_term_itr.get());
+
+ EXPECT_THAT(reset_terms, testing::ElementsAreArray(advance_terms));
+ EXPECT_THAT(reset_to_term_itr->GetTerm(), Eq(advance_itr->GetTerm()));
+}
+
+// Tests that ResetToTermAfter and Advance can be used in conjunction. Just as
+// ResetToTermAfter(current_position) can be used to simulate Advance, users
+// should be able to mix ResetToTermAfter(current_position) calls and Advance
+// calls to mimic calling Advance.
+TEST_P(ReverseJniLanguageSegmenterTest,
+ MixedLanguagesResetToTermAfterInteroperableWithAdvance) {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ auto segmenter, language_segmenter_factory::Create(
+ GetSegmenterOptions(GetLocale(), jni_cache_.get())));
+ constexpr std::string_view kText = "How are𡔖 you你好吗お元気ですか";
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<LanguageSegmenter::Iterator> advance_itr,
+ segmenter->Segment(kText));
+ std::vector<std::string_view> advance_terms =
+ GetAllTermsAdvance(advance_itr.get());
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<LanguageSegmenter::Iterator> advance_and_reset_itr,
+ segmenter->Segment(kText));
+ std::vector<std::string_view> advance_and_reset_terms =
+ GetAllTermsAdvanceAndResetAfter(advance_and_reset_itr.get());
+
+ EXPECT_THAT(advance_and_reset_terms,
+ testing::ElementsAreArray(advance_terms));
+ EXPECT_THAT(advance_and_reset_itr->GetTerm(), Eq(advance_itr->GetTerm()));
+}
+
+TEST_P(ReverseJniLanguageSegmenterTest,
+ ThaiResetToTermAfterInteroperableWithAdvance) {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ auto segmenter, language_segmenter_factory::Create(
+ GetSegmenterOptions(GetLocale(), jni_cache_.get())));
+ constexpr std::string_view kThai = "ฉันเดินไปทำงานทุกวัน";
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<LanguageSegmenter::Iterator> advance_itr,
+ segmenter->Segment(kThai));
+ std::vector<std::string_view> advance_terms =
+ GetAllTermsAdvance(advance_itr.get());
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<LanguageSegmenter::Iterator> advance_and_reset_itr,
+ segmenter->Segment(kThai));
+ std::vector<std::string_view> advance_and_reset_terms =
+ GetAllTermsAdvanceAndResetAfter(advance_and_reset_itr.get());
+
+ EXPECT_THAT(advance_and_reset_terms,
+ testing::ElementsAreArray(advance_terms));
+ EXPECT_THAT(advance_and_reset_itr->GetTerm(), Eq(advance_itr->GetTerm()));
+}
+
+TEST_P(ReverseJniLanguageSegmenterTest,
+ KoreanResetToTermAfterInteroperableWithAdvance) {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ auto segmenter, language_segmenter_factory::Create(
+ GetSegmenterOptions(GetLocale(), jni_cache_.get())));
+ constexpr std::string_view kKorean = "나는 매일 출근합니다.";
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<LanguageSegmenter::Iterator> advance_itr,
+ segmenter->Segment(kKorean));
+ std::vector<std::string_view> advance_terms =
+ GetAllTermsAdvance(advance_itr.get());
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<LanguageSegmenter::Iterator> advance_and_reset_itr,
+ segmenter->Segment(kKorean));
+ std::vector<std::string_view> advance_and_reset_terms =
+ GetAllTermsAdvanceAndResetAfter(advance_and_reset_itr.get());
+
+ EXPECT_THAT(advance_and_reset_terms,
+ testing::ElementsAreArray(advance_terms));
+ EXPECT_THAT(advance_and_reset_itr->GetTerm(), Eq(advance_itr->GetTerm()));
+}
+
+TEST_P(ReverseJniLanguageSegmenterTest, MixedLanguagesResetToTermAfter) {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ auto language_segmenter,
+ language_segmenter_factory::Create(
+ GetSegmenterOptions(GetLocale(), jni_cache_.get())));
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<LanguageSegmenter::Iterator> itr,
+ language_segmenter->Segment("How are you你好吗お元気ですか"));
+
+ // String: "How are you你好吗お元気ですか"
+ // ^ ^^ ^^ ^ ^ ^ ^ ^ ^
+ // Bytes: 0 3 4 7 8 11 172023 29 35
+ EXPECT_THAT(itr->ResetToTermStartingAfter(2), IsOkAndHolds(Eq(3)));
+ EXPECT_THAT(itr->GetTerm(), Eq(" "));
+
+ EXPECT_THAT(itr->ResetToTermStartingAfter(10), IsOkAndHolds(Eq(11)));
+ EXPECT_THAT(itr->GetTerm(), Eq("你好"));
+
+ EXPECT_THAT(itr->ResetToTermStartingAfter(7), IsOkAndHolds(Eq(8)));
+ EXPECT_THAT(itr->GetTerm(), Eq("you"));
+
+ EXPECT_THAT(itr->ResetToTermStartingAfter(32), IsOkAndHolds(Eq(35)));
+ EXPECT_THAT(itr->GetTerm(), Eq("か"));
+
+ EXPECT_THAT(itr->ResetToTermStartingAfter(14), IsOkAndHolds(Eq(17)));
+ EXPECT_THAT(itr->GetTerm(), Eq("吗"));
+
+ EXPECT_THAT(itr->ResetToTermStartingAfter(0), IsOkAndHolds(Eq(3)));
+ EXPECT_THAT(itr->GetTerm(), Eq(" "));
+
+ EXPECT_THAT(itr->ResetToTermStartingAfter(35),
+ StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
+ EXPECT_THAT(itr->GetTerm(), IsEmpty());
+}
+
+TEST_P(ReverseJniLanguageSegmenterTest, ContinuousWhitespacesResetToTermAfter) {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ auto language_segmenter,
+ language_segmenter_factory::Create(
+ GetSegmenterOptions(GetLocale(), jni_cache_.get())));
+ // Multiple continuous whitespaces are treated as one.
+ constexpr std::string_view kTextWithSpace = "Hello World";
+ ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<LanguageSegmenter::Iterator> itr,
+ language_segmenter->Segment(kTextWithSpace));
+
+ // String: "Hello World"
+ // ^ ^ ^
+ // Bytes: 0 5 15
+ EXPECT_THAT(itr->ResetToTermStartingAfter(0), IsOkAndHolds(Eq(5)));
+ EXPECT_THAT(itr->GetTerm(), Eq(" "));
+
+ EXPECT_THAT(itr->ResetToTermStartingAfter(2), IsOkAndHolds(Eq(5)));
+ EXPECT_THAT(itr->GetTerm(), Eq(" "));
+
+ EXPECT_THAT(itr->ResetToTermStartingAfter(10), IsOkAndHolds(Eq(15)));
+ EXPECT_THAT(itr->GetTerm(), Eq("World"));
+
+ EXPECT_THAT(itr->ResetToTermStartingAfter(5), IsOkAndHolds(Eq(15)));
+ EXPECT_THAT(itr->GetTerm(), Eq("World"));
+
+ EXPECT_THAT(itr->ResetToTermStartingAfter(15),
+ StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
+ EXPECT_THAT(itr->GetTerm(), IsEmpty());
+
+ EXPECT_THAT(itr->ResetToTermStartingAfter(17),
+ StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
+ EXPECT_THAT(itr->GetTerm(), IsEmpty());
+
+ EXPECT_THAT(itr->ResetToTermStartingAfter(19),
+ StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
+ EXPECT_THAT(itr->GetTerm(), IsEmpty());
+}
+
+TEST_P(ReverseJniLanguageSegmenterTest, ChineseResetToTermAfter) {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ auto language_segmenter,
+ language_segmenter_factory::Create(
+ GetSegmenterOptions(GetLocale(), jni_cache_.get())));
+ // CJKT (Chinese, Japanese, Khmer, Thai) are the 4 main languages that
+ // don't have whitespaces as word delimiter. Chinese
+ constexpr std::string_view kChinese = "我每天走路去上班。";
+ ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<LanguageSegmenter::Iterator> itr,
+ language_segmenter->Segment(kChinese));
+ // String: "我每天走路去上班。"
+ // ^ ^ ^ ^^
+ // Bytes: 0 3 9 15 18
+ EXPECT_THAT(itr->ResetToTermStartingAfter(0), IsOkAndHolds(Eq(3)));
+ EXPECT_THAT(itr->GetTerm(), Eq("每天"));
+
+ EXPECT_THAT(itr->ResetToTermStartingAfter(7), IsOkAndHolds(Eq(9)));
+ EXPECT_THAT(itr->GetTerm(), Eq("走路"));
+
+ EXPECT_THAT(itr->ResetToTermStartingAfter(19),
+ StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
+ EXPECT_THAT(itr->GetTerm(), IsEmpty());
+}
+
+TEST_P(ReverseJniLanguageSegmenterTest, JapaneseResetToTermAfter) {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ auto language_segmenter,
+ language_segmenter_factory::Create(
+ GetSegmenterOptions(GetLocale(), jni_cache_.get())));
+ // Japanese
+ constexpr std::string_view kJapanese = "私は毎日仕事に歩いています。";
+ ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<LanguageSegmenter::Iterator> itr,
+ language_segmenter->Segment(kJapanese));
+ // String: "私は毎日仕事に歩いています。"
+ // ^ ^ ^ ^ ^ ^ ^ ^ ^
+ // Bytes: 0 3 6 12 18212427 33
+ EXPECT_THAT(itr->ResetToTermStartingAfter(0), IsOkAndHolds(Eq(3)));
+ EXPECT_THAT(itr->GetTerm(), Eq("は"));
+
+ EXPECT_THAT(itr->ResetToTermStartingAfter(33),
+ StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
+ EXPECT_THAT(itr->GetTerm(), IsEmpty());
+
+ EXPECT_THAT(itr->ResetToTermStartingAfter(7), IsOkAndHolds(Eq(12)));
+ EXPECT_THAT(itr->GetTerm(), Eq("仕事"));
+}
+
+TEST_P(ReverseJniLanguageSegmenterTest, KhmerResetToTermAfter) {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ auto language_segmenter,
+ language_segmenter_factory::Create(
+ GetSegmenterOptions(GetLocale(), jni_cache_.get())));
+ constexpr std::string_view kKhmer = "ញុំដើរទៅធ្វើការរាល់ថ្ងៃ។";
+ ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<LanguageSegmenter::Iterator> itr,
+ language_segmenter->Segment(kKhmer));
+ // String: "ញុំដើរទៅធ្វើការរាល់ថ្ងៃ។"
+ // ^ ^ ^ ^
+ // Bytes: 0 9 24 45
+ EXPECT_THAT(itr->ResetToTermStartingAfter(0), IsOkAndHolds(Eq(9)));
+ EXPECT_THAT(itr->GetTerm(), Eq("ដើរទៅ"));
+
+ EXPECT_THAT(itr->ResetToTermStartingAfter(47),
+ StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
+ EXPECT_THAT(itr->GetTerm(), IsEmpty());
+
+ EXPECT_THAT(itr->ResetToTermStartingAfter(14), IsOkAndHolds(Eq(24)));
+ EXPECT_THAT(itr->GetTerm(), Eq("ធ្វើការ"));
+}
+
+TEST_P(ReverseJniLanguageSegmenterTest, ThaiResetToTermAfter) {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ auto language_segmenter,
+ language_segmenter_factory::Create(
+ GetSegmenterOptions(GetLocale(), jni_cache_.get())));
+ // Thai
+ constexpr std::string_view kThai = "ฉันเดินไปทำงานทุกวัน";
+ ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<LanguageSegmenter::Iterator> itr,
+ language_segmenter->Segment(kThai));
+ // String: "ฉันเดินไปทำงานทุกวัน"
+ // ^ ^ ^ ^ ^ ^
+ // Bytes: 0 9 21 27 42 51
+ EXPECT_THAT(itr->ResetToTermStartingAfter(0), IsOkAndHolds(Eq(9)));
+ EXPECT_THAT(itr->GetTerm(), Eq("เดิน"));
+
+ EXPECT_THAT(itr->ResetToTermStartingAfter(51),
+ StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
+ EXPECT_THAT(itr->GetTerm(), IsEmpty());
+
+ EXPECT_THAT(itr->ResetToTermStartingAfter(13), IsOkAndHolds(Eq(21)));
+ EXPECT_THAT(itr->GetTerm(), Eq("ไป"));
+
+ EXPECT_THAT(itr->ResetToTermStartingAfter(34), IsOkAndHolds(Eq(42)));
+ EXPECT_THAT(itr->GetTerm(), Eq("ทุก"));
+}
+
+TEST_P(ReverseJniLanguageSegmenterTest, ResetToTermBeforeOutOfBounds) {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ auto segmenter, language_segmenter_factory::Create(
+ GetSegmenterOptions(GetLocale(), jni_cache_.get())));
+ constexpr std::string_view kText = "How are you你好吗お元気ですか";
+ ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<LanguageSegmenter::Iterator> itr,
+ segmenter->Segment(kText));
+
+ // String: "How are you你好吗お元気ですか"
+ // ^ ^^ ^^ ^ ^ ^ ^ ^ ^
+ // Bytes: 0 3 4 7 8 11 172023 29 35
+ ASSERT_THAT(itr->ResetToTermEndingBefore(7), IsOkAndHolds(Eq(4)));
+ ASSERT_THAT(itr->GetTerm(), Eq("are"));
+
+ EXPECT_THAT(itr->ResetToTermEndingBefore(-1),
+ StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
+ EXPECT_THAT(itr->GetTerm(), Eq("are"));
+
+ EXPECT_THAT(itr->ResetToTermEndingBefore(kText.length()),
+ StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
+ EXPECT_THAT(itr->GetTerm(), Eq("are"));
+}
+
+// Tests that ResetToTermBefore and Advance produce the same output. With the
+// exception of the last term, which is inaccessible via ResetToTermBefore,
+// the stream of terms produced by Advance calls should exactly match the
+// terms produced by ResetToTermBefore calls with the current position
+// provided as the argument (after their order has been reversed).
+TEST_P(ReverseJniLanguageSegmenterTest,
+ MixedLanguagesResetToTermBeforeEquivalentToAdvance) {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ auto segmenter, language_segmenter_factory::Create(
+ GetSegmenterOptions(GetLocale(), jni_cache_.get())));
+ constexpr std::string_view kText = "How are𡔖 you你好吗お元気ですか";
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<LanguageSegmenter::Iterator> advance_itr,
+ segmenter->Segment(kText));
+ std::vector<std::string_view> advance_terms =
+ GetAllTermsAdvance(advance_itr.get());
+ // Can't produce the last term via calls to ResetToTermBefore. So skip
+ // past that one.
+ auto itr = advance_terms.begin();
+ std::advance(itr, advance_terms.size() - 1);
+ advance_terms.erase(itr);
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<LanguageSegmenter::Iterator> reset_to_term_itr,
+ segmenter->Segment(kText));
+ std::vector<std::string_view> reset_terms =
+ GetAllTermsResetBefore(reset_to_term_itr.get());
+ std::reverse(reset_terms.begin(), reset_terms.end());
+
+ EXPECT_THAT(reset_terms, testing::ElementsAreArray(advance_terms));
+ EXPECT_THAT(reset_to_term_itr->GetTerm(), IsEmpty());
+ EXPECT_THAT(advance_itr->GetTerm(), IsEmpty());
+}
+
+TEST_P(ReverseJniLanguageSegmenterTest,
+ ThaiResetToTermBeforeEquivalentToAdvance) {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ auto segmenter, language_segmenter_factory::Create(
+ GetSegmenterOptions(GetLocale(), jni_cache_.get())));
+ constexpr std::string_view kThai = "ฉันเดินไปทำงานทุกวัน";
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<LanguageSegmenter::Iterator> advance_itr,
+ segmenter->Segment(kThai));
+ std::vector<std::string_view> advance_terms =
+ GetAllTermsAdvance(advance_itr.get());
+ // Can't produce the last term via calls to ResetToTermBefore. So skip
+ // past that one.
+ auto itr = advance_terms.begin();
+ std::advance(itr, advance_terms.size() - 1);
+ advance_terms.erase(itr);
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<LanguageSegmenter::Iterator> reset_to_term_itr,
+ segmenter->Segment(kThai));
+ std::vector<std::string_view> reset_terms =
+ GetAllTermsResetBefore(reset_to_term_itr.get());
+ std::reverse(reset_terms.begin(), reset_terms.end());
+
+ EXPECT_THAT(reset_terms, testing::ElementsAreArray(advance_terms));
+ EXPECT_THAT(reset_to_term_itr->GetTerm(), Eq(advance_itr->GetTerm()));
+}
+
+TEST_P(ReverseJniLanguageSegmenterTest,
+ KoreanResetToTermBeforeEquivalentToAdvance) {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ auto segmenter, language_segmenter_factory::Create(
+ GetSegmenterOptions(GetLocale(), jni_cache_.get())));
+ constexpr std::string_view kKorean = "나는 매일 출근합니다.";
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<LanguageSegmenter::Iterator> advance_itr,
+ segmenter->Segment(kKorean));
+ std::vector<std::string_view> advance_terms =
+ GetAllTermsAdvance(advance_itr.get());
+ // Can't produce the last term via calls to ResetToTermBefore. So skip
+ // past that one.
+ auto itr = advance_terms.begin();
+ std::advance(itr, advance_terms.size() - 1);
+ advance_terms.erase(itr);
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<LanguageSegmenter::Iterator> reset_to_term_itr,
+ segmenter->Segment(kKorean));
+ std::vector<std::string_view> reset_terms =
+ GetAllTermsResetBefore(reset_to_term_itr.get());
+ std::reverse(reset_terms.begin(), reset_terms.end());
+
+ EXPECT_THAT(reset_terms, testing::ElementsAreArray(advance_terms));
+ EXPECT_THAT(reset_to_term_itr->GetTerm(), Eq(advance_itr->GetTerm()));
+}
+
+TEST_P(ReverseJniLanguageSegmenterTest, MixedLanguagesResetToTermBefore) {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ auto language_segmenter,
+ language_segmenter_factory::Create(
+ GetSegmenterOptions(GetLocale(), jni_cache_.get())));
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<LanguageSegmenter::Iterator> itr,
+ language_segmenter->Segment("How are you你好吗お元気ですか"));
+
+ // String: "How are you你好吗お元気ですか"
+ // ^ ^^ ^^ ^ ^ ^ ^ ^ ^
+ // Bytes: 0 3 4 7 8 11 172023 29 35
+ EXPECT_THAT(itr->ResetToTermEndingBefore(2),
+ StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
+ EXPECT_THAT(itr->GetTerm(), IsEmpty());
+
+ EXPECT_THAT(itr->ResetToTermEndingBefore(10), IsOkAndHolds(Eq(7)));
+ EXPECT_THAT(itr->GetTerm(), Eq(" "));
+
+ EXPECT_THAT(itr->ResetToTermEndingBefore(7), IsOkAndHolds(Eq(4)));
+ EXPECT_THAT(itr->GetTerm(), Eq("are"));
+
+ EXPECT_THAT(itr->ResetToTermEndingBefore(32), IsOkAndHolds(Eq(23)));
+ EXPECT_THAT(itr->GetTerm(), Eq("元気"));
+
+ EXPECT_THAT(itr->ResetToTermEndingBefore(14), IsOkAndHolds(Eq(8)));
+ EXPECT_THAT(itr->GetTerm(), Eq("you"));
+
+ EXPECT_THAT(itr->ResetToTermEndingBefore(0),
+ StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
+ EXPECT_THAT(itr->GetTerm(), IsEmpty());
+
+ EXPECT_THAT(itr->ResetToTermEndingBefore(35), IsOkAndHolds(Eq(29)));
+ EXPECT_THAT(itr->GetTerm(), Eq("です"));
+}
+
+TEST_P(ReverseJniLanguageSegmenterTest,
+ ContinuousWhitespacesResetToTermBefore) {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ auto language_segmenter,
+ language_segmenter_factory::Create(
+ GetSegmenterOptions(GetLocale(), jni_cache_.get())));
+ // Multiple continuous whitespaces are treated as one.
+ constexpr std::string_view kTextWithSpace = "Hello World";
+ ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<LanguageSegmenter::Iterator> itr,
+ language_segmenter->Segment(kTextWithSpace));
+
+ // String: "Hello World"
+ // ^ ^ ^
+ // Bytes: 0 5 15
+ EXPECT_THAT(itr->ResetToTermEndingBefore(0),
+ StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
+ EXPECT_THAT(itr->GetTerm(), IsEmpty());
+
+ EXPECT_THAT(itr->ResetToTermEndingBefore(2),
+ StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
+ EXPECT_THAT(itr->GetTerm(), IsEmpty());
+
+ EXPECT_THAT(itr->ResetToTermEndingBefore(10), IsOkAndHolds(Eq(0)));
+ EXPECT_THAT(itr->GetTerm(), Eq("Hello"));
+
+ EXPECT_THAT(itr->ResetToTermEndingBefore(5), IsOkAndHolds(Eq(0)));
+ EXPECT_THAT(itr->GetTerm(), Eq("Hello"));
+
+ EXPECT_THAT(itr->ResetToTermEndingBefore(15), IsOkAndHolds(Eq(5)));
+ EXPECT_THAT(itr->GetTerm(), Eq(" "));
+
+ EXPECT_THAT(itr->ResetToTermEndingBefore(17), IsOkAndHolds(Eq(5)));
+ EXPECT_THAT(itr->GetTerm(), Eq(" "));
+
+ EXPECT_THAT(itr->ResetToTermEndingBefore(19), IsOkAndHolds(Eq(5)));
+ EXPECT_THAT(itr->GetTerm(), Eq(" "));
+}
+
+TEST_P(ReverseJniLanguageSegmenterTest, ChineseResetToTermBefore) {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ auto language_segmenter,
+ language_segmenter_factory::Create(
+ GetSegmenterOptions(GetLocale(), jni_cache_.get())));
+ // CJKT (Chinese, Japanese, Khmer, Thai) are the 4 main languages that
+ // don't have whitespaces as word delimiter. Chinese
+ constexpr std::string_view kChinese = "我每天走路去上班。";
+ ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<LanguageSegmenter::Iterator> itr,
+ language_segmenter->Segment(kChinese));
+ // String: "我每天走路去上班。"
+ // ^ ^ ^ ^^
+ // Bytes: 0 3 9 15 18
+ EXPECT_THAT(itr->ResetToTermEndingBefore(0),
+ StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
+ EXPECT_THAT(itr->GetTerm(), IsEmpty());
+
+ EXPECT_THAT(itr->ResetToTermEndingBefore(7), IsOkAndHolds(Eq(0)));
+ EXPECT_THAT(itr->GetTerm(), Eq("我"));
+
+ EXPECT_THAT(itr->ResetToTermEndingBefore(19), IsOkAndHolds(Eq(15)));
+ EXPECT_THAT(itr->GetTerm(), Eq("去"));
+}
+
+TEST_P(ReverseJniLanguageSegmenterTest, JapaneseResetToTermBefore) {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ auto language_segmenter,
+ language_segmenter_factory::Create(
+ GetSegmenterOptions(GetLocale(), jni_cache_.get())));
+ // Japanese
+ constexpr std::string_view kJapanese = "私は毎日仕事に歩いています。";
+ ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<LanguageSegmenter::Iterator> itr,
+ language_segmenter->Segment(kJapanese));
+ // String: "私は毎日仕事に歩いています。"
+ // ^ ^ ^ ^ ^ ^ ^ ^ ^
+ // Bytes: 0 3 6 12 18212427 33
+ EXPECT_THAT(itr->ResetToTermEndingBefore(0),
+ StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
+ EXPECT_THAT(itr->GetTerm(), IsEmpty());
+
+ EXPECT_THAT(itr->ResetToTermEndingBefore(33), IsOkAndHolds(Eq(27)));
+ EXPECT_THAT(itr->GetTerm(), Eq("てい"));
+
+ EXPECT_THAT(itr->ResetToTermEndingBefore(7), IsOkAndHolds(Eq(3)));
+ EXPECT_THAT(itr->GetTerm(), Eq("は"));
+}
+
+TEST_P(ReverseJniLanguageSegmenterTest, KhmerResetToTermBefore) {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ auto language_segmenter,
+ language_segmenter_factory::Create(
+ GetSegmenterOptions(GetLocale(), jni_cache_.get())));
+ constexpr std::string_view kKhmer = "ញុំដើរទៅធ្វើការរាល់ថ្ងៃ។";
+ ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<LanguageSegmenter::Iterator> itr,
+ language_segmenter->Segment(kKhmer));
+ // String: "ញុំដើរទៅធ្វើការរាល់ថ្ងៃ។"
+ // ^ ^ ^ ^
+ // Bytes: 0 9 24 45
+ EXPECT_THAT(itr->ResetToTermEndingBefore(0),
+ StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
+ EXPECT_THAT(itr->GetTerm(), IsEmpty());
+
+ EXPECT_THAT(itr->ResetToTermEndingBefore(47), IsOkAndHolds(Eq(24)));
+ EXPECT_THAT(itr->GetTerm(), Eq("ធ្វើការ"));
+
+ EXPECT_THAT(itr->ResetToTermEndingBefore(14), IsOkAndHolds(Eq(0)));
+ EXPECT_THAT(itr->GetTerm(), Eq("ញុំ"));
+}
+
+TEST_P(ReverseJniLanguageSegmenterTest, ThaiResetToTermBefore) {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ auto language_segmenter,
+ language_segmenter_factory::Create(
+ GetSegmenterOptions(GetLocale(), jni_cache_.get())));
+ // Thai
+ constexpr std::string_view kThai = "ฉันเดินไปทำงานทุกวัน";
+ ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<LanguageSegmenter::Iterator> itr,
+ language_segmenter->Segment(kThai));
+ // String: "ฉันเดินไปทำงานทุกวัน"
+ // ^ ^ ^ ^ ^ ^
+ // Bytes: 0 9 21 27 42 51
+ EXPECT_THAT(itr->ResetToTermEndingBefore(0),
+ StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
+ EXPECT_THAT(itr->GetTerm(), IsEmpty());
+
+ EXPECT_THAT(itr->ResetToTermEndingBefore(51), IsOkAndHolds(Eq(42)));
+ EXPECT_THAT(itr->GetTerm(), Eq("ทุก"));
+
+ EXPECT_THAT(itr->ResetToTermEndingBefore(13), IsOkAndHolds(Eq(0)));
+ EXPECT_THAT(itr->GetTerm(), Eq("ฉัน"));
+
+ EXPECT_THAT(itr->ResetToTermEndingBefore(34), IsOkAndHolds(Eq(21)));
+ EXPECT_THAT(itr->GetTerm(), Eq("ไป"));
+}
+
+INSTANTIATE_TEST_SUITE_P(
+    LocaleName, ReverseJniLanguageSegmenterTest,
+    testing::Values(ULOC_US, ULOC_UK, ULOC_CANADA, ULOC_CANADA_FRENCH,
+                    ULOC_FRANCE, ULOC_GERMANY, ULOC_ITALY, ULOC_JAPAN,
+                    ULOC_KOREA, ULOC_SIMPLIFIED_CHINESE,
+                    ULOC_TRADITIONAL_CHINESE,
+                    "es_ES",        // Spanish
+                    "hi_IN",        // Hindi
+                    "th_TH",        // Thai
+                    "lo_LA",        // Lao
+                    "km_KH",        // Khmer
+                    "ar_DZ",        // Arabic
+                    "ru_RU",        // Russian
+                    "pt_PT",        // Portuguese
+                    "en_US_POSIX",  // American English (Computer)
+                    "wrong_locale"  // Will fall back to ICU default locale
+                    ,""             // Will fall back to ICU default locale
+                    ));
+
+} // namespace test_internal
+
+} // namespace lib
+} // namespace icing
diff --git a/icing/tokenization/reverse_jni/reverse-jni-language-segmenter-test.h b/icing/tokenization/reverse_jni/reverse-jni-language-segmenter-test.h
new file mode 100644
index 0000000..64b68ec
--- /dev/null
+++ b/icing/tokenization/reverse_jni/reverse-jni-language-segmenter-test.h
@@ -0,0 +1,46 @@
+// Copyright (C) 2019 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef ICING_TOKENIZATION_REVERSE_JNI_REVERSE_JNI_LANGUAGE_SEGMENTER_TEST_H_
+#define ICING_TOKENIZATION_REVERSE_JNI_REVERSE_JNI_LANGUAGE_SEGMENTER_TEST_H_
+
+#include <jni.h>
+
+#include "icing/jni/jni-cache.h"
+#include "gtest/gtest.h"
+
+extern JNIEnv* g_jenv;
+
+namespace icing {
+namespace lib {
+
+namespace test_internal {
+
+class ReverseJniLanguageSegmenterTest
+    : public testing::TestWithParam<const char*> {
+ protected:
+  // JniCache::Create returns a prvalue, so ValueOrDie() already binds to the
+  // rvalue overload; wrapping the call in std::move is redundant.
+  ReverseJniLanguageSegmenterTest()
+      : jni_cache_(JniCache::Create(g_jenv).ValueOrDie()) {}
+  static std::string GetLocale() { return GetParam(); }
+  std::unique_ptr<JniCache> jni_cache_;
+};
+
+} // namespace test_internal
+
+} // namespace lib
+} // namespace icing
+
+#endif // ICING_TOKENIZATION_REVERSE_JNI_REVERSE_JNI_LANGUAGE_SEGMENTER_TEST_H_
diff --git a/icing/tokenization/reverse_jni/reverse-jni-language-segmenter.cc b/icing/tokenization/reverse_jni/reverse-jni-language-segmenter.cc
new file mode 100644
index 0000000..2256022
--- /dev/null
+++ b/icing/tokenization/reverse_jni/reverse-jni-language-segmenter.cc
@@ -0,0 +1,452 @@
+// Copyright (C) 2019 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "icing/tokenization/reverse_jni/reverse-jni-language-segmenter.h"
+
+#include <cctype>
+#include <memory>
+#include <string>
+#include <string_view>
+
+#include "icing/jni/reverse-jni-break-iterator.h"
+#include "icing/text_classifier/lib3/utils/base/statusor.h"
+#include "icing/absl_ports/canonical_errors.h"
+#include "icing/legacy/core/icing-string-util.h"
+#include "icing/tokenization/language-segmenter.h"
+#include "icing/util/i18n-utils.h"
+#include "icing/util/status-macros.h"
+
+namespace icing {
+namespace lib {
+
+namespace {
+
+// Returns the lead byte of the UTF-8 character that includes the byte at
+// current_byte_index within it.
+int GetUTF8StartPosition(std::string_view text, int current_byte_index) {
+ while (!i18n_utils::IsLeadUtf8Byte(text[current_byte_index])) {
+ --current_byte_index;
+ }
+ return current_byte_index;
+}
+
+class CharacterIterator {
+ public:
+ explicit CharacterIterator(std::string_view text)
+ : CharacterIterator(text, 0, 0) {}
+ CharacterIterator(std::string_view text, int utf8_index, int utf16_index)
+ : text_(text), utf8_index_(utf8_index), utf16_index_(utf16_index) {}
+
+ // Moves from current position to the character that includes the specified
+ // UTF-8 index.
+ // REQUIRES: desired_utf8_index <= text_.length()
+ // desired_utf8_index is allowed to point one index past the end, but no
+ // further.
+ bool AdvanceToUtf8(int desired_utf8_index) {
+ if (desired_utf8_index > text_.length()) {
+ // Enforce the requirement.
+ return false;
+ }
+ // Need to work forwards.
+ while (utf8_index_ < desired_utf8_index) {
+ UChar32 uchar32 =
+ i18n_utils::GetUChar32At(text_.data(), text_.length(), utf8_index_);
+ if (uchar32 == i18n_utils::kInvalidUChar32) {
+        // Unable to retrieve a valid UTF-32 character at the current position.
+ return false;
+ }
+ int utf8_length = i18n_utils::GetUtf8Length(uchar32);
+ if (utf8_index_ + utf8_length > desired_utf8_index) {
+ // Ah! Don't go too far!
+ break;
+ }
+ utf8_index_ += utf8_length;
+ utf16_index_ += i18n_utils::GetUtf16Length(uchar32);
+ }
+ return true;
+ }
+
+ // Moves from current position to the character that includes the specified
+ // UTF-8 index.
+ // REQUIRES: 0 <= desired_utf8_index
+ bool RewindToUtf8(int desired_utf8_index) {
+ if (desired_utf8_index < 0) {
+ // Enforce the requirement.
+ return false;
+ }
+ // Need to work backwards.
+ while (utf8_index_ > desired_utf8_index) {
+ --utf8_index_;
+ utf8_index_ = GetUTF8StartPosition(text_, utf8_index_);
+ if (utf8_index_ < 0) {
+ // Somehow, there wasn't a single UTF-8 lead byte at
+ // requested_byte_index or an earlier byte.
+ return false;
+ }
+ // We've found the start of a unicode char!
+ UChar32 uchar32 =
+ i18n_utils::GetUChar32At(text_.data(), text_.length(), utf8_index_);
+ if (uchar32 == i18n_utils::kInvalidUChar32) {
+        // Unable to retrieve a valid UTF-32 character at the current position.
+ return false;
+ }
+ utf16_index_ -= i18n_utils::GetUtf16Length(uchar32);
+ }
+ return true;
+ }
+
+ // Advances current position to desired_utf16_index.
+ // REQUIRES: desired_utf16_index <= text_.utf16_length()
+ // desired_utf16_index is allowed to point one index past the end, but no
+ // further.
+ bool AdvanceToUtf16(int desired_utf16_index) {
+ while (utf16_index_ < desired_utf16_index) {
+ UChar32 uchar32 =
+ i18n_utils::GetUChar32At(text_.data(), text_.length(), utf8_index_);
+ if (uchar32 == i18n_utils::kInvalidUChar32) {
+        // Unable to retrieve a valid UTF-32 character at the current position.
+ return false;
+ }
+ int utf16_length = i18n_utils::GetUtf16Length(uchar32);
+ if (utf16_index_ + utf16_length > desired_utf16_index) {
+ // Ah! Don't go too far!
+ break;
+ }
+ int utf8_length = i18n_utils::GetUtf8Length(uchar32);
+ if (utf8_index_ + utf8_length > text_.length()) {
+ // Enforce the requirement.
+ return false;
+ }
+ utf8_index_ += utf8_length;
+ utf16_index_ += utf16_length;
+ }
+ return true;
+ }
+
+  // Rewinds current position to desired_utf16_index.
+  // REQUIRES: 0 <= desired_utf16_index
+  bool RewindToUtf16(int desired_utf16_index) {
+    if (desired_utf16_index < 0) {
+      return false;
+    }
+    while (utf16_index_ > desired_utf16_index) {
+      --utf8_index_;
+      utf8_index_ = GetUTF8StartPosition(text_, utf8_index_);
+      if (utf8_index_ < 0) return false;  // No UTF-8 lead byte found.
+      UChar32 uchar32 =
+          i18n_utils::GetUChar32At(text_.data(), text_.length(), utf8_index_);
+      if (uchar32 == i18n_utils::kInvalidUChar32) {
+        // Unable to retrieve a valid UTF-32 character at the current position.
+        return false;
+      }
+      utf16_index_ -= i18n_utils::GetUtf16Length(uchar32);
+    }
+    return true;
+  }
+
+ bool IsValidCharacter() const {
+ // Rule 1: all ASCII terms will be returned.
+    // We know it's an ASCII term by checking the first char.
+ if (i18n_utils::IsAscii(text_[utf8_index_])) {
+ return true;
+ }
+
+ // Rule 2: for non-ASCII terms, only the alphabetic terms are returned.
+ // We know it's an alphabetic term by checking the first unicode character.
+ if (i18n_utils::IsAlphabeticAt(text_, utf8_index_)) {
+ return true;
+ }
+
+ return false;
+ }
+
+ int utf8_index() const { return utf8_index_; }
+ int utf16_index() const { return utf16_index_; }
+
+ private:
+ std::string_view text_;
+ int utf8_index_;
+ int utf16_index_;
+};
+
+} // namespace
+
+class ReverseJniLanguageSegmenterIterator : public LanguageSegmenter::Iterator {
+ public:
+ explicit ReverseJniLanguageSegmenterIterator(
+ std::string_view text,
+ std::unique_ptr<ReverseJniBreakIterator> break_iterator)
+ : break_iterator_(std::move(break_iterator)),
+ text_(text),
+ term_start_(text),
+ term_end_exclusive_(text) {}
+
+ // Advances to the next term. Returns false if it has reached the end.
+ bool Advance() override {
+ // Prerequisite check
+ if (term_end_exclusive_.utf16_index() == ReverseJniBreakIterator::kDone) {
+ return false;
+ }
+
+ if (term_end_exclusive_.utf16_index() == 0) {
+ int first = break_iterator_->First();
+ if (!term_start_.AdvanceToUtf16(first)) {
+        // First is guaranteed to succeed and return a position within bounds. So
+ // the only possible failure could be an invalid sequence. Mark as DONE
+ // and return.
+ MarkAsDone();
+ return false;
+ }
+ } else {
+ term_start_ = term_end_exclusive_;
+ }
+
+ int next_utf16_index_exclusive = break_iterator_->Next();
+ // Reached the end
+ if (next_utf16_index_exclusive == ReverseJniBreakIterator::kDone) {
+ MarkAsDone();
+ return false;
+ }
+ if (!term_end_exclusive_.AdvanceToUtf16(next_utf16_index_exclusive)) {
+      // next_utf16_index_exclusive is guaranteed to be within bounds thanks to
+ // the check for kDone above. So the only possible failure could be an
+ // invalid sequence. Mark as DONE and return.
+ MarkAsDone();
+ return false;
+ }
+
+ // Check if the current term is valid. We consider any term valid if its
+ // first character is valid. If it's not valid, then we need to advance to
+ // the next term.
+ if (term_start_.IsValidCharacter()) {
+ return true;
+ }
+ return Advance();
+ }
+
+  // Returns the current term. It can be called only when Advance() returns
+  // true.
+  std::string_view GetTerm() const override {
+    int term_length =
+        term_end_exclusive_.utf8_index() - term_start_.utf8_index();
+    if (term_length > 0 && std::isspace(static_cast<unsigned char>(text_[term_start_.utf8_index()]))) {  // cast avoids UB for non-ASCII bytes
+      // Rule 3: multiple continuous whitespaces are treated as one.
+      term_length = 1;
+    }
+    return text_.substr(term_start_.utf8_index(), term_length);
+  }
+
+ // Resets the iterator to point to the first term that starts after offset.
+ // GetTerm will now return that term.
+ //
+ // Returns:
+ // On success, the starting position of the first term that starts after
+ // offset.
+ // NOT_FOUND if an error occurred or there are no terms that start after
+ // offset.
+ // INVALID_ARGUMENT if offset is out of bounds for the provided text.
+ // ABORTED if an invalid unicode character is encountered while
+ // traversing the text.
+ libtextclassifier3::StatusOr<int32_t> ResetToTermStartingAfter(
+ int32_t offset) override {
+ if (offset < 0 || offset >= text_.length()) {
+ return absl_ports::InvalidArgumentError(IcingStringUtil::StringPrintf(
+ "Illegal offset provided! Offset %d is not within bounds of string "
+ "of length %zu",
+ offset, text_.length()));
+ }
+ if (term_end_exclusive_.utf16_index() == ReverseJniBreakIterator::kDone) {
+ // We're done. Need to start from the beginning if we're going to reset
+ // properly.
+ term_start_ = CharacterIterator(text_);
+ term_end_exclusive_ = CharacterIterator(text_);
+ }
+
+ // 1. Find the unicode character that contains the byte at offset.
+ CharacterIterator offset_iterator = term_end_exclusive_;
+ bool success = (offset > offset_iterator.utf8_index())
+ ? offset_iterator.AdvanceToUtf8(offset)
+ : offset_iterator.RewindToUtf8(offset);
+ if (!success) {
+ // Offset is guaranteed to be within bounds thanks to the check above. So
+ // the only possible failure could be an invalid sequence. Mark as DONE
+ // and return.
+ MarkAsDone();
+ return absl_ports::AbortedError("Encountered invalid UTF sequence!");
+ }
+
+ // 2. We've got the unicode character containing byte offset. Now, we need
+ // to point to the segment that starts after this character.
+ int following_utf16_index =
+ break_iterator_->Following(offset_iterator.utf16_index());
+ if (following_utf16_index == ReverseJniBreakIterator::kDone) {
+ MarkAsDone();
+ return absl_ports::NotFoundError(IcingStringUtil::StringPrintf(
+ "No segments begin after provided offset %d.", offset));
+ }
+ if (!offset_iterator.AdvanceToUtf16(following_utf16_index)) {
+      // following_utf16_index is guaranteed to be within bounds thanks to the
+ // check for kDone above. So the only possible failure could be an invalid
+ // sequence. Mark as DONE and return.
+ MarkAsDone();
+ return absl_ports::AbortedError("Encountered invalid UTF sequence!");
+ }
+ term_end_exclusive_ = offset_iterator;
+
+ // 3. The term_end_exclusive_ points to the term that we want to return. We
+ // need to Advance so that term_start_ will now point to this term.
+ if (!Advance()) {
+ return absl_ports::NotFoundError(IcingStringUtil::StringPrintf(
+ "No segments begin after provided offset %d.", offset));
+ }
+ return term_start_.utf8_index();
+ }
+
+ // Resets the iterator to point to the first term that ends before offset.
+ // GetTerm will now return that term.
+ //
+ // Returns:
+ // On success, the starting position of the first term that ends before
+ // offset.
+ // NOT_FOUND if an error occurred or there are no terms that end before
+ // offset.
+ // INVALID_ARGUMENT if offset is out of bounds for the provided text.
+ // ABORTED if an invalid unicode character is encountered while
+ // traversing the text.
+ libtextclassifier3::StatusOr<int32_t> ResetToTermEndingBefore(
+ int32_t offset) override {
+ if (offset < 0 || offset >= text_.length()) {
+ return absl_ports::InvalidArgumentError(IcingStringUtil::StringPrintf(
+ "Illegal offset provided! Offset %d is not within bounds of string "
+ "of length %zu",
+ offset, text_.length()));
+ }
+ if (term_end_exclusive_.utf16_index() == ReverseJniBreakIterator::kDone) {
+ // We're done. Need to start from the beginning if we're going to reset
+ // properly.
+ term_start_ = CharacterIterator(text_);
+ term_end_exclusive_ = CharacterIterator(text_);
+ }
+
+ // 1. Find the unicode character that contains the byte at offset.
+ CharacterIterator offset_iterator = term_end_exclusive_;
+ bool success = (offset > offset_iterator.utf8_index())
+ ? offset_iterator.AdvanceToUtf8(offset)
+ : offset_iterator.RewindToUtf8(offset);
+ if (!success) {
+ // Offset is guaranteed to be within bounds thanks to the check above. So
+ // the only possible failure could be an invalid sequence. Mark as DONE
+ // and return.
+ MarkAsDone();
+ return absl_ports::AbortedError(
+ "Could not retrieve valid utf8 character!");
+ }
+
+ // 2. We've got the unicode character containing byte offset. Now, we need
+ // to point to the segment that starts before this character.
+ int starting_utf16_index =
+ break_iterator_->Preceding(offset_iterator.utf16_index());
+ if (starting_utf16_index == ReverseJniBreakIterator::kDone) {
+ // Rewind the end indices.
+ MarkAsDone();
+ return absl_ports::NotFoundError(IcingStringUtil::StringPrintf(
+ "No segments end before provided offset %d.", offset));
+ }
+ if (!offset_iterator.RewindToUtf16(starting_utf16_index)) {
+      // starting_utf16_index is guaranteed to be within bounds thanks to the
+ // check for kDone above. So the only possible failure could be an invalid
+ // sequence. Mark as DONE and return.
+ MarkAsDone();
+ return absl_ports::AbortedError("Encountered invalid UTF sequence!");
+ }
+ term_start_ = offset_iterator;
+
+ // 3. We've correctly set the start index and the iterator currently points
+ // to that position. Now we need to find the correct end position and
+ // advance the iterator to that position.
+ int end_utf16_index = break_iterator_->Next();
+ term_end_exclusive_ = term_start_;
+ term_end_exclusive_.AdvanceToUtf16(end_utf16_index);
+
+ // 4. The start and end indices point to a segment, but we need to ensure
+ // that this segment is 1) valid and 2) ends before offset. Otherwise, we'll
+ // need a segment prior to this one.
+ if (term_end_exclusive_.utf8_index() > offset ||
+ !term_start_.IsValidCharacter()) {
+ return ResetToTermEndingBefore(term_start_.utf8_index());
+ }
+ return term_start_.utf8_index();
+ }
+
+ libtextclassifier3::StatusOr<int32_t> ResetToStart() override {
+ term_start_ = CharacterIterator(text_);
+ term_end_exclusive_ = CharacterIterator(text_);
+ if (!Advance()) {
+ return absl_ports::NotFoundError("");
+ }
+ return term_start_.utf8_index();
+ }
+
+ private:
+ // Ensures that all members are consistent with the 'Done' state.
+ // In the 'Done' state, both term_start_.utf8_index() and
+ // term_end_exclusive_.utf8_index() will point to the same character, causing
+ // GetTerm() to return an empty string and term_start_.utf16_index() and
+ // term_end_exclusive_.utf16_index() will be marked with the kDone value.
+ // break_iterator_ may be in any state.
+ void MarkAsDone() {
+ term_start_ =
+ CharacterIterator(text_, /*utf8_index=*/0,
+ /*utf16_index=*/ReverseJniBreakIterator::kDone);
+ term_end_exclusive_ =
+ CharacterIterator(text_, /*utf8_index=*/0,
+ /*utf16_index=*/ReverseJniBreakIterator::kDone);
+ }
+
+ // All of ReverseJniBreakIterator's functions return UTF-16 boundaries. So
+ // this class needs to maintain state to convert between UTF-16 and UTF-8.
+ std::unique_ptr<ReverseJniBreakIterator> break_iterator_;
+
+ // Text to be segmented
+ std::string_view text_;
+
+ // Index used to track the start position of current term.
+ CharacterIterator term_start_;
+
+ // Index used to track the end position of current term.
+ CharacterIterator term_end_exclusive_;
+};
+
+libtextclassifier3::StatusOr<std::unique_ptr<LanguageSegmenter::Iterator>>
+ReverseJniLanguageSegmenter::Segment(const std::string_view text) const {
+ ICING_ASSIGN_OR_RETURN(
+ std::unique_ptr<ReverseJniBreakIterator> break_iterator,
+ ReverseJniBreakIterator::Create(jni_cache_, text, locale_));
+ return std::make_unique<ReverseJniLanguageSegmenterIterator>(
+ text, std::move(break_iterator));
+}
+
+libtextclassifier3::StatusOr<std::vector<std::string_view>>
+ReverseJniLanguageSegmenter::GetAllTerms(const std::string_view text) const {
+ ICING_ASSIGN_OR_RETURN(std::unique_ptr<LanguageSegmenter::Iterator> iterator,
+ Segment(text));
+ std::vector<std::string_view> terms;
+ while (iterator->Advance()) {
+ terms.push_back(iterator->GetTerm());
+ }
+ return terms;
+}
+
+} // namespace lib
+} // namespace icing
diff --git a/icing/tokenization/reverse_jni/reverse-jni-language-segmenter.h b/icing/tokenization/reverse_jni/reverse-jni-language-segmenter.h
new file mode 100644
index 0000000..f06dac9
--- /dev/null
+++ b/icing/tokenization/reverse_jni/reverse-jni-language-segmenter.h
@@ -0,0 +1,51 @@
+// Copyright (C) 2019 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef ICING_TOKENIZATION_REVERSE_JNI_REVERSE_JNI_LANGUAGE_SEGMENTER_H_
+#define ICING_TOKENIZATION_REVERSE_JNI_REVERSE_JNI_LANGUAGE_SEGMENTER_H_
+
+#include <cstdint>
+#include <memory>
+#include <string>
+#include <string_view>
+#include <vector>
+
+#include "icing/jni/jni-cache.h"
+#include "icing/text_classifier/lib3/utils/base/statusor.h"
+#include "icing/tokenization/language-segmenter.h"
+
+namespace icing {
+namespace lib {
+
+class ReverseJniLanguageSegmenter : public LanguageSegmenter {
+ public:
+ ReverseJniLanguageSegmenter(std::string locale, const JniCache* jni_cache)
+ : locale_(std::move(locale)), jni_cache_(jni_cache) {}
+
+ libtextclassifier3::StatusOr<std::unique_ptr<LanguageSegmenter::Iterator>>
+ Segment(std::string_view text) const override;
+
+ libtextclassifier3::StatusOr<std::vector<std::string_view>> GetAllTerms(
+ std::string_view text) const override;
+
+ private:
+ std::string locale_;
+
+ const JniCache* jni_cache_; // does not own!
+};
+
+} // namespace lib
+} // namespace icing
+
+#endif // ICING_TOKENIZATION_REVERSE_JNI_REVERSE_JNI_LANGUAGE_SEGMENTER_H_
diff --git a/icing/tokenization/simple/space-language-segmenter-factory.cc b/icing/tokenization/simple/space-language-segmenter-factory.cc
new file mode 100644
index 0000000..1cca603
--- /dev/null
+++ b/icing/tokenization/simple/space-language-segmenter-factory.cc
@@ -0,0 +1,41 @@
+// Copyright (C) 2019 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "icing/tokenization/language-segmenter-factory.h"
+#include "icing/tokenization/simple/space-language-segmenter.h"
+#include "icing/util/logging.h"
+
+namespace icing {
+namespace lib {
+
+namespace language_segmenter_factory {
+
+// Creates a language segmenter with the given locale.
+//
+// Returns:
+// A LanguageSegmenter on success
+// INVALID_ARGUMENT if locale string is invalid
+//
+// TODO(samzheng): Figure out if we want to verify locale strings and notify
+// users. Right now illegal locale strings will be ignored by ICU. ICU
+// components will be created with its default locale.
+libtextclassifier3::StatusOr<std::unique_ptr<LanguageSegmenter>> Create(
+ SegmenterOptions) {
+ return std::make_unique<SpaceLanguageSegmenter>();
+}
+
+} // namespace language_segmenter_factory
+
+} // namespace lib
+} // namespace icing
diff --git a/icing/tokenization/space-language-segmenter.cc b/icing/tokenization/simple/space-language-segmenter.cc
index 3d5c7cf..7e301ec 100644
--- a/icing/tokenization/space-language-segmenter.cc
+++ b/icing/tokenization/simple/space-language-segmenter.cc
@@ -12,7 +12,7 @@
// See the License for the specific language governing permissions and
// limitations under the License.
-#include "icing/tokenization/space-language-segmenter.h"
+#include "icing/tokenization/simple/space-language-segmenter.h"
#include <cstdint>
#include <memory>
@@ -40,7 +40,7 @@ class SpaceLanguageSegmenterIterator : public LanguageSegmenter::Iterator {
: text_(text), term_start_index_(0), term_end_index_exclusive_(0) {}
// Advances to the next term. Returns false if it has reached the end.
- bool Advance() {
+ bool Advance() override {
if (term_end_index_exclusive_ >= text_.size() ||
term_start_index_ >= text_.size()) {
// Reached the end
@@ -74,7 +74,7 @@ class SpaceLanguageSegmenterIterator : public LanguageSegmenter::Iterator {
// Returns the current term. It can be called only when Advance() returns
// true.
- std::string_view GetTerm() const {
+ std::string_view GetTerm() const override {
if (text_[term_start_index_] == kASCIISpace) {
// Rule: multiple continuous whitespaces are treated as one.
return std::string_view(&text_[term_start_index_], 1);
@@ -84,7 +84,7 @@ class SpaceLanguageSegmenterIterator : public LanguageSegmenter::Iterator {
}
libtextclassifier3::StatusOr<int32_t> ResetToTermStartingAfter(
- int32_t offset) {
+ int32_t offset) override {
if (offset < 0) {
// Start over from the beginning to find the first term.
term_start_index_ = 0;
@@ -111,7 +111,7 @@ class SpaceLanguageSegmenterIterator : public LanguageSegmenter::Iterator {
}
libtextclassifier3::StatusOr<int32_t> ResetToTermEndingBefore(
- int32_t offset) {
+ int32_t offset) override {
if (offset <= 0 || offset > text_.size()) {
return absl_ports::NotFoundError(IcingStringUtil::StringPrintf(
"No term found in '%s' that ends before offset %d",
@@ -146,6 +146,15 @@ class SpaceLanguageSegmenterIterator : public LanguageSegmenter::Iterator {
return term_start_index_;
}
+ libtextclassifier3::StatusOr<int32_t> ResetToStart() override {
+ term_start_index_ = 0;
+ term_end_index_exclusive_ = 0;
+ if (!Advance()) {
+ return absl_ports::NotFoundError("");
+ }
+ return term_start_index_;
+ }
+
private:
// Return the start offset of the term starting right before the given offset.
libtextclassifier3::StatusOr<int32_t> GetTermStartingBefore(int32_t offset) {
diff --git a/icing/tokenization/space-language-segmenter.h b/icing/tokenization/simple/space-language-segmenter.h
index 73f8f30..de0a6d3 100644
--- a/icing/tokenization/space-language-segmenter.h
+++ b/icing/tokenization/simple/space-language-segmenter.h
@@ -12,8 +12,8 @@
// See the License for the specific language governing permissions and
// limitations under the License.
-#ifndef ICING_TOKENIZATION_SPACE_LANGUAGE_SEGMENTER_H_
-#define ICING_TOKENIZATION_SPACE_LANGUAGE_SEGMENTER_H_
+#ifndef ICING_TOKENIZATION_SIMPLE_SPACE_LANGUAGE_SEGMENTER_H_
+#define ICING_TOKENIZATION_SIMPLE_SPACE_LANGUAGE_SEGMENTER_H_
#include <cstdint>
#include <memory>
@@ -55,4 +55,4 @@ class SpaceLanguageSegmenter : public LanguageSegmenter {
} // namespace lib
} // namespace icing
-#endif // ICING_TOKENIZATION_SPACE_LANGUAGE_SEGMENTER_H_
+#endif // ICING_TOKENIZATION_SIMPLE_SPACE_LANGUAGE_SEGMENTER_H_
diff --git a/icing/tokenization/space-language-segmenter_test.cc b/icing/tokenization/simple/space-language-segmenter_test.cc
index ef6f54f..8ed38b2 100644
--- a/icing/tokenization/space-language-segmenter_test.cc
+++ b/icing/tokenization/simple/space-language-segmenter_test.cc
@@ -28,24 +28,21 @@ using ::testing::Eq;
using ::testing::IsEmpty;
TEST(SpaceLanguageSegmenterTest, EmptyText) {
- ICING_ASSERT_OK_AND_ASSIGN(
- auto language_segmenter,
- language_segmenter_factory::Create(language_segmenter_factory::SPACE));
+ ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter,
+ language_segmenter_factory::Create());
EXPECT_THAT(language_segmenter->GetAllTerms(""), IsOkAndHolds(IsEmpty()));
}
TEST(SpaceLanguageSegmenterTest, SimpleText) {
- ICING_ASSERT_OK_AND_ASSIGN(
- auto language_segmenter,
- language_segmenter_factory::Create(language_segmenter_factory::SPACE));
+ ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter,
+ language_segmenter_factory::Create());
EXPECT_THAT(language_segmenter->GetAllTerms("Hello World"),
IsOkAndHolds(ElementsAre("Hello", " ", "World")));
}
TEST(SpaceLanguageSegmenterTest, Punctuation) {
- ICING_ASSERT_OK_AND_ASSIGN(
- auto language_segmenter,
- language_segmenter_factory::Create(language_segmenter_factory::SPACE));
+ ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter,
+ language_segmenter_factory::Create());
EXPECT_THAT(language_segmenter->GetAllTerms("Hello, World!!!"),
IsOkAndHolds(ElementsAre("Hello,", " ", "World!!!")));
@@ -58,9 +55,8 @@ TEST(SpaceLanguageSegmenterTest, Punctuation) {
}
TEST(SpaceLanguageSegmenterTest, Alphanumeric) {
- ICING_ASSERT_OK_AND_ASSIGN(
- auto language_segmenter,
- language_segmenter_factory::Create(language_segmenter_factory::SPACE));
+ ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter,
+ language_segmenter_factory::Create());
// Alphanumeric terms are allowed
EXPECT_THAT(language_segmenter->GetAllTerms("Se7en A4 3a"),
@@ -68,9 +64,8 @@ TEST(SpaceLanguageSegmenterTest, Alphanumeric) {
}
TEST(SpaceLanguageSegmenterTest, Number) {
- ICING_ASSERT_OK_AND_ASSIGN(
- auto language_segmenter,
- language_segmenter_factory::Create(language_segmenter_factory::SPACE));
+ ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter,
+ language_segmenter_factory::Create());
// Alphanumeric terms are allowed
EXPECT_THAT(
@@ -85,9 +80,8 @@ TEST(SpaceLanguageSegmenterTest, Number) {
}
TEST(SpaceLanguageSegmenterTest, ContinuousWhitespaces) {
- ICING_ASSERT_OK_AND_ASSIGN(
- auto language_segmenter,
- language_segmenter_factory::Create(language_segmenter_factory::SPACE));
+ ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter,
+ language_segmenter_factory::Create());
// Multiple continuous whitespaces are treated as one.
const int kNumSeparators = 256;
@@ -98,9 +92,8 @@ TEST(SpaceLanguageSegmenterTest, ContinuousWhitespaces) {
}
TEST(SpaceLanguageSegmenterTest, NotCopyStrings) {
- ICING_ASSERT_OK_AND_ASSIGN(
- auto language_segmenter,
- language_segmenter_factory::Create(language_segmenter_factory::SPACE));
+ ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter,
+ language_segmenter_factory::Create());
// Validates that the input strings are not copied
const std::string text = "Hello World";
const char* word1_address = text.c_str();
diff --git a/icing/tokenization/tokenizer.h b/icing/tokenization/tokenizer.h
index 3ad61fb..38c4745 100644
--- a/icing/tokenization/tokenizer.h
+++ b/icing/tokenization/tokenizer.h
@@ -85,6 +85,8 @@ class Tokenizer {
// // "foo".
// PrintToken(iterator.GetToken()); // prints "foo"
virtual bool ResetToTokenBefore(int32_t offset) { return false; }
+
+ virtual bool ResetToStart() { return false; }
};
// Tokenizes the input text. The input text should outlive the returned
diff --git a/icing/transform/icu-normalizer_test.cc b/icing/transform/icu-normalizer_test.cc
deleted file mode 100644
index 5e822d2..0000000
--- a/icing/transform/icu-normalizer_test.cc
+++ /dev/null
@@ -1,179 +0,0 @@
-// Copyright (C) 2019 Google LLC
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include <memory>
-
-#include "gmock/gmock.h"
-#include "gtest/gtest.h"
-#include "icing/icu-data-file-helper.h"
-#include "icing/testing/common-matchers.h"
-#include "icing/testing/icu-i18n-test-utils.h"
-#include "icing/testing/test-data.h"
-#include "icing/transform/normalizer-factory.h"
-#include "icing/transform/normalizer.h"
-
-namespace icing {
-namespace lib {
-namespace {
-using ::testing::Eq;
-
-class IcuNormalizerTest : public testing::Test {
- protected:
- void SetUp() override {
- ICING_ASSERT_OK(
- // File generated via icu_data_file rule in //icing/BUILD.
- icu_data_file_helper::SetUpICUDataFile(
- GetTestFilePath("icing/icu.dat")));
-
- ICING_ASSERT_OK_AND_ASSIGN(
- normalizer_,
- normalizer_factory::Create(normalizer_factory::NormalizerType::ICU4C,
- /*max_term_byte_size=*/1024));
- }
-
- std::unique_ptr<Normalizer> normalizer_;
-};
-
-TEST_F(IcuNormalizerTest, Creation) {
- EXPECT_THAT(
- normalizer_factory::Create(normalizer_factory::NormalizerType::ICU4C,
- /*max_term_byte_size=*/5),
- IsOk());
- EXPECT_THAT(
- normalizer_factory::Create(normalizer_factory::NormalizerType::ICU4C,
- /*max_term_byte_size=*/0),
- StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
- EXPECT_THAT(
- normalizer_factory::Create(normalizer_factory::NormalizerType::ICU4C,
- /*max_term_byte_size=*/-1),
- StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
-}
-
-// Strings that are already normalized won't change if normalized again.
-TEST_F(IcuNormalizerTest, AlreadyNormalized) {
- EXPECT_THAT(normalizer_->NormalizeTerm(""), Eq(""));
- EXPECT_THAT(normalizer_->NormalizeTerm("hello world"), Eq("hello world"));
- EXPECT_THAT(normalizer_->NormalizeTerm("你好"), Eq("你好"));
- EXPECT_THAT(normalizer_->NormalizeTerm("キャンパス"), Eq("キャンパス"));
- EXPECT_THAT(normalizer_->NormalizeTerm("안녕하세요"), Eq("안녕하세요"));
-}
-
-TEST_F(IcuNormalizerTest, UppercaseToLowercase) {
- EXPECT_THAT(normalizer_->NormalizeTerm("MDI"), Eq("mdi"));
- EXPECT_THAT(normalizer_->NormalizeTerm("Icing"), Eq("icing"));
-}
-
-TEST_F(IcuNormalizerTest, LatinLetterRemoveAccent) {
- EXPECT_THAT(normalizer_->NormalizeTerm("Zürich"), Eq("zurich"));
- EXPECT_THAT(normalizer_->NormalizeTerm("après-midi"), Eq("apres-midi"));
- EXPECT_THAT(normalizer_->NormalizeTerm("Buenos días"), Eq("buenos dias"));
- EXPECT_THAT(normalizer_->NormalizeTerm("āăąḃḅḇčćç"), Eq("aaabbbccc"));
- EXPECT_THAT(normalizer_->NormalizeTerm("ÁȦÄḂḄḆĆČḈ"), Eq("aaabbbccc"));
-}
-
-// Accent / diacritic marks won't be removed in non-latin chars, e.g. in
-// Japanese and Greek
-TEST_F(IcuNormalizerTest, NonLatinLetterNotRemoveAccent) {
- EXPECT_THAT(normalizer_->NormalizeTerm("ダヂヅデド"), Eq("ダヂヅデド"));
- EXPECT_THAT(normalizer_->NormalizeTerm("kαλημέρα"), Eq("kαλημέρα"));
-}
-
-TEST_F(IcuNormalizerTest, FullWidthCharsToASCII) {
- // Full-width punctuation to ASCII punctuation
- EXPECT_THAT(normalizer_->NormalizeTerm("。,!?:”"), Eq(".,!?:\""));
- // 0xff10 is the full-width number 0
- EXPECT_THAT(normalizer_->NormalizeTerm(UCharToString(0xff10)), Eq("0"));
- // 0xff21 is the full-width letter A
- EXPECT_THAT(normalizer_->NormalizeTerm(UCharToString(0xff21)), Eq("a"));
- // 0xff41 is the full-width letter a
- EXPECT_THAT(normalizer_->NormalizeTerm(UCharToString(0xff41)), Eq("a"));
-}
-
-// For Katakana, each character is normalized to its full-width version.
-TEST_F(IcuNormalizerTest, KatakanaHalfWidthToFullWidth) {
- EXPECT_THAT(normalizer_->NormalizeTerm("カ"), Eq("カ"));
- EXPECT_THAT(normalizer_->NormalizeTerm("ォ"), Eq("ォ"));
- EXPECT_THAT(normalizer_->NormalizeTerm("サ"), Eq("サ"));
- EXPECT_THAT(normalizer_->NormalizeTerm("ホ"), Eq("ホ"));
-}
-
-TEST_F(IcuNormalizerTest, HiraganaToKatakana) {
- EXPECT_THAT(normalizer_->NormalizeTerm("あいうえお"), Eq("アイウエオ"));
- EXPECT_THAT(normalizer_->NormalizeTerm("かきくけこ"), Eq("カキクケコ"));
- EXPECT_THAT(normalizer_->NormalizeTerm("ばびぶべぼ"), Eq("バビブベボ"));
- EXPECT_THAT(normalizer_->NormalizeTerm("がぎぐげご"), Eq("ガギグゲゴ"));
- EXPECT_THAT(normalizer_->NormalizeTerm("ぎゃぎゅぎょ"), Eq("ギャギュギョ"));
-}
-
-TEST_F(IcuNormalizerTest, SuperscriptAndSubscriptToASCII) {
- EXPECT_THAT(normalizer_->NormalizeTerm("⁹"), Eq("9"));
- EXPECT_THAT(normalizer_->NormalizeTerm("₉"), Eq("9"));
-}
-
-TEST_F(IcuNormalizerTest, CircledCharsToASCII) {
- EXPECT_THAT(normalizer_->NormalizeTerm("①"), Eq("1"));
- EXPECT_THAT(normalizer_->NormalizeTerm("Ⓐ"), Eq("a"));
-}
-
-TEST_F(IcuNormalizerTest, RotatedCharsToASCII) {
- EXPECT_THAT(normalizer_->NormalizeTerm("︷"), Eq("{"));
- EXPECT_THAT(normalizer_->NormalizeTerm("︸"), Eq("}"));
-}
-
-TEST_F(IcuNormalizerTest, SquaredCharsToASCII) {
- EXPECT_THAT(normalizer_->NormalizeTerm("㌀"), Eq("アパート"));
-}
-
-TEST_F(IcuNormalizerTest, FractionsToASCII) {
- EXPECT_THAT(normalizer_->NormalizeTerm("¼"), Eq(" 1/4"));
- EXPECT_THAT(normalizer_->NormalizeTerm("⅚"), Eq(" 5/6"));
-}
-
-TEST_F(IcuNormalizerTest, Truncate) {
- {
- ICING_ASSERT_OK_AND_ASSIGN(
- auto normalizer,
- normalizer_factory::Create(normalizer_factory::NormalizerType::ICU4C,
- /*max_term_byte_size=*/5));
-
- // Won't be truncated
- EXPECT_THAT(normalizer->NormalizeTerm("hi"), Eq("hi"));
- EXPECT_THAT(normalizer->NormalizeTerm("hello"), Eq("hello"));
-
- // Truncated to length 5.
- EXPECT_THAT(normalizer->NormalizeTerm("hello!"), Eq("hello"));
-
- // Each Japanese character has 3 bytes, so truncating to length 5 results in
- // only 1 character.
- EXPECT_THAT(normalizer->NormalizeTerm("キャンパス"), Eq("キ"));
-
- // Each Greek character has 2 bytes, so truncating to length 5 results in 2
- // character.
- EXPECT_THAT(normalizer->NormalizeTerm("αβγδε"), Eq("αβ"));
- }
-
- {
- ICING_ASSERT_OK_AND_ASSIGN(
- auto normalizer,
- normalizer_factory::Create(normalizer_factory::NormalizerType::ICU4C,
- /*max_term_byte_size=*/2));
- // The Japanese character has 3 bytes, truncating it results in an empty
- // string.
- EXPECT_THAT(normalizer->NormalizeTerm("キ"), Eq(""));
- }
-}
-
-} // namespace
-} // namespace lib
-} // namespace icing
diff --git a/icing/transform/icu/icu-normalizer-factory.cc b/icing/transform/icu/icu-normalizer-factory.cc
new file mode 100644
index 0000000..493aeb5
--- /dev/null
+++ b/icing/transform/icu/icu-normalizer-factory.cc
@@ -0,0 +1,52 @@
+// Copyright (C) 2019 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef ICING_TRANSFORM_ICU_ICU_NORMALIZER_FACTORY_H_
+#define ICING_TRANSFORM_ICU_ICU_NORMALIZER_FACTORY_H_
+
+#include <memory>
+#include <string_view>
+
+#include "icing/text_classifier/lib3/utils/base/statusor.h"
+#include "icing/absl_ports/canonical_errors.h"
+#include "icing/transform/icu/icu-normalizer.h"
+#include "icing/transform/normalizer.h"
+
+namespace icing {
+namespace lib {
+
+namespace normalizer_factory {
+
+// Creates an ICU-based normalizer. max_term_byte_size enforces the max size of
+// text after normalization; text will be truncated if it exceeds the max size.
+//
+// Returns:
+// A normalizer on success
+// INVALID_ARGUMENT if max_term_byte_size <= 0
+// INTERNAL_ERROR on errors
+libtextclassifier3::StatusOr<std::unique_ptr<Normalizer>> Create(
+ int max_term_byte_size) {
+ if (max_term_byte_size <= 0) {
+ return absl_ports::InvalidArgumentError(
+ "max_term_byte_size must be greater than zero.");
+ }
+ return IcuNormalizer::Create(max_term_byte_size);
+}
+
+} // namespace normalizer_factory
+
+} // namespace lib
+} // namespace icing
+
+#endif // ICING_TRANSFORM_ICU_ICU_NORMALIZER_FACTORY_H_
diff --git a/icing/transform/icu-normalizer.cc b/icing/transform/icu/icu-normalizer.cc
index c7cfd99..0bb8326 100644
--- a/icing/transform/icu-normalizer.cc
+++ b/icing/transform/icu/icu-normalizer.cc
@@ -12,7 +12,7 @@
// See the License for the specific language governing permissions and
// limitations under the License.
-#include "icing/transform/icu-normalizer.h"
+#include "icing/transform/icu/icu-normalizer.h"
#include <cctype>
#include <memory>
@@ -24,7 +24,7 @@
#include "icing/absl_ports/canonical_errors.h"
#include "icing/absl_ports/str_cat.h"
#include "icing/transform/normalizer.h"
-#include "icing/util/icu-i18n-utils.h"
+#include "icing/util/i18n-utils.h"
#include "icing/util/logging.h"
#include "icing/util/status-macros.h"
#include "unicode/umachine.h"
@@ -55,6 +55,46 @@ constexpr UChar kTransformRulesUtf16[] =
constexpr int kTransformRulesLength =
sizeof(kTransformRulesUtf16) / sizeof(kTransformRulesUtf16[0]) - 1;
+// Transforms a Unicode character with diacritics to its counterpart in ASCII
+// range. E.g. "ü" -> "u". Result will be set to char_out. Returns true if
+// the transformation is successful.
+//
+// NOTE: According to our convention this function should have returned
+// StatusOr<char>. However, this function is performance-sensitive because it
+// could be called on every Latin character in normalization, so we make it
+// return a bool here to save a bit more time and memory.
+bool DiacriticCharToAscii(const UNormalizer2* normalizer2, UChar32 uchar32_in,
+ char* char_out) {
+ if (i18n_utils::IsAscii(uchar32_in)) {
+ // The Unicode character is within ASCII range
+ if (char_out != nullptr) {
+ *char_out = uchar32_in;
+ }
+ return true;
+ }
+
+ // Maximum number of pieces a Unicode character can be decomposed into.
+ // TODO(samzheng) figure out if this number is proper.
+ constexpr int kDecompositionBufferCapacity = 5;
+
+ // A buffer used to store Unicode decomposition mappings of only one
+ // character.
+ UChar decomposition_buffer[kDecompositionBufferCapacity];
+
+ // Decomposes the Unicode character, trying to get an ASCII char and some
+ // diacritic chars.
+ UErrorCode status = U_ZERO_ERROR;
+ if (unorm2_getDecomposition(normalizer2, uchar32_in, &decomposition_buffer[0],
+ kDecompositionBufferCapacity, &status) > 0 &&
+ !U_FAILURE(status) && i18n_utils::IsAscii(decomposition_buffer[0])) {
+ if (char_out != nullptr) {
+ *char_out = decomposition_buffer[0];
+ }
+ return true;
+ }
+ return false;
+}
+
} // namespace
// Creates a IcuNormalizer with a valid TermTransformer instance.
@@ -96,11 +136,9 @@ std::string IcuNormalizer::NormalizeTerm(const std::string_view term) const {
// into an ASCII char. Since the term is tokenized, we know that the whole
// term can be transformed into ASCII if the first character can.
UChar32 first_uchar32 =
- icu_i18n_utils::GetUChar32At(term.data(), term.length(), 0);
- if (normalizer2 != nullptr &&
- first_uchar32 != icu_i18n_utils::kInvalidUChar32 &&
- icu_i18n_utils::DiacriticCharToAscii(normalizer2, first_uchar32,
- nullptr)) {
+ i18n_utils::GetUChar32At(term.data(), term.length(), 0);
+ if (normalizer2 != nullptr && first_uchar32 != i18n_utils::kInvalidUChar32 &&
+ DiacriticCharToAscii(normalizer2, first_uchar32, nullptr)) {
// This is a faster method to normalize Latin terms.
normalized_text = NormalizeLatin(normalizer2, term);
} else {
@@ -108,7 +146,7 @@ std::string IcuNormalizer::NormalizeTerm(const std::string_view term) const {
}
if (normalized_text.length() > max_term_byte_size_) {
- icu_i18n_utils::SafeTruncateUtf8(&normalized_text, max_term_byte_size_);
+ i18n_utils::SafeTruncateUtf8(&normalized_text, max_term_byte_size_);
}
return normalized_text;
@@ -119,19 +157,17 @@ std::string IcuNormalizer::NormalizeLatin(const UNormalizer2* normalizer2,
std::string result;
result.reserve(term.length());
for (int i = 0; i < term.length(); i++) {
- if (icu_i18n_utils::IsAscii(term[i])) {
+ if (i18n_utils::IsAscii(term[i])) {
result.push_back(std::tolower(term[i]));
- } else if (icu_i18n_utils::IsLeadUtf8Byte(term[i])) {
- UChar32 uchar32 =
- icu_i18n_utils::GetUChar32At(term.data(), term.length(), i);
- if (uchar32 == icu_i18n_utils::kInvalidUChar32) {
+ } else if (i18n_utils::IsLeadUtf8Byte(term[i])) {
+ UChar32 uchar32 = i18n_utils::GetUChar32At(term.data(), term.length(), i);
+ if (uchar32 == i18n_utils::kInvalidUChar32) {
ICING_LOG(WARNING) << "Unable to get uchar32 from " << term
<< " at position" << i;
continue;
}
char ascii_char;
- if (icu_i18n_utils::DiacriticCharToAscii(normalizer2, uchar32,
- &ascii_char)) {
+ if (DiacriticCharToAscii(normalizer2, uchar32, &ascii_char)) {
result.push_back(std::tolower(ascii_char));
} else {
// We don't know how to transform / decompose this Unicode character, it
@@ -139,7 +175,7 @@ std::string IcuNormalizer::NormalizeLatin(const UNormalizer2* normalizer2,
// Latin characters. This shouldn't happen if input term is properly
// tokenized. We handle it here in case there're something wrong with
// the tokenizers.
- int utf8_length = icu_i18n_utils::GetUtf8Length(uchar32);
+ int utf8_length = i18n_utils::GetUtf8Length(uchar32);
absl_ports::StrAppend(&result, term.substr(i, utf8_length));
}
}
@@ -175,7 +211,7 @@ IcuNormalizer::TermTransformer::~TermTransformer() {
std::string IcuNormalizer::TermTransformer::Transform(
const std::string_view term) const {
- auto utf16_term_or = icu_i18n_utils::Utf8ToUtf16(term);
+ auto utf16_term_or = i18n_utils::Utf8ToUtf16(term);
if (!utf16_term_or.ok()) {
ICING_VLOG(0) << "Failed to convert UTF8 term '" << term << "' to UTF16";
return std::string(term);
@@ -216,7 +252,7 @@ std::string IcuNormalizer::TermTransformer::Transform(
return std::string(term);
}
- auto utf8_term_or = icu_i18n_utils::Utf16ToUtf8(utf16_term);
+ auto utf8_term_or = i18n_utils::Utf16ToUtf8(utf16_term);
if (!utf8_term_or.ok()) {
ICING_VLOG(0) << "Failed to convert UTF16 term '" << term << "' to UTF8";
return std::string(term);
diff --git a/icing/transform/icu-normalizer.h b/icing/transform/icu/icu-normalizer.h
index 86d4a64..f20a9fb 100644
--- a/icing/transform/icu-normalizer.h
+++ b/icing/transform/icu/icu-normalizer.h
@@ -12,8 +12,8 @@
// See the License for the specific language governing permissions and
// limitations under the License.
-#ifndef ICING_TRANSFORM_ICU_NORMALIZER_H_
-#define ICING_TRANSFORM_ICU_NORMALIZER_H_
+#ifndef ICING_TRANSFORM_ICU_ICU_NORMALIZER_H_
+#define ICING_TRANSFORM_ICU_ICU_NORMALIZER_H_
#include <memory>
#include <string>
@@ -102,4 +102,4 @@ class IcuNormalizer : public Normalizer {
} // namespace lib
} // namespace icing
-#endif // ICING_TRANSFORM_ICU_NORMALIZER_H_
+#endif // ICING_TRANSFORM_ICU_ICU_NORMALIZER_H_
diff --git a/icing/transform/icu-normalizer_benchmark.cc b/icing/transform/icu/icu-normalizer_benchmark.cc
index 2fce32b..b037538 100644
--- a/icing/transform/icu-normalizer_benchmark.cc
+++ b/icing/transform/icu/icu-normalizer_benchmark.cc
@@ -14,7 +14,7 @@
#include "testing/base/public/benchmark.h"
#include "gmock/gmock.h"
-#include "icing/icu-data-file-helper.h"
+#include "icing/helpers/icu/icu-data-file-helper.h"
#include "icing/testing/common-matchers.h"
#include "icing/testing/test-data.h"
#include "icing/transform/normalizer-factory.h"
@@ -22,9 +22,9 @@
// Run on a Linux workstation:
// $ blaze build -c opt --dynamic_mode=off --copt=-gmlt
-// //icing/transform:icu-normalizer_benchmark
+// //icing/transform/icu:icu-normalizer_benchmark
//
-// $ blaze-bin/icing/transform/icu-normalizer_benchmark
+// $ blaze-bin/icing/transform/icu/icu-normalizer_benchmark
// --benchmarks=all
//
// Run on an Android device:
@@ -33,9 +33,10 @@
//
// $ blaze build --copt="-DGOOGLE_COMMANDLINEFLAGS_FULL_API=1"
// --config=android_arm64 -c opt --dynamic_mode=off --copt=-gmlt
-// //icing/transform:icu-normalizer_benchmark
+// //icing/transform/icu:icu-normalizer_benchmark
//
-// $ adb push blaze-bin/icing/transform/icu-normalizer_benchmark
+// $ adb push
+// blaze-bin/icing/transform/icu/icu-normalizer_benchmark
// /data/local/tmp/
//
// $ adb shell /data/local/tmp/icu-normalizer_benchmark --benchmarks=all
@@ -60,7 +61,7 @@ void BM_NormalizeUppercase(benchmark::State& state) {
ICING_ASSERT_OK_AND_ASSIGN(
std::unique_ptr<Normalizer> normalizer,
normalizer_factory::Create(
- normalizer_factory::NormalizerType::ICU4C,
+
/*max_term_byte_size=*/std::numeric_limits<int>::max()));
std::string input_string(state.range(0), 'A');
@@ -94,7 +95,7 @@ void BM_NormalizeAccent(benchmark::State& state) {
ICING_ASSERT_OK_AND_ASSIGN(
std::unique_ptr<Normalizer> normalizer,
normalizer_factory::Create(
- normalizer_factory::NormalizerType::ICU4C,
+
/*max_term_byte_size=*/std::numeric_limits<int>::max()));
std::string input_string;
@@ -132,7 +133,7 @@ void BM_NormalizeHiragana(benchmark::State& state) {
ICING_ASSERT_OK_AND_ASSIGN(
std::unique_ptr<Normalizer> normalizer,
normalizer_factory::Create(
- normalizer_factory::NormalizerType::ICU4C,
+
/*max_term_byte_size=*/std::numeric_limits<int>::max()));
std::string input_string;
diff --git a/icing/transform/icu/icu-normalizer_test.cc b/icing/transform/icu/icu-normalizer_test.cc
new file mode 100644
index 0000000..83fa972
--- /dev/null
+++ b/icing/transform/icu/icu-normalizer_test.cc
@@ -0,0 +1,237 @@
+// Copyright (C) 2019 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <memory>
+
+#include "gmock/gmock.h"
+#include "gtest/gtest.h"
+#include "icing/helpers/icu/icu-data-file-helper.h"
+#include "icing/testing/common-matchers.h"
+#include "icing/testing/icu-i18n-test-utils.h"
+#include "icing/testing/test-data.h"
+#include "icing/transform/normalizer-factory.h"
+#include "icing/transform/normalizer.h"
+
+namespace icing {
+namespace lib {
+namespace {
+using ::testing::Eq;
+
+class IcuNormalizerTest : public testing::Test {
+ protected:
+ void SetUp() override {
+ ICING_ASSERT_OK(
+ // File generated via icu_data_file rule in //icing/BUILD.
+ icu_data_file_helper::SetUpICUDataFile(
+ GetTestFilePath("icing/icu.dat")));
+
+ ICING_ASSERT_OK_AND_ASSIGN(normalizer_, normalizer_factory::Create(
+ /*max_term_byte_size=*/1024));
+ }
+
+ std::unique_ptr<Normalizer> normalizer_;
+};
+
+TEST_F(IcuNormalizerTest, Creation) {
+ EXPECT_THAT(normalizer_factory::Create(
+ /*max_term_byte_size=*/5),
+ IsOk());
+ EXPECT_THAT(normalizer_factory::Create(
+ /*max_term_byte_size=*/0),
+ StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
+ EXPECT_THAT(normalizer_factory::Create(
+ /*max_term_byte_size=*/-1),
+ StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
+}
+
+// Strings that are already normalized won't change if normalized again.
+TEST_F(IcuNormalizerTest, AlreadyNormalized) {
+ EXPECT_THAT(normalizer_->NormalizeTerm(""), Eq(""));
+ EXPECT_THAT(normalizer_->NormalizeTerm("hello world"), Eq("hello world"));
+ EXPECT_THAT(normalizer_->NormalizeTerm("你好"), Eq("你好"));
+ EXPECT_THAT(normalizer_->NormalizeTerm("キャンパス"), Eq("キャンパス"));
+ EXPECT_THAT(normalizer_->NormalizeTerm("안녕하세요"), Eq("안녕하세요"));
+}
+
+TEST_F(IcuNormalizerTest, UppercaseToLowercase) {
+ EXPECT_THAT(normalizer_->NormalizeTerm("MDI"), Eq("mdi"));
+ EXPECT_THAT(normalizer_->NormalizeTerm("Icing"), Eq("icing"));
+}
+
+TEST_F(IcuNormalizerTest, LatinLetterRemoveAccent) {
+ EXPECT_THAT(normalizer_->NormalizeTerm("Zürich"), Eq("zurich"));
+ EXPECT_THAT(normalizer_->NormalizeTerm("après-midi"), Eq("apres-midi"));
+ EXPECT_THAT(normalizer_->NormalizeTerm("Buenos días"), Eq("buenos dias"));
+ EXPECT_THAT(normalizer_->NormalizeTerm("ÀÁÂÃÄÅĀĂĄḀḁàáâãäåāăą"),
+ Eq("aaaaaaaaaaaaaaaaaaaa"));
+ EXPECT_THAT(normalizer_->NormalizeTerm("ḂḄḆḃḅḇ"), Eq("bbbbbb"));
+ EXPECT_THAT(normalizer_->NormalizeTerm("ÇĆĈĊČḈḉćĉċčç"), Eq("cccccccccccc"));
+ EXPECT_THAT(normalizer_->NormalizeTerm("ÐĎĐḊḌḎḐḒḋḍḏḑḓďđ"),
+ Eq("ddddddddddddddd"));
+ EXPECT_THAT(normalizer_->NormalizeTerm("ÈÉÊËĒĔĖĘḔḖḘḚḜḕḗḙḛḝèéêëēĕėęě"),
+ Eq("eeeeeeeeeeeeeeeeeeeeeeeeeee"));
+ EXPECT_THAT(normalizer_->NormalizeTerm("Ḟḟ"), Eq("ff"));
+ EXPECT_THAT(normalizer_->NormalizeTerm("ĜĞĠĢḠḡĝğġģ"), Eq("gggggggggg"));
+ EXPECT_THAT(normalizer_->NormalizeTerm("ĤḢḤḦḨḪḣḥḧḩḫĥẖ"),
+ Eq("hhhhhhhhhhhhh"));
+ EXPECT_THAT(normalizer_->NormalizeTerm("ÌÍÎÏĨĪĬḬḭḯìíîïĩīĭ"),
+ Eq("iiiiiiiiiiiiiiiii"));
+ EXPECT_THAT(normalizer_->NormalizeTerm("Ĵĵ"), Eq("jj"));
+ EXPECT_THAT(normalizer_->NormalizeTerm("ĶḰḲḴḵḱḳķ"), Eq("kkkkkkkk"));
+ EXPECT_THAT(normalizer_->NormalizeTerm("ĹĻĽḶḸḼḷḹḻḽĺļľ"),
+ Eq("lllllllllllll"));
+ EXPECT_THAT(normalizer_->NormalizeTerm("ḾṀṂḿṁṃ"), Eq("mmmmmm"));
+ EXPECT_THAT(normalizer_->NormalizeTerm("ÑŃŅŇṄṆṈṊṅṇṉṋñńņň"),
+ Eq("nnnnnnnnnnnnnnnn"));
+ EXPECT_THAT(normalizer_->NormalizeTerm("ŌŎŐÒÓÔÕÖṌṎṐṒṍṏṑṓòóôõöōŏő"),
+ Eq("oooooooooooooooooooooooo"));
+ EXPECT_THAT(normalizer_->NormalizeTerm("ṔṖṕṗ"), Eq("pppp"));
+ EXPECT_THAT(normalizer_->NormalizeTerm("ŔŖŘṘṚṜṞṙṛṝṟŕŗř"),
+ Eq("rrrrrrrrrrrrrr"));
+ EXPECT_THAT(normalizer_->NormalizeTerm("ŚŜŞŠȘṠṢṤṦṨṡṣṥṧṩșśŝşš"),
+ Eq("ssssssssssssssssssss"));
+ EXPECT_THAT(normalizer_->NormalizeTerm("ŢŤȚṪṬṮṰṫṭṯṱțţť"),
+ Eq("tttttttttttttt"));
+ EXPECT_THAT(normalizer_->NormalizeTerm("ŨŪŬÙÚÛÜṲṴṶṸṺṳṵṷṹṻùúûüũūŭ"),
+ Eq("uuuuuuuuuuuuuuuuuuuuuuuu"));
+ EXPECT_THAT(normalizer_->NormalizeTerm("ṼṾṽṿ"), Eq("vvvv"));
+ EXPECT_THAT(normalizer_->NormalizeTerm("ŴẀẂẄẆẈẁẃẅẇẉŵ"), Eq("wwwwwwwwwwww"));
+ EXPECT_THAT(normalizer_->NormalizeTerm("ẊẌẋẍ"), Eq("xxxx"));
+ EXPECT_THAT(normalizer_->NormalizeTerm("ÝŶŸẎẏŷýÿ"), Eq("yyyyyyyy"));
+ EXPECT_THAT(normalizer_->NormalizeTerm("ŹŻŽẐẒẔẑẓẕźżž"),
+ Eq("zzzzzzzzzzzz"));
+}
+
+// Accent / diacritic marks won't be removed in non-latin chars, e.g. in
+// Japanese and Greek
+TEST_F(IcuNormalizerTest, NonLatinLetterNotRemoveAccent) {
+ // Katakana
+ EXPECT_THAT(normalizer_->NormalizeTerm("ダヂヅデド"), Eq("ダヂヅデド"));
+ // Greek
+ EXPECT_THAT(normalizer_->NormalizeTerm("kαλημέρα"), Eq("kαλημέρα"));
+ EXPECT_THAT(normalizer_->NormalizeTerm("εγγραφή"), Eq("εγγραφή"));
+
+ // Our current ICU rules can't handle Hebrew properly, e.g. the accents in
+ // "אָלֶף־בֵּית עִבְרִי"
+ // will be removed.
+ // TODO (samzheng): figure out how we should handle Hebrew.
+}
+
+TEST_F(IcuNormalizerTest, FullWidthCharsToASCII) {
+ // Full-width punctuation to ASCII punctuation
+ EXPECT_THAT(normalizer_->NormalizeTerm("‘’.,!?:“”"), Eq("''.,!?:\"\""));
+ // Full-width 0-9
+ EXPECT_THAT(normalizer_->NormalizeTerm("0123456789"),
+ Eq("0123456789"));
+ // Full-width A-Z
+ EXPECT_THAT(normalizer_->NormalizeTerm(
+ "ABCDEFGHIJKLMNOPQRSTUVWXYZ"),
+ Eq("abcdefghijklmnopqrstuvwxyz"));
+ // Full-width a-z
+ EXPECT_THAT(normalizer_->NormalizeTerm(
+ "abcdefghijklmnopqrstuvwxyz"),
+ Eq("abcdefghijklmnopqrstuvwxyz"));
+}
+
+TEST_F(IcuNormalizerTest, IdeographicToASCII) {
+ ICING_ASSERT_OK_AND_ASSIGN(auto normalizer, normalizer_factory::Create(
+ /*max_term_byte_size=*/1000));
+
+ EXPECT_THAT(normalizer->NormalizeTerm(",。"), Eq(",."));
+}
+
+// For Katakana, each character is normalized to its full-width version.
+TEST_F(IcuNormalizerTest, KatakanaHalfWidthToFullWidth) {
+ EXPECT_THAT(normalizer_->NormalizeTerm("カ"), Eq("カ"));
+ EXPECT_THAT(normalizer_->NormalizeTerm("ォ"), Eq("ォ"));
+ EXPECT_THAT(normalizer_->NormalizeTerm("サ"), Eq("サ"));
+ EXPECT_THAT(normalizer_->NormalizeTerm("ホ"), Eq("ホ"));
+}
+
+TEST_F(IcuNormalizerTest, HiraganaToKatakana) {
+ EXPECT_THAT(normalizer_->NormalizeTerm("あいうえお"), Eq("アイウエオ"));
+ EXPECT_THAT(normalizer_->NormalizeTerm("かきくけこ"), Eq("カキクケコ"));
+ EXPECT_THAT(normalizer_->NormalizeTerm("さしすせそ"), Eq("サシスセソ"));
+ EXPECT_THAT(normalizer_->NormalizeTerm("たちつてと"), Eq("タチツテト"));
+ EXPECT_THAT(normalizer_->NormalizeTerm("なにぬねの"), Eq("ナニヌネノ"));
+ EXPECT_THAT(normalizer_->NormalizeTerm("はひふへほ"), Eq("ハヒフヘホ"));
+ EXPECT_THAT(normalizer_->NormalizeTerm("まみむめも"), Eq("マミムメモ"));
+ EXPECT_THAT(normalizer_->NormalizeTerm("やゆよ"), Eq("ヤユヨ"));
+ EXPECT_THAT(normalizer_->NormalizeTerm("らりるれろ"), Eq("ラリルレロ"));
+ EXPECT_THAT(normalizer_->NormalizeTerm("わゐゑを"), Eq("ワヰヱヲ"));
+ EXPECT_THAT(normalizer_->NormalizeTerm("ん"), Eq("ン"));
+ EXPECT_THAT(normalizer_->NormalizeTerm("がぎぐげご"), Eq("ガギグゲゴ"));
+ EXPECT_THAT(normalizer_->NormalizeTerm("ざじずぜぞ"), Eq("ザジズゼゾ"));
+ EXPECT_THAT(normalizer_->NormalizeTerm("だぢづでど"), Eq("ダヂヅデド"));
+ EXPECT_THAT(normalizer_->NormalizeTerm("ばびぶべぼ"), Eq("バビブベボ"));
+ EXPECT_THAT(normalizer_->NormalizeTerm("ぱぴぷぺぽ"), Eq("パピプペポ"));
+}
+
+TEST_F(IcuNormalizerTest, SuperscriptAndSubscriptToASCII) {
+ EXPECT_THAT(normalizer_->NormalizeTerm("⁹"), Eq("9"));
+ EXPECT_THAT(normalizer_->NormalizeTerm("₉"), Eq("9"));
+}
+
+TEST_F(IcuNormalizerTest, CircledCharsToASCII) {
+ EXPECT_THAT(normalizer_->NormalizeTerm("①"), Eq("1"));
+ EXPECT_THAT(normalizer_->NormalizeTerm("Ⓐ"), Eq("a"));
+}
+
+TEST_F(IcuNormalizerTest, RotatedCharsToASCII) {
+ EXPECT_THAT(normalizer_->NormalizeTerm("︷"), Eq("{"));
+ EXPECT_THAT(normalizer_->NormalizeTerm("︸"), Eq("}"));
+}
+
+TEST_F(IcuNormalizerTest, SquaredCharsToASCII) {
+ EXPECT_THAT(normalizer_->NormalizeTerm("㌀"), Eq("アパート"));
+}
+
+TEST_F(IcuNormalizerTest, FractionsToASCII) {
+ EXPECT_THAT(normalizer_->NormalizeTerm("¼"), Eq(" 1/4"));
+ EXPECT_THAT(normalizer_->NormalizeTerm("⅚"), Eq(" 5/6"));
+}
+
+TEST_F(IcuNormalizerTest, Truncate) {
+ {
+ ICING_ASSERT_OK_AND_ASSIGN(auto normalizer, normalizer_factory::Create(
+ /*max_term_byte_size=*/5));
+
+ // Won't be truncated
+ EXPECT_THAT(normalizer->NormalizeTerm("hi"), Eq("hi"));
+ EXPECT_THAT(normalizer->NormalizeTerm("hello"), Eq("hello"));
+
+ // Truncated to length 5.
+ EXPECT_THAT(normalizer->NormalizeTerm("hello!"), Eq("hello"));
+
+ // Each Japanese character has 3 bytes, so truncating to length 5 results in
+ // only 1 character.
+ EXPECT_THAT(normalizer->NormalizeTerm("キャンパス"), Eq("キ"));
+
+ // Each Greek character has 2 bytes, so truncating to length 5 results in 2
+ // character.
+ EXPECT_THAT(normalizer->NormalizeTerm("αβγδε"), Eq("αβ"));
+ }
+
+ {
+ ICING_ASSERT_OK_AND_ASSIGN(auto normalizer, normalizer_factory::Create(
+ /*max_term_byte_size=*/2));
+ // The Japanese character has 3 bytes, truncating it results in an empty
+ // string.
+ EXPECT_THAT(normalizer->NormalizeTerm("キ"), Eq(""));
+ }
+}
+
+} // namespace
+} // namespace lib
+} // namespace icing
diff --git a/icing/transform/map/map-normalizer-factory.cc b/icing/transform/map/map-normalizer-factory.cc
new file mode 100644
index 0000000..3bf84b3
--- /dev/null
+++ b/icing/transform/map/map-normalizer-factory.cc
@@ -0,0 +1,48 @@
+// Copyright (C) 2019 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <memory>
+
+#include "icing/text_classifier/lib3/utils/base/statusor.h"
+#include "icing/absl_ports/canonical_errors.h"
+#include "icing/transform/map/map-normalizer.h"
+#include "icing/transform/normalizer.h"
+#include "icing/util/status-macros.h"
+
+namespace icing {
+namespace lib {
+
+namespace normalizer_factory {
+
+// Creates a map-based normalizer. max_term_byte_size enforces the max size of
+// text after normalization; text will be truncated if it exceeds the max size.
+//
+// Returns:
+// A normalizer on success
+// INVALID_ARGUMENT if max_term_byte_size <= 0
+// INTERNAL_ERROR on errors
+libtextclassifier3::StatusOr<std::unique_ptr<Normalizer>> Create(
+ int max_term_byte_size) {
+ if (max_term_byte_size <= 0) {
+ return absl_ports::InvalidArgumentError(
+ "max_term_byte_size must be greater than zero.");
+ }
+
+ return std::make_unique<MapNormalizer>(max_term_byte_size);
+}
+
+} // namespace normalizer_factory
+
+} // namespace lib
+} // namespace icing
diff --git a/icing/transform/map/map-normalizer.cc b/icing/transform/map/map-normalizer.cc
new file mode 100644
index 0000000..c888551
--- /dev/null
+++ b/icing/transform/map/map-normalizer.cc
@@ -0,0 +1,86 @@
+// Copyright (C) 2019 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "icing/transform/map/map-normalizer.h"
+
+#include <ctype.h>
+
+#include <string>
+#include <string_view>
+#include <unordered_map>
+#include <utility>
+
+#include "icing/absl_ports/str_cat.h"
+#include "icing/transform/map/normalization-map.h"
+#include "icing/util/i18n-utils.h"
+#include "icing/util/logging.h"
+#include "unicode/utypes.h"
+
+namespace icing {
+namespace lib {
+
+std::string MapNormalizer::NormalizeTerm(std::string_view term) const {
+ std::string normalized_text;
+ normalized_text.reserve(term.length());
+
+ for (int i = 0; i < term.length(); ++i) {
+ if (i18n_utils::IsAscii(term[i])) {
+ // The original character has 1 byte.
+ normalized_text.push_back(std::tolower(term[i]));
+ } else if (i18n_utils::IsLeadUtf8Byte(term[i])) {
+ UChar32 uchar32 = i18n_utils::GetUChar32At(term.data(), term.length(), i);
+ if (uchar32 == i18n_utils::kInvalidUChar32) {
+ ICING_LOG(WARNING) << "Unable to get uchar32 from " << term
+ << " at position" << i;
+ continue;
+ }
+ int utf8_length = i18n_utils::GetUtf8Length(uchar32);
+ if (i18n_utils::GetUtf16Length(uchar32) > 1) {
+ // All the characters we need to normalize can be encoded into a
+ // single char16_t. If this character needs more than 1 char16_t code
+ // unit, we can skip normalization and append it directly.
+ absl_ports::StrAppend(&normalized_text, term.substr(i, utf8_length));
+ continue;
+ }
+ // The original character can be encoded into a single char16_t.
+ const std::unordered_map<char16_t, char16_t>& normalization_map =
+ GetNormalizationMap();
+ auto iterator = normalization_map.find(static_cast<char16_t>(uchar32));
+ if (iterator != normalization_map.end()) {
+ // Found a normalization mapping. The normalized character (stored in a
+ // char16_t) can have 1 or 2 bytes.
+ if (i18n_utils::IsAscii(iterator->second)) {
+ // The normalized character has 1 byte.
+ normalized_text.push_back(
+ std::tolower(static_cast<char>(iterator->second)));
+ } else {
+ // The normalized character has 2 bytes.
+ i18n_utils::AppendUchar32ToUtf8(&normalized_text, iterator->second);
+ }
+ } else {
+ // Normalization mapping not found, append the original character.
+ absl_ports::StrAppend(&normalized_text, term.substr(i, utf8_length));
+ }
+ }
+ }
+
+ if (normalized_text.length() > max_term_byte_size_) {
+ i18n_utils::SafeTruncateUtf8(&normalized_text, max_term_byte_size_);
+ }
+
+ return normalized_text;
+}
+
+} // namespace lib
+} // namespace icing
diff --git a/icing/transform/map/map-normalizer.h b/icing/transform/map/map-normalizer.h
new file mode 100644
index 0000000..f9c0e42
--- /dev/null
+++ b/icing/transform/map/map-normalizer.h
@@ -0,0 +1,50 @@
+// Copyright (C) 2019 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef ICING_TRANSFORM_MAP_MAP_NORMALIZER_H_
+#define ICING_TRANSFORM_MAP_MAP_NORMALIZER_H_
+
+#include <string>
+#include <string_view>
+
+#include "icing/transform/normalizer.h"
+
+namespace icing {
+namespace lib {
+
+class MapNormalizer : public Normalizer {
+ public:
+ explicit MapNormalizer(int max_term_byte_size)
+ : max_term_byte_size_(max_term_byte_size){};
+
+ // Normalizes the input term based on character mappings. The mappings
+ // contain the following categories:
+ // - Uppercase -> lowercase
+ // - Hiragana -> Katakana
+ // - Common full-width characters -> ASCII
+ // - Common ideographic punctuation marks -> ASCII
+ // - Common diacritic Latin characters -> ASCII
+ //
+ // Read more mapping details in normalization-map.cc
+ std::string NormalizeTerm(std::string_view term) const override;
+
+ private:
+ // The maximum term length allowed after normalization.
+ int max_term_byte_size_;
+};
+
+} // namespace lib
+} // namespace icing
+
+#endif // ICING_TRANSFORM_MAP_MAP_NORMALIZER_H_
diff --git a/icing/transform/map/map-normalizer_benchmark.cc b/icing/transform/map/map-normalizer_benchmark.cc
new file mode 100644
index 0000000..691afc6
--- /dev/null
+++ b/icing/transform/map/map-normalizer_benchmark.cc
@@ -0,0 +1,149 @@
+// Copyright (C) 2019 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <memory>
+
+#include "testing/base/public/benchmark.h"
+#include "icing/testing/common-matchers.h"
+#include "icing/transform/normalizer-factory.h"
+#include "icing/transform/normalizer.h"
+
+// Run on a Linux workstation:
+// $ blaze build -c opt --dynamic_mode=off --copt=-gmlt
+// //icing/transform/map:map-normalizer_benchmark
+//
+// $ blaze-bin/icing/transform/map/map-normalizer_benchmark
+// --benchmarks=all
+//
+// Run on an Android device:
+// $ blaze build --copt="-DGOOGLE_COMMANDLINEFLAGS_FULL_API=1"
+// --config=android_arm64 -c opt --dynamic_mode=off --copt=-gmlt
+// //icing/transform/map:map-normalizer_benchmark
+//
+// $ adb push
+// blaze-bin/icing/transform/map/map-normalizer_benchmark
+// /data/local/tmp/
+//
+// $ adb shell /data/local/tmp/map-normalizer_benchmark --benchmarks=all
+namespace icing {
+namespace lib {
+
+namespace {
+
+void BM_NormalizeUppercase(benchmark::State& state) {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<Normalizer> normalizer,
+ normalizer_factory::Create(
+ /*max_term_byte_size=*/std::numeric_limits<int>::max()));
+
+ std::string input_string(state.range(0), 'A');
+
+ // Warms up. map-normalizer may need to load a static map when being invoked
+ // the first time. It takes about 0.05ms on a Pixel3 XL.
+ normalizer->NormalizeTerm(input_string);
+
+ for (auto _ : state) {
+ normalizer->NormalizeTerm(input_string);
+ }
+}
+BENCHMARK(BM_NormalizeUppercase)
+ ->Arg(1000)
+ ->Arg(2000)
+ ->Arg(4000)
+ ->Arg(8000)
+ ->Arg(16000)
+ ->Arg(32000)
+ ->Arg(64000)
+ ->Arg(128000)
+ ->Arg(256000)
+ ->Arg(384000)
+ ->Arg(512000)
+ ->Arg(1024000)
+ ->Arg(2048000)
+ ->Arg(4096000);
+
+void BM_NormalizeAccent(benchmark::State& state) {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<Normalizer> normalizer,
+ normalizer_factory::Create(
+ /*max_term_byte_size=*/std::numeric_limits<int>::max()));
+
+ std::string input_string;
+ while (input_string.length() < state.range(0)) {
+ input_string.append("àáâãā");
+ }
+
+ // Warms up. map-normalizer may need to load a static map when being invoked
+ // the first time. It takes about 0.05ms on a Pixel3 XL.
+ normalizer->NormalizeTerm(input_string);
+
+ for (auto _ : state) {
+ normalizer->NormalizeTerm(input_string);
+ }
+}
+BENCHMARK(BM_NormalizeAccent)
+ ->Arg(1000)
+ ->Arg(2000)
+ ->Arg(4000)
+ ->Arg(8000)
+ ->Arg(16000)
+ ->Arg(32000)
+ ->Arg(64000)
+ ->Arg(128000)
+ ->Arg(256000)
+ ->Arg(384000)
+ ->Arg(512000)
+ ->Arg(1024000)
+ ->Arg(2048000)
+ ->Arg(4096000);
+
+void BM_NormalizeHiragana(benchmark::State& state) {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<Normalizer> normalizer,
+ normalizer_factory::Create(
+ /*max_term_byte_size=*/std::numeric_limits<int>::max()));
+
+ std::string input_string;
+ while (input_string.length() < state.range(0)) {
+ input_string.append("あいうえお");
+ }
+
+ // Warms up. map-normalizer may need to load a static map when being invoked
+ // the first time. It takes about 0.05ms on a Pixel3 XL.
+ normalizer->NormalizeTerm(input_string);
+
+ for (auto _ : state) {
+ normalizer->NormalizeTerm(input_string);
+ }
+}
+BENCHMARK(BM_NormalizeHiragana)
+ ->Arg(1000)
+ ->Arg(2000)
+ ->Arg(4000)
+ ->Arg(8000)
+ ->Arg(16000)
+ ->Arg(32000)
+ ->Arg(64000)
+ ->Arg(128000)
+ ->Arg(256000)
+ ->Arg(384000)
+ ->Arg(512000)
+ ->Arg(1024000)
+ ->Arg(2048000)
+ ->Arg(4096000);
+
+} // namespace
+
+} // namespace lib
+} // namespace icing
diff --git a/icing/transform/map/map-normalizer_test.cc b/icing/transform/map/map-normalizer_test.cc
new file mode 100644
index 0000000..b62ae0e
--- /dev/null
+++ b/icing/transform/map/map-normalizer_test.cc
@@ -0,0 +1,205 @@
+// Copyright (C) 2019 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <memory>
+#include <string>
+
+#include "icing/text_classifier/lib3/utils/base/status.h"
+#include "icing/text_classifier/lib3/utils/base/statusor.h"
+#include "gmock/gmock.h"
+#include "gtest/gtest.h"
+#include "icing/testing/common-matchers.h"
+#include "icing/testing/icu-i18n-test-utils.h"
+#include "icing/transform/normalizer-factory.h"
+#include "icing/transform/normalizer.h"
+
+namespace icing {
+namespace lib {
+
+namespace {
+using ::testing::Eq;
+
+TEST(MapNormalizerTest, Creation) {
+ EXPECT_THAT(normalizer_factory::Create(
+ /*max_term_byte_size=*/5),
+ IsOk());
+ EXPECT_THAT(normalizer_factory::Create(
+ /*max_term_byte_size=*/0),
+ StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
+ EXPECT_THAT(normalizer_factory::Create(
+ /*max_term_byte_size=*/-1),
+ StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
+}
+
+// Strings that are already normalized won't change if normalized again.
+TEST(MapNormalizerTest, AlreadyNormalized) {
+ ICING_ASSERT_OK_AND_ASSIGN(auto normalizer, normalizer_factory::Create(
+ /*max_term_byte_size=*/1000));
+
+ EXPECT_THAT(normalizer->NormalizeTerm(""), Eq(""));
+ EXPECT_THAT(normalizer->NormalizeTerm("hello world"), Eq("hello world"));
+ EXPECT_THAT(normalizer->NormalizeTerm("你好"), Eq("你好"));
+ EXPECT_THAT(normalizer->NormalizeTerm("キャンパス"), Eq("キャンパス"));
+ EXPECT_THAT(normalizer->NormalizeTerm("안녕하세요"), Eq("안녕하세요"));
+}
+
+TEST(MapNormalizerTest, UppercaseToLowercase) {
+ ICING_ASSERT_OK_AND_ASSIGN(auto normalizer, normalizer_factory::Create(
+ /*max_term_byte_size=*/1000));
+
+ EXPECT_THAT(normalizer->NormalizeTerm("MDI"), Eq("mdi"));
+ EXPECT_THAT(normalizer->NormalizeTerm("Icing"), Eq("icing"));
+}
+
+TEST(MapNormalizerTest, LatinLetterRemoveAccent) {
+ ICING_ASSERT_OK_AND_ASSIGN(auto normalizer, normalizer_factory::Create(
+ /*max_term_byte_size=*/1000));
+
+ EXPECT_THAT(normalizer->NormalizeTerm("Zürich"), Eq("zurich"));
+ EXPECT_THAT(normalizer->NormalizeTerm("après-midi"), Eq("apres-midi"));
+ EXPECT_THAT(normalizer->NormalizeTerm("Buenos días"), Eq("buenos dias"));
+ EXPECT_THAT(normalizer->NormalizeTerm("ÀÁÂÃÄÅĀĂĄḀḁàáâãäåāăą"),
+ Eq("aaaaaaaaaaaaaaaaaaaa"));
+ EXPECT_THAT(normalizer->NormalizeTerm("ḂḄḆḃḅḇ"), Eq("bbbbbb"));
+ EXPECT_THAT(normalizer->NormalizeTerm("ÇĆĈĊČḈḉćĉċčç"), Eq("cccccccccccc"));
+ EXPECT_THAT(normalizer->NormalizeTerm("ÐĎĐḊḌḎḐḒḋḍḏḑḓďđ"),
+ Eq("ddddddddddddddd"));
+ EXPECT_THAT(normalizer->NormalizeTerm("ÈÉÊËĒĔĖĘḔḖḘḚḜḕḗḙḛḝèéêëēĕėęě"),
+ Eq("eeeeeeeeeeeeeeeeeeeeeeeeeee"));
+ EXPECT_THAT(normalizer->NormalizeTerm("Ḟḟ"), Eq("ff"));
+ EXPECT_THAT(normalizer->NormalizeTerm("ĜĞĠĢḠḡĝğġģ"), Eq("gggggggggg"));
+ EXPECT_THAT(normalizer->NormalizeTerm("ĤḢḤḦḨḪḣḥḧḩḫĥẖ"), Eq("hhhhhhhhhhhhh"));
+ EXPECT_THAT(normalizer->NormalizeTerm("ÌÍÎÏĨĪĬḬḭḯìíîïĩīĭ"),
+ Eq("iiiiiiiiiiiiiiiii"));
+ EXPECT_THAT(normalizer->NormalizeTerm("Ĵĵ"), Eq("jj"));
+ EXPECT_THAT(normalizer->NormalizeTerm("ĶḰḲḴḵḱḳķ"), Eq("kkkkkkkk"));
+ EXPECT_THAT(normalizer->NormalizeTerm("ĹĻĽḶḸḼḷḹḻḽĺļľ"), Eq("lllllllllllll"));
+ EXPECT_THAT(normalizer->NormalizeTerm("ḾṀṂḿṁṃ"), Eq("mmmmmm"));
+ EXPECT_THAT(normalizer->NormalizeTerm("ÑŃŅŇṄṆṈṊṅṇṉṋñńņň"),
+ Eq("nnnnnnnnnnnnnnnn"));
+ EXPECT_THAT(normalizer->NormalizeTerm("ŌŎŐÒÓÔÕÖṌṎṐṒṍṏṑṓòóôõöōŏő"),
+ Eq("oooooooooooooooooooooooo"));
+ EXPECT_THAT(normalizer->NormalizeTerm("ṔṖṕṗ"), Eq("pppp"));
+ EXPECT_THAT(normalizer->NormalizeTerm("ŔŖŘṘṚṜṞṙṛṝṟŕŗř"),
+ Eq("rrrrrrrrrrrrrr"));
+ EXPECT_THAT(normalizer->NormalizeTerm("ŚŜŞŠȘṠṢṤṦṨṡṣṥṧṩșśŝşš"),
+ Eq("ssssssssssssssssssss"));
+ EXPECT_THAT(normalizer->NormalizeTerm("ŢŤȚṪṬṮṰṫṭṯṱțţť"),
+ Eq("tttttttttttttt"));
+ EXPECT_THAT(normalizer->NormalizeTerm("ŨŪŬÙÚÛÜṲṴṶṸṺṳṵṷṹṻùúûüũūŭ"),
+ Eq("uuuuuuuuuuuuuuuuuuuuuuuu"));
+ EXPECT_THAT(normalizer->NormalizeTerm("ṼṾṽṿ"), Eq("vvvv"));
+ EXPECT_THAT(normalizer->NormalizeTerm("ŴẀẂẄẆẈẁẃẅẇẉŵ"), Eq("wwwwwwwwwwww"));
+ EXPECT_THAT(normalizer->NormalizeTerm("ẊẌẋẍ"), Eq("xxxx"));
+ EXPECT_THAT(normalizer->NormalizeTerm("ÝŶŸẎẏŷýÿ"), Eq("yyyyyyyy"));
+ EXPECT_THAT(normalizer->NormalizeTerm("ŹŻŽẐẒẔẑẓẕźżž"), Eq("zzzzzzzzzzzz"));
+}
+
+// Accent / diacritic marks won't be removed in non-Latin scripts, e.g. in
+// Japanese, Greek, and Hebrew.
+TEST(MapNormalizerTest, NonLatinLetterNotRemoveAccent) {
+ ICING_ASSERT_OK_AND_ASSIGN(auto normalizer, normalizer_factory::Create(
+ /*max_term_byte_size=*/1000));
+
+ // Katakana
+ EXPECT_THAT(normalizer->NormalizeTerm("ダヂヅデド"), Eq("ダヂヅデド"));
+ // Greek
+ EXPECT_THAT(normalizer->NormalizeTerm("kαλημέρα"), Eq("kαλημέρα"));
+ EXPECT_THAT(normalizer->NormalizeTerm("εγγραφή"), Eq("εγγραφή"));
+ // Hebrew
+ EXPECT_THAT(normalizer->NormalizeTerm("אָלֶף־בֵּית עִבְרִי"), Eq("אָלֶף־בֵּית עִבְרִי"));
+}
+
+TEST(MapNormalizerTest, FullWidthCharsToASCII) {
+ ICING_ASSERT_OK_AND_ASSIGN(auto normalizer, normalizer_factory::Create(
+ /*max_term_byte_size=*/1000));
+
+ // Full-width punctuation to ASCII punctuation
+ EXPECT_THAT(normalizer->NormalizeTerm("‘’.,!?:“”"), Eq("''.,!?:\"\""));
+ // Full-width 0-9
+ EXPECT_THAT(normalizer->NormalizeTerm("0123456789"),
+ Eq("0123456789"));
+ // Full-width A-Z
+ EXPECT_THAT(normalizer->NormalizeTerm(
+ "ABCDEFGHIJKLMNOPQRSTUVWXYZ"),
+ Eq("abcdefghijklmnopqrstuvwxyz"));
+ // Full-width a-z
+ EXPECT_THAT(normalizer->NormalizeTerm(
+ "abcdefghijklmnopqrstuvwxyz"),
+ Eq("abcdefghijklmnopqrstuvwxyz"));
+}
+
+TEST(MapNormalizerTest, IdeographicToASCII) {
+ ICING_ASSERT_OK_AND_ASSIGN(auto normalizer, normalizer_factory::Create(
+ /*max_term_byte_size=*/1000));
+
+ EXPECT_THAT(normalizer->NormalizeTerm(",。"), Eq(",."));
+}
+
+TEST(MapNormalizerTest, HiraganaToKatakana) {
+ ICING_ASSERT_OK_AND_ASSIGN(auto normalizer, normalizer_factory::Create(
+ /*max_term_byte_size=*/1000));
+
+ EXPECT_THAT(normalizer->NormalizeTerm("あいうえお"), Eq("アイウエオ"));
+ EXPECT_THAT(normalizer->NormalizeTerm("かきくけこ"), Eq("カキクケコ"));
+ EXPECT_THAT(normalizer->NormalizeTerm("さしすせそ"), Eq("サシスセソ"));
+ EXPECT_THAT(normalizer->NormalizeTerm("たちつてと"), Eq("タチツテト"));
+ EXPECT_THAT(normalizer->NormalizeTerm("なにぬねの"), Eq("ナニヌネノ"));
+ EXPECT_THAT(normalizer->NormalizeTerm("はひふへほ"), Eq("ハヒフヘホ"));
+ EXPECT_THAT(normalizer->NormalizeTerm("まみむめも"), Eq("マミムメモ"));
+ EXPECT_THAT(normalizer->NormalizeTerm("やゆよ"), Eq("ヤユヨ"));
+ EXPECT_THAT(normalizer->NormalizeTerm("らりるれろ"), Eq("ラリルレロ"));
+ EXPECT_THAT(normalizer->NormalizeTerm("わゐゑを"), Eq("ワヰヱヲ"));
+ EXPECT_THAT(normalizer->NormalizeTerm("ん"), Eq("ン"));
+ EXPECT_THAT(normalizer->NormalizeTerm("がぎぐげご"), Eq("ガギグゲゴ"));
+ EXPECT_THAT(normalizer->NormalizeTerm("ざじずぜぞ"), Eq("ザジズゼゾ"));
+ EXPECT_THAT(normalizer->NormalizeTerm("だぢづでど"), Eq("ダヂヅデド"));
+ EXPECT_THAT(normalizer->NormalizeTerm("ばびぶべぼ"), Eq("バビブベボ"));
+ EXPECT_THAT(normalizer->NormalizeTerm("ぱぴぷぺぽ"), Eq("パピプペポ"));
+}
+
+TEST(MapNormalizerTest, Truncate) {
+ {
+ ICING_ASSERT_OK_AND_ASSIGN(auto normalizer, normalizer_factory::Create(
+ /*max_term_byte_size=*/5));
+
+ // Won't be truncated
+ EXPECT_THAT(normalizer->NormalizeTerm("hi"), Eq("hi"));
+ EXPECT_THAT(normalizer->NormalizeTerm("hello"), Eq("hello"));
+
+ // Truncated to length 5.
+ EXPECT_THAT(normalizer->NormalizeTerm("hello!"), Eq("hello"));
+
+ // Each Japanese character has 3 bytes, so truncating to length 5 results in
+ // only 1 character.
+ EXPECT_THAT(normalizer->NormalizeTerm("キャンパス"), Eq("キ"));
+
+    // Each Greek character has 2 bytes, so truncating to length 5 results in
+    // only 2 characters.
+ EXPECT_THAT(normalizer->NormalizeTerm("αβγδε"), Eq("αβ"));
+ }
+
+ {
+ ICING_ASSERT_OK_AND_ASSIGN(auto normalizer, normalizer_factory::Create(
+ /*max_term_byte_size=*/2));
+ // The Japanese character has 3 bytes, truncating it results in an empty
+ // string.
+ EXPECT_THAT(normalizer->NormalizeTerm("キ"), Eq(""));
+ }
+}
+
+} // namespace
+
+} // namespace lib
+} // namespace icing
diff --git a/icing/transform/map/normalization-map.cc b/icing/transform/map/normalization-map.cc
new file mode 100644
index 0000000..c318036
--- /dev/null
+++ b/icing/transform/map/normalization-map.cc
@@ -0,0 +1,712 @@
+// Copyright (C) 2019 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "icing/transform/map/normalization-map.h"
+
+#include <cstdint>
+#include "icing/legacy/core/icing-packed-pod.h"
+
+namespace icing {
+namespace lib {
+
+namespace {
+// A pair representing the mapping of the 'from' character to 'to' character.
+struct NormalizationPair {
+ // All the mapped characters can be stored in 2 bytes.
+ char16_t from;
+ char16_t to;
+} __attribute__((packed));
+
+// The following mappings contain multiple categories:
+// 1. Hiragana -> Katakana, listed in the order of Hiragana chart rows.
+// All regular and small Hiragana characters are mapped to Katakana. Note
+// that half-width Katakana characters are not handled here.
+// 2. Common full-width characters -> ASCII characters.
+// Full-width characters in the Unicode range of [0xff01, 0xff5e] are mapped
+// to the corresponding ASCII forms.
+// 3. Common ideographic punctuation marks -> ASCII characters.
+// Ideographic characters are in the Unicode range of [0x3000, 0x303f]. Here
+// we list two that are frequently used in CJK and can be converted to ASCII.
+// 4. Common diacritic Latin characters -> ASCII characters.
+// We list most diacritic Latin characters within the Unicode range of
+// [0x00c0, 0x017e], some from [0x01a0, 0x021b], and most from [0x1e00,
+// 0x1ef9].
+//
+// All the characters can be stored in a single UTF16 code unit, so we use
+// char16_t to store them. Size of the following array is about 2.5KiB.
+constexpr NormalizationPair kNormalizationMappings[] = {
+ // Part 1: Hiragana -> Katakana
+ // 'a' row
+ {0x3042, 0x30a2}, // Hiragana letter A -> Katakana letter A
+ {0x3044, 0x30a4}, // Hiragana letter I -> Katakana letter I
+ {0x3046, 0x30a6}, // Hiragana letter U -> Katakana letter U
+ {0x3048, 0x30a8}, // Hiragana letter E -> Katakana letter E
+ {0x304a, 0x30aa}, // Hiragana letter O -> Katakana letter O
+ {0x3041, 0x30a2}, // Hiragana letter small A -> Katakana letter A
+ {0x3043, 0x30a4}, // Hiragana letter small I -> Katakana letter I
+ {0x3045, 0x30a6}, // Hiragana letter small U -> Katakana letter U
+ {0x3047, 0x30a8}, // Hiragana letter small E -> Katakana letter E
+ {0x3049, 0x30aa}, // Hiragana letter small O -> Katakana letter O
+ // 'ka' row
+ {0x304b, 0x30ab}, // Hiragana letter KA -> Katakana letter KA
+ {0x304d, 0x30ad}, // Hiragana letter KI -> Katakana letter KI
+ {0x304f, 0x30af}, // Hiragana letter KU -> Katakana letter KU
+ {0x3051, 0x30b1}, // Hiragana letter KE -> Katakana letter KE
+ {0x3053, 0x30b3}, // Hiragana letter KO -> Katakana letter KO
+ {0x3095, 0x30ab}, // Hiragana letter small KA -> Katakana letter KA
+ {0x3096, 0x30b1}, // Hiragana letter small KE -> Katakana letter KE
+ // 'sa' row
+ {0x3055, 0x30b5}, // Hiragana letter SA -> Katakana letter SA
+ {0x3057, 0x30b7}, // Hiragana letter SI -> Katakana letter SI
+ {0x3059, 0x30b9}, // Hiragana letter SU -> Katakana letter SU
+ {0x305b, 0x30bb}, // Hiragana letter SE -> Katakana letter SE
+ {0x305d, 0x30bd}, // Hiragana letter SO -> Katakana letter SO
+ // 'ta' row
+ {0x305f, 0x30bf}, // Hiragana letter TA -> Katakana letter TA
+ {0x3061, 0x30c1}, // Hiragana letter TI -> Katakana letter TI
+ {0x3063, 0x30c4}, // Hiragana letter small TU -> Katakana letter TU
+ {0x3064, 0x30c4}, // Hiragana letter TU -> Katakana letter TU
+ {0x3066, 0x30c6}, // Hiragana letter TE -> Katakana letter TE
+ {0x3068, 0x30c8}, // Hiragana letter TO -> Katakana letter TO
+ // 'na' row
+ {0x306a, 0x30ca}, // Hiragana letter NA -> Katakana letter NA
+ {0x306b, 0x30cb}, // Hiragana letter NI -> Katakana letter NI
+ {0x306c, 0x30cc}, // Hiragana letter NU -> Katakana letter NU
+ {0x306d, 0x30cd}, // Hiragana letter NE -> Katakana letter NE
+ {0x306e, 0x30ce}, // Hiragana letter NO -> Katakana letter NO
+ // 'ha' row
+ {0x306f, 0x30cf}, // Hiragana letter HA -> Katakana letter HA
+ {0x3072, 0x30d2}, // Hiragana letter HI -> Katakana letter HI
+ {0x3075, 0x30d5}, // Hiragana letter HU -> Katakana letter HU
+ {0x3078, 0x30d8}, // Hiragana letter HE -> Katakana letter HE
+ {0x307b, 0x30db}, // Hiragana letter HO -> Katakana letter HO
+ // 'ma' row
+ {0x307e, 0x30de}, // Hiragana letter MA -> Katakana letter MA
+ {0x307f, 0x30df}, // Hiragana letter MI -> Katakana letter MI
+ {0x3080, 0x30e0}, // Hiragana letter MU -> Katakana letter MU
+ {0x3081, 0x30e1}, // Hiragana letter ME -> Katakana letter ME
+ {0x3082, 0x30e2}, // Hiragana letter MO -> Katakana letter MO
+ // 'ya' row
+ {0x3083, 0x30e4}, // Hiragana letter small YA -> Katakana letter YA
+ {0x3084, 0x30e4}, // Hiragana letter YA -> Katakana letter YA
+ {0x3085, 0x30e6}, // Hiragana letter small YU -> Katakana letter YU
+ {0x3086, 0x30e6}, // Hiragana letter YU -> Katakana letter YU
+ {0x3087, 0x30e8}, // Hiragana letter small YO -> Katakana letter YO
+ {0x3088, 0x30e8}, // Hiragana letter YO -> Katakana letter YO
+ // 'ra' row
+ {0x3089, 0x30e9}, // Hiragana letter RA -> Katakana letter RA
+ {0x308a, 0x30ea}, // Hiragana letter RI -> Katakana letter RI
+ {0x308b, 0x30eb}, // Hiragana letter RU -> Katakana letter RU
+ {0x308c, 0x30ec}, // Hiragana letter RE -> Katakana letter RE
+ {0x308d, 0x30ed}, // Hiragana letter RO -> Katakana letter RO
+ // 'wa' row
+ {0x308e, 0x30ef}, // Hiragana letter small WA -> Katakana letter WA
+ {0x308f, 0x30ef}, // Hiragana letter WA -> Katakana letter WA
+ {0x3090, 0x30f0}, // Hiragana letter WI -> Katakana letter WI
+ {0x3091, 0x30f1}, // Hiragana letter WE -> Katakana letter WE
+ {0x3092, 0x30f2}, // Hiragana letter WO -> Katakana letter WO
+ // 'n'
+ {0x3093, 0x30f3}, // Hiragana letter N -> Katakana letter N
+ // 'ga' row
+ {0x304c, 0x30ac}, // Hiragana letter GA -> Katakana letter GA
+ {0x304e, 0x30ae}, // Hiragana letter GI -> Katakana letter GI
+ {0x3050, 0x30b0}, // Hiragana letter GU -> Katakana letter GU
+ {0x3052, 0x30b2}, // Hiragana letter GE -> Katakana letter GE
+ {0x3054, 0x30b4}, // Hiragana letter GO -> Katakana letter GO
+ // 'za' row
+ {0x3056, 0x30b6}, // Hiragana letter ZA -> Katakana letter ZA
+ {0x3058, 0x30b8}, // Hiragana letter ZI -> Katakana letter ZI
+ {0x305a, 0x30ba}, // Hiragana letter ZU -> Katakana letter ZU
+ {0x305c, 0x30bc}, // Hiragana letter ZE -> Katakana letter ZE
+ {0x305e, 0x30be}, // Hiragana letter ZO -> Katakana letter ZO
+ // 'da' row
+ {0x3060, 0x30c0}, // Hiragana letter DA -> Katakana letter DA
+ {0x3062, 0x30c2}, // Hiragana letter DI -> Katakana letter DI
+ {0x3065, 0x30c5}, // Hiragana letter DU -> Katakana letter DU
+ {0x3067, 0x30c7}, // Hiragana letter DE -> Katakana letter DE
+ {0x3069, 0x30c9}, // Hiragana letter DO -> Katakana letter DO
+ // 'ba' row
+ {0x3070, 0x30d0}, // Hiragana letter BA -> Katakana letter BA
+ {0x3073, 0x30d3}, // Hiragana letter BI -> Katakana letter BI
+ {0x3076, 0x30d6}, // Hiragana letter BU -> Katakana letter BU
+ {0x3079, 0x30d9}, // Hiragana letter BE -> Katakana letter BE
+ {0x307c, 0x30dc}, // Hiragana letter BO -> Katakana letter BO
+ // 'pa' row
+ {0x3071, 0x30d1}, // Hiragana letter PA -> Katakana letter PA
+ {0x3074, 0x30d4}, // Hiragana letter PI -> Katakana letter PI
+ {0x3077, 0x30d7}, // Hiragana letter PU -> Katakana letter PU
+ {0x307a, 0x30da}, // Hiragana letter PE -> Katakana letter PE
+ {0x307d, 0x30dd}, // Hiragana letter PO -> Katakana letter PO
+ // Additional Hiragana
+ {0x3094, 0x30f4}, // Hiragana letter VU -> Katakana letter VU
+ // Part 2: Common full-width characters -> ASCII characters.
+ {0xff01, 33}, // ASCII !
+ {0xff02, 34}, // ASCII "
+ {0xff03, 35}, // ASCII #
+ {0xff04, 36}, // ASCII $
+ {0xff05, 37}, // ASCII %
+ {0xff06, 38}, // ASCII &
+ {0xff07, 39}, // ASCII '
+ {0xff08, 40}, // ASCII (
+ {0xff09, 41}, // ASCII )
+ {0xff0a, 42}, // ASCII *
+ {0xff0b, 43}, // ASCII +
+ {0xff0c, 44}, // ASCII ,
+ {0xff0d, 45}, // ASCII -
+ {0xff0e, 46}, // ASCII .
+ {0xff0f, 47}, // ASCII /
+ {0xff10, 48}, // ASCII 0
+ {0xff11, 49}, // ASCII 1
+ {0xff12, 50}, // ASCII 2
+ {0xff13, 51}, // ASCII 3
+ {0xff14, 52}, // ASCII 4
+ {0xff15, 53}, // ASCII 5
+ {0xff16, 54}, // ASCII 6
+ {0xff17, 55}, // ASCII 7
+ {0xff18, 56}, // ASCII 8
+ {0xff19, 57}, // ASCII 9
+ {0xff1a, 58}, // ASCII :
+ {0xff1b, 59}, // ASCII ;
+ {0xff1c, 60}, // ASCII <
+ {0xff1d, 61}, // ASCII =
+ {0xff1e, 62}, // ASCII >
+ {0xff1f, 63}, // ASCII ?
+ {0xff20, 64}, // ASCII @
+ {0xff21, 65}, // ASCII A
+ {0xff22, 66}, // ASCII B
+ {0xff23, 67}, // ASCII C
+ {0xff24, 68}, // ASCII D
+ {0xff25, 69}, // ASCII E
+ {0xff26, 70}, // ASCII F
+ {0xff27, 71}, // ASCII G
+ {0xff28, 72}, // ASCII H
+ {0xff29, 73}, // ASCII I
+ {0xff2a, 74}, // ASCII J
+ {0xff2b, 75}, // ASCII K
+ {0xff2c, 76}, // ASCII L
+ {0xff2d, 77}, // ASCII M
+ {0xff2e, 78}, // ASCII N
+ {0xff2f, 79}, // ASCII O
+ {0xff30, 80}, // ASCII P
+ {0xff31, 81}, // ASCII Q
+ {0xff32, 82}, // ASCII R
+ {0xff33, 83}, // ASCII S
+ {0xff34, 84}, // ASCII T
+ {0xff35, 85}, // ASCII U
+ {0xff36, 86}, // ASCII V
+ {0xff37, 87}, // ASCII W
+ {0xff38, 88}, // ASCII X
+ {0xff39, 89}, // ASCII Y
+ {0xff3a, 90}, // ASCII Z
+ {0xff3b, 91}, // ASCII [
+    {0xff3c, 92},   // ASCII backslash
+ {0xff3d, 93}, // ASCII ]
+ {0xff3e, 94}, // ASCII ^
+ {0xff3f, 95}, // ASCII _
+ {0xff40, 96}, // ASCII `
+ {0xff41, 97}, // ASCII a
+ {0xff42, 98}, // ASCII b
+ {0xff43, 99}, // ASCII c
+ {0xff44, 100}, // ASCII d
+ {0xff45, 101}, // ASCII e
+ {0xff46, 102}, // ASCII f
+ {0xff47, 103}, // ASCII g
+ {0xff48, 104}, // ASCII h
+ {0xff49, 105}, // ASCII i
+ {0xff4a, 106}, // ASCII j
+ {0xff4b, 107}, // ASCII k
+ {0xff4c, 108}, // ASCII l
+ {0xff4d, 109}, // ASCII m
+ {0xff4e, 110}, // ASCII n
+ {0xff4f, 111}, // ASCII o
+ {0xff50, 112}, // ASCII p
+ {0xff51, 113}, // ASCII q
+ {0xff52, 114}, // ASCII r
+ {0xff53, 115}, // ASCII s
+ {0xff54, 116}, // ASCII t
+ {0xff55, 117}, // ASCII u
+ {0xff56, 118}, // ASCII v
+ {0xff57, 119}, // ASCII w
+ {0xff58, 120}, // ASCII x
+ {0xff59, 121}, // ASCII y
+ {0xff5a, 122}, // ASCII z
+ {0xff5b, 123}, // ASCII {
+ {0xff5c, 124}, // ASCII |
+ {0xff5d, 125}, // ASCII }
+ {0xff5e, 126}, // ASCII ~
+ {0x2018, 39}, // Left single quote -> ASCII apostrophe
+ {0x2019, 39}, // Right single quote -> ASCII apostrophe
+ {0x201c, 34}, // Left double quote -> ASCII quote
+ {0x201d, 34}, // Right double quote -> ASCII quote
+ // Part 3: Common ideographic punctuation marks -> ASCII.
+ // Usually used in CJK.
+ {0x3001, 44}, // ASCII ,
+ {0x3002, 46}, // ASCII .
+ // Part 4: Common diacritic Latin characters -> ASCII characters.
+ {0x00c0, 65}, // À -> A
+ {0x00c1, 65}, // Á -> A
+ {0x00c2, 65}, // Â -> A
+ {0x00c3, 65}, // Ã -> A
+ {0x00c4, 65}, // Ä -> A
+ {0x00c5, 65}, // Å -> A
+ {0x00c7, 67}, // Ç -> C
+ {0x00c8, 69}, // È -> E
+ {0x00c9, 69}, // É -> E
+ {0x00ca, 69}, // Ê -> E
+ {0x00cb, 69}, // Ë -> E
+ {0x00cc, 73}, // Ì -> I
+ {0x00cd, 73}, // Í -> I
+ {0x00ce, 73}, // Î -> I
+ {0x00cf, 73}, // Ï -> I
+ {0x00d0, 68}, // Ð -> D
+ {0x00d1, 78}, // Ñ -> N
+ {0x00d2, 79}, // Ò -> O
+ {0x00d3, 79}, // Ó -> O
+ {0x00d4, 79}, // Ô -> O
+ {0x00d5, 79}, // Õ -> O
+ {0x00d6, 79}, // Ö -> O
+ {0x00d8, 79}, // Ø -> O
+ {0x00d9, 85}, // Ù -> U
+ {0x00da, 85}, // Ú -> U
+ {0x00db, 85}, // Û -> U
+ {0x00dc, 85}, // Ü -> U
+ {0x00dd, 89}, // Ý -> Y
+ {0x00e0, 97}, // à -> a
+ {0x00e1, 97}, // á -> a
+ {0x00e2, 97}, // â -> a
+ {0x00e3, 97}, // ã -> a
+ {0x00e4, 97}, // ä -> a
+ {0x00e5, 97}, // å -> a
+ {0x00e7, 99}, // ç -> c
+ {0x00e8, 101}, // è -> e
+ {0x00e9, 101}, // é -> e
+ {0x00ea, 101}, // ê -> e
+ {0x00eb, 101}, // ë -> e
+ {0x00ec, 105}, // ì -> i
+ {0x00ed, 105}, // í -> i
+ {0x00ee, 105}, // î -> i
+ {0x00ef, 105}, // ï -> i
+ {0x00f0, 100}, // ð -> d
+ {0x00f1, 110}, // ñ -> n
+ {0x00f2, 111}, // ò -> o
+ {0x00f3, 111}, // ó -> o
+ {0x00f4, 111}, // ô -> o
+ {0x00f5, 111}, // õ -> o
+ {0x00f6, 111}, // ö -> o
+ {0x00f8, 111}, // ø -> o
+ {0x00f9, 117}, // ù -> u
+ {0x00fa, 117}, // ú -> u
+ {0x00fb, 117}, // û -> u
+ {0x00fc, 117}, // ü -> u
+ {0x00fd, 121}, // ý -> y
+ {0x00ff, 121}, // ÿ -> y
+ {0x0100, 65}, // Ā -> A
+ {0x0101, 97}, // ā -> a
+ {0x0102, 65}, // Ă -> A
+ {0x0103, 97}, // ă -> a
+ {0x0104, 65}, // Ą -> A
+ {0x0105, 97}, // ą -> a
+ {0x0106, 67}, // Ć -> C
+ {0x0107, 99}, // ć -> c
+ {0x0108, 67}, // Ĉ -> C
+ {0x0109, 99}, // ĉ -> c
+ {0x010a, 67}, // Ċ -> C
+ {0x010b, 99}, // ċ -> c
+ {0x010c, 67}, // Č -> C
+ {0x010d, 99}, // č -> c
+ {0x010e, 68}, // Ď -> D
+ {0x010f, 100}, // ď -> d
+ {0x0110, 68}, // Đ -> D
+ {0x0111, 100}, // đ -> d
+ {0x0112, 69}, // Ē -> E
+ {0x0113, 101}, // ē -> e
+ {0x0114, 69}, // Ĕ -> E
+ {0x0115, 101}, // ĕ -> e
+ {0x0116, 69}, // Ė -> E
+ {0x0117, 101}, // ė -> e
+ {0x0118, 69}, // Ę -> E
+ {0x0119, 101}, // ę -> e
+ {0x011a, 69}, // Ě -> E
+ {0x011b, 101}, // ě -> e
+ {0x011c, 71}, // Ĝ -> G
+ {0x011d, 103}, // ĝ -> g
+ {0x011e, 71}, // Ğ -> G
+ {0x011f, 103}, // ğ -> g
+ {0x0120, 71}, // Ġ -> G
+ {0x0121, 103}, // ġ -> g
+ {0x0122, 71}, // Ģ -> G
+ {0x0123, 103}, // ģ -> g
+ {0x0124, 72}, // Ĥ -> H
+ {0x0125, 104}, // ĥ -> h
+ {0x0126, 72}, // Ħ -> H
+ {0x0127, 104}, // ħ -> h
+ {0x0128, 73}, // Ĩ -> I
+ {0x0129, 105}, // ĩ -> i
+ {0x012a, 73}, // Ī -> I
+ {0x012b, 105}, // ī -> i
+ {0x012c, 73}, // Ĭ -> I
+ {0x012d, 105}, // ĭ -> i
+ {0x012e, 73}, // Į -> I
+ {0x012f, 105}, // į -> i
+ {0x0130, 73}, // İ -> I
+ {0x0131, 105}, // ı -> i
+ {0x0134, 74}, // Ĵ -> J
+ {0x0135, 106}, // ĵ -> j
+ {0x0136, 75}, // Ķ -> K
+ {0x0137, 107}, // ķ -> k
+ {0x0139, 76}, // Ĺ -> L
+ {0x013a, 108}, // ĺ -> l
+ {0x013b, 76}, // Ļ -> L
+ {0x013c, 108}, // ļ -> l
+ {0x013d, 76}, // Ľ -> L
+ {0x013e, 108}, // ľ -> l
+ {0x013f, 76}, // Ŀ -> L
+ {0x0140, 108}, // ŀ -> l
+ {0x0141, 76}, // Ł -> L
+ {0x0142, 108}, // ł -> l
+ {0x0143, 78}, // Ń -> N
+ {0x0144, 110}, // ń -> n
+ {0x0145, 78}, // Ņ -> N
+ {0x0146, 110}, // ņ -> n
+ {0x0147, 78}, // Ň -> N
+ {0x0148, 110}, // ň -> n
+ {0x014a, 78}, // Ŋ -> N
+ {0x014b, 110}, // ŋ -> n
+ {0x014c, 79}, // Ō -> O
+ {0x014d, 111}, // ō -> o
+ {0x014e, 79}, // Ŏ -> O
+ {0x014f, 111}, // ŏ -> o
+ {0x0150, 79}, // Ő -> O
+ {0x0151, 111}, // ő -> o
+ {0x0154, 82}, // Ŕ -> R
+ {0x0155, 114}, // ŕ -> r
+ {0x0156, 82}, // Ŗ -> R
+ {0x0157, 114}, // ŗ -> r
+ {0x0158, 82}, // Ř -> R
+ {0x0159, 114}, // ř -> r
+ {0x015a, 83}, // Ś -> S
+ {0x015b, 115}, // ś -> s
+ {0x015c, 83}, // Ŝ -> S
+ {0x015d, 115}, // ŝ -> s
+ {0x015e, 83}, // Ş -> S
+ {0x015f, 115}, // ş -> s
+ {0x0160, 83}, // Š -> S
+ {0x0161, 115}, // š -> s
+ {0x0162, 84}, // Ţ -> T
+ {0x0163, 116}, // ţ -> t
+ {0x0164, 84}, // Ť -> T
+ {0x0165, 116}, // ť -> t
+ {0x0166, 84}, // Ŧ -> T
+ {0x0167, 116}, // ŧ -> t
+ {0x0168, 85}, // Ũ -> U
+ {0x0169, 117}, // ũ -> u
+ {0x016a, 85}, // Ū -> U
+ {0x016b, 117}, // ū -> u
+ {0x016c, 85}, // Ŭ -> U
+ {0x016d, 117}, // ŭ -> u
+ {0x016e, 85}, // Ů -> U
+ {0x016f, 117}, // ů -> u
+ {0x0170, 85}, // Ű -> U
+ {0x0171, 117}, // ű -> u
+ {0x0172, 85}, // Ų -> U
+ {0x0173, 117}, // ų -> u
+ {0x0174, 87}, // Ŵ -> W
+ {0x0175, 119}, // ŵ -> w
+ {0x0176, 89}, // Ŷ -> Y
+ {0x0177, 121}, // ŷ -> y
+ {0x0178, 89}, // Ÿ -> Y
+ {0x0179, 90}, // Ź -> Z
+ {0x017a, 122}, // ź -> z
+ {0x017b, 90}, // Ż -> Z
+ {0x017c, 122}, // ż -> z
+ {0x017d, 90}, // Ž -> Z
+ {0x017e, 122}, // ž -> z
+ {0x01a0, 79}, // Ơ -> O
+ {0x01a1, 111}, // ơ -> o
+ {0x01af, 85}, // Ư -> U
+ {0x01b0, 117}, // ư -> u
+ {0x01b5, 90}, // Ƶ -> Z
+ {0x01b6, 122}, // ƶ -> z
+ {0x0218, 83}, // Ș -> S
+ {0x0219, 115}, // ș -> s
+ {0x021a, 84}, // Ț -> T
+ {0x021b, 116}, // ț -> t
+ {0x1e00, 65}, // Ḁ -> A
+ {0x1e01, 97}, // ḁ -> a
+ {0x1e02, 66}, // Ḃ -> B
+ {0x1e03, 98}, // ḃ -> b
+ {0x1e04, 66}, // Ḅ -> B
+ {0x1e05, 98}, // ḅ -> b
+ {0x1e06, 66}, // Ḇ -> B
+ {0x1e07, 98}, // ḇ -> b
+ {0x1e08, 67}, // Ḉ -> C
+ {0x1e09, 99}, // ḉ -> c
+ {0x1e0a, 68}, // Ḋ -> D
+ {0x1e0b, 100}, // ḋ -> d
+ {0x1e0c, 68}, // Ḍ -> D
+ {0x1e0d, 100}, // ḍ -> d
+ {0x1e0e, 68}, // Ḏ -> D
+ {0x1e0f, 100}, // ḏ -> d
+ {0x1e10, 68}, // Ḑ -> D
+ {0x1e11, 100}, // ḑ -> d
+ {0x1e12, 68}, // Ḓ -> D
+ {0x1e13, 100}, // ḓ -> d
+ {0x1e14, 69}, // Ḕ -> E
+ {0x1e15, 101}, // ḕ -> e
+ {0x1e16, 69}, // Ḗ -> E
+ {0x1e17, 101}, // ḗ -> e
+ {0x1e18, 69}, // Ḙ -> E
+ {0x1e19, 101}, // ḙ -> e
+ {0x1e1a, 69}, // Ḛ -> E
+ {0x1e1b, 101}, // ḛ -> e
+ {0x1e1c, 69}, // Ḝ -> E
+ {0x1e1d, 101}, // ḝ -> e
+ {0x1e1e, 70}, // Ḟ -> F
+ {0x1e1f, 102}, // ḟ -> f
+ {0x1e20, 71}, // Ḡ -> G
+ {0x1e21, 103}, // ḡ -> g
+ {0x1e22, 72}, // Ḣ -> H
+ {0x1e23, 104}, // ḣ -> h
+ {0x1e24, 72}, // Ḥ -> H
+ {0x1e25, 104}, // ḥ -> h
+ {0x1e26, 72}, // Ḧ -> H
+ {0x1e27, 104}, // ḧ -> h
+ {0x1e28, 72}, // Ḩ -> H
+ {0x1e29, 104}, // ḩ -> h
+ {0x1e2a, 72}, // Ḫ -> H
+ {0x1e2b, 104}, // ḫ -> h
+ {0x1e2c, 73}, // Ḭ -> I
+ {0x1e2d, 105}, // ḭ -> i
+ {0x1e2e, 73}, // Ḯ -> I
+ {0x1e2f, 105}, // ḯ -> i
+ {0x1e30, 75}, // Ḱ -> K
+ {0x1e31, 107}, // ḱ -> k
+ {0x1e32, 75}, // Ḳ -> K
+ {0x1e33, 107}, // ḳ -> k
+ {0x1e34, 75}, // Ḵ -> K
+ {0x1e35, 107}, // ḵ -> k
+ {0x1e36, 76}, // Ḷ -> L
+ {0x1e37, 108}, // ḷ -> l
+ {0x1e38, 76}, // Ḹ -> L
+ {0x1e39, 108}, // ḹ -> l
+ {0x1e3b, 108}, // ḻ -> l
+ {0x1e3c, 76}, // Ḽ -> L
+ {0x1e3d, 108}, // ḽ -> l
+ {0x1e3e, 77}, // Ḿ -> M
+ {0x1e3f, 109}, // ḿ -> m
+ {0x1e40, 77}, // Ṁ -> M
+ {0x1e41, 109}, // ṁ -> m
+ {0x1e42, 77}, // Ṃ -> M
+ {0x1e43, 109}, // ṃ -> m
+ {0x1e44, 78}, // Ṅ -> N
+ {0x1e45, 110}, // ṅ -> n
+ {0x1e46, 78}, // Ṇ -> N
+ {0x1e47, 110}, // ṇ -> n
+ {0x1e48, 78}, // Ṉ -> N
+ {0x1e49, 110}, // ṉ -> n
+ {0x1e4a, 78}, // Ṋ -> N
+ {0x1e4b, 110}, // ṋ -> n
+ {0x1e4c, 79}, // Ṍ -> O
+ {0x1e4d, 111}, // ṍ -> o
+ {0x1e4e, 79}, // Ṏ -> O
+ {0x1e4f, 111}, // ṏ -> o
+ {0x1e50, 79}, // Ṑ -> O
+ {0x1e51, 111}, // ṑ -> o
+ {0x1e52, 79}, // Ṓ -> O
+ {0x1e53, 111}, // ṓ -> o
+ {0x1e54, 80}, // Ṕ -> P
+ {0x1e55, 112}, // ṕ -> p
+ {0x1e56, 80}, // Ṗ -> P
+ {0x1e57, 112}, // ṗ -> p
+ {0x1e58, 82}, // Ṙ -> R
+ {0x1e59, 114}, // ṙ -> r
+ {0x1e5a, 82}, // Ṛ -> R
+ {0x1e5b, 114}, // ṛ -> r
+ {0x1e5c, 82}, // Ṝ -> R
+ {0x1e5d, 114}, // ṝ -> r
+ {0x1e5e, 82}, // Ṟ -> R
+ {0x1e5f, 114}, // ṟ -> r
+ {0x1e60, 83}, // Ṡ -> S
+ {0x1e61, 115}, // ṡ -> s
+ {0x1e62, 83}, // Ṣ -> S
+ {0x1e63, 115}, // ṣ -> s
+ {0x1e64, 83}, // Ṥ -> S
+ {0x1e65, 115}, // ṥ -> s
+ {0x1e66, 83}, // Ṧ -> S
+ {0x1e67, 115}, // ṧ -> s
+ {0x1e68, 83}, // Ṩ -> S
+ {0x1e69, 115}, // ṩ -> s
+ {0x1e6a, 84}, // Ṫ -> T
+ {0x1e6b, 116}, // ṫ -> t
+ {0x1e6c, 84}, // Ṭ -> T
+ {0x1e6d, 116}, // ṭ -> t
+ {0x1e6e, 84}, // Ṯ -> T
+ {0x1e6f, 116}, // ṯ -> t
+ {0x1e70, 84}, // Ṱ -> T
+ {0x1e71, 116}, // ṱ -> t
+ {0x1e72, 85}, // Ṳ -> U
+ {0x1e73, 117}, // ṳ -> u
+ {0x1e74, 85}, // Ṵ -> U
+ {0x1e75, 117}, // ṵ -> u
+ {0x1e76, 85}, // Ṷ -> U
+ {0x1e77, 117}, // ṷ -> u
+ {0x1e78, 85}, // Ṹ -> U
+ {0x1e79, 117}, // ṹ -> u
+ {0x1e7a, 85}, // Ṻ -> U
+ {0x1e7b, 117}, // ṻ -> u
+ {0x1e7c, 86}, // Ṽ -> V
+ {0x1e7d, 118}, // ṽ -> v
+ {0x1e7e, 86}, // Ṿ -> V
+ {0x1e7f, 118}, // ṿ -> v
+ {0x1e80, 87}, // Ẁ -> W
+ {0x1e81, 119}, // ẁ -> w
+ {0x1e82, 87}, // Ẃ -> W
+ {0x1e83, 119}, // ẃ -> w
+ {0x1e84, 87}, // Ẅ -> W
+ {0x1e85, 119}, // ẅ -> w
+ {0x1e86, 87}, // Ẇ -> W
+ {0x1e87, 119}, // ẇ -> w
+ {0x1e88, 87}, // Ẉ -> W
+ {0x1e89, 119}, // ẉ -> w
+ {0x1e8a, 88}, // Ẋ -> X
+ {0x1e8b, 120}, // ẋ -> x
+ {0x1e8c, 88}, // Ẍ -> X
+ {0x1e8d, 120}, // ẍ -> x
+ {0x1e8e, 89}, // Ẏ -> Y
+ {0x1e8f, 121}, // ẏ -> y
+ {0x1e90, 90}, // Ẑ -> Z
+ {0x1e91, 122}, // ẑ -> z
+ {0x1e92, 90}, // Ẓ -> Z
+ {0x1e93, 122}, // ẓ -> z
+ {0x1e94, 90}, // Ẕ -> Z
+ {0x1e95, 122}, // ẕ -> z
+ {0x1e96, 104}, // ẖ -> h
+ {0x1e97, 116}, // ẗ -> t
+ {0x1e98, 119}, // ẘ -> w
+ {0x1e99, 121}, // ẙ -> y
+ {0x1e9a, 97}, // ẚ -> a
+ {0x1e9b, 102}, // ẛ -> f
+ {0x1ea0, 65}, // Ạ -> A
+ {0x1ea1, 97}, // ạ -> a
+ {0x1ea2, 65}, // Ả -> A
+ {0x1ea3, 97}, // ả -> a
+ {0x1ea4, 65}, // Ấ -> A
+ {0x1ea5, 97}, // ấ -> a
+ {0x1ea6, 65}, // Ầ -> A
+ {0x1ea7, 97}, // ầ -> a
+ {0x1ea8, 65}, // Ẩ -> A
+ {0x1ea9, 97}, // ẩ -> a
+ {0x1eaa, 65}, // Ẫ -> A
+ {0x1eab, 97}, // ẫ -> a
+ {0x1eac, 65}, // Ậ -> A
+ {0x1ead, 97}, // ậ -> a
+ {0x1eae, 65}, // Ắ -> A
+ {0x1eaf, 97}, // ắ -> a
+ {0x1eb0, 65}, // Ằ -> A
+ {0x1eb1, 97}, // ằ -> a
+ {0x1eb2, 65}, // Ẳ -> A
+ {0x1eb3, 97}, // ẳ -> a
+ {0x1eb4, 65}, // Ẵ -> A
+ {0x1eb5, 97}, // ẵ -> a
+ {0x1eb6, 65}, // Ặ -> A
+ {0x1eb7, 97}, // ặ -> a
+ {0x1eb8, 69}, // Ẹ -> E
+ {0x1eb9, 101}, // ẹ -> e
+ {0x1eba, 69}, // Ẻ -> E
+ {0x1ebb, 101}, // ẻ -> e
+ {0x1ebc, 69}, // Ẽ -> E
+ {0x1ebd, 101}, // ẽ -> e
+ {0x1ebe, 69}, // Ế -> E
+ {0x1ebf, 101}, // ế -> e
+ {0x1ec0, 69}, // Ề -> E
+ {0x1ec1, 101}, // ề -> e
+ {0x1ec2, 69}, // Ể -> E
+ {0x1ec3, 101}, // ể -> e
+ {0x1ec4, 69}, // Ễ -> E
+ {0x1ec5, 101}, // ễ -> e
+ {0x1ec6, 69}, // Ệ -> E
+ {0x1ec7, 101}, // ệ -> e
+ {0x1ec8, 73}, // Ỉ -> I
+ {0x1ec9, 105}, // ỉ -> i
+ {0x1eca, 73}, // Ị -> I
+ {0x1ecb, 105}, // ị -> i
+ {0x1ecc, 79}, // Ọ -> O
+ {0x1ecd, 111}, // ọ -> o
+ {0x1ece, 79}, // Ỏ -> O
+ {0x1ecf, 111}, // ỏ -> o
+ {0x1ed0, 79}, // Ố -> O
+ {0x1ed1, 111}, // ố -> o
+ {0x1ed2, 79}, // Ồ -> O
+ {0x1ed3, 111}, // ồ -> o
+ {0x1ed4, 79}, // Ổ -> O
+ {0x1ed5, 111}, // ổ -> o
+ {0x1ed6, 79}, // Ỗ -> O
+ {0x1ed7, 111}, // ỗ -> o
+ {0x1ed8, 79}, // Ộ -> O
+ {0x1ed9, 111}, // ộ -> o
+ {0x1eda, 79}, // Ớ -> O
+ {0x1edb, 111}, // ớ -> o
+ {0x1edc, 79}, // Ờ -> O
+ {0x1edd, 111}, // ờ -> o
+ {0x1ede, 79}, // Ở -> O
+ {0x1edf, 111}, // ở -> o
+ {0x1ee0, 79}, // Ỡ -> O
+ {0x1ee1, 111}, // ỡ -> o
+ {0x1ee2, 79}, // Ợ -> O
+ {0x1ee3, 111}, // ợ -> o
+ {0x1ee4, 85}, // Ụ -> U
+ {0x1ee5, 117}, // ụ -> u
+ {0x1ee6, 85}, // Ủ -> U
+ {0x1ee7, 117}, // ủ -> u
+ {0x1ee8, 85}, // Ứ -> U
+ {0x1ee9, 117}, // ứ -> u
+ {0x1eea, 85}, // Ừ -> U
+ {0x1eeb, 117}, // ừ -> u
+ {0x1eec, 85}, // Ử -> U
+ {0x1eed, 117}, // ử -> u
+ {0x1eee, 85}, // Ữ -> U
+ {0x1eef, 117}, // ữ -> u
+ {0x1ef0, 85}, // Ự -> U
+ {0x1ef1, 117}, // ự -> u
+ {0x1ef2, 89}, // Ỳ -> Y
+ {0x1ef3, 121}, // ỳ -> y
+ {0x1ef4, 89}, // Ỵ -> Y
+ {0x1ef5, 121}, // ỵ -> y
+ {0x1ef6, 89}, // Ỷ -> Y
+ {0x1ef7, 121}, // ỷ -> y
+ {0x1ef8, 89}, // Ỹ -> Y
+ {0x1ef9, 121}, // ỹ -> y
+};
+
+} // namespace
+
+const std::unordered_map<char16_t, char16_t>& GetNormalizationMap() {
+ // The map is allocated dynamically the first time this function is executed.
+ static const std::unordered_map<char16_t, char16_t> normalization_map = [] {
+ std::unordered_map<char16_t, char16_t> map;
+ // Size of all the mappings is about 2.5 KiB.
+ constexpr int numMappings =
+ sizeof(kNormalizationMappings) / sizeof(NormalizationPair);
+ map.reserve(numMappings);
+ for (size_t i = 0; i < numMappings; ++i) {
+ map.emplace(kNormalizationMappings[i].from, kNormalizationMappings[i].to);
+ }
+ return map;
+ }();
+
+ return normalization_map;
+}
+
+} // namespace lib
+} // namespace icing
diff --git a/icing/util/icu-i18n-utils_test.cc b/icing/transform/map/normalization-map.h
index f5864df..aea85bd 100644
--- a/icing/util/icu-i18n-utils_test.cc
+++ b/icing/transform/map/normalization-map.h
@@ -12,31 +12,20 @@
// See the License for the specific language governing permissions and
// limitations under the License.
-#include "icing/util/icu-i18n-utils.h"
+#ifndef ICING_TRANSFORM_MAP_NORMALIZATION_MAP_H_
+#define ICING_TRANSFORM_MAP_NORMALIZATION_MAP_H_
-#include <memory>
-
-#include "gmock/gmock.h"
-#include "gtest/gtest.h"
-#include "unicode/uchar.h"
+#include <unordered_map>
namespace icing {
namespace lib {
-namespace icu_i18n_utils {
-namespace {
-
-TEST(IcuI18nUtilsTest, IsPunctuationAtSameAsIcuIsPunct) {
- // Iterate through ASCII values
- for (int i = 0; i <= 127; ++i) {
- char ascii = i;
- std::string ascii_string = "";
- ascii_string.push_back(ascii);
+// Returns a map containing normalization mappings. A mapping (A -> B) means
+// that we'll transform every character 'A' into 'B'. See normalization-map.cc
+// for mapping details.
+const std::unordered_map<char16_t, char16_t>& GetNormalizationMap();
- EXPECT_EQ(IsPunctuationAt(ascii_string, /*position=*/0), u_ispunct(ascii));
- }
-}
-} // namespace
-} // namespace icu_i18n_utils
} // namespace lib
} // namespace icing
+
+#endif // ICING_TRANSFORM_MAP_NORMALIZATION_MAP_H_
diff --git a/icing/transform/normalizer-factory.h b/icing/transform/normalizer-factory.h
index 9119897..f1f3f62 100644
--- a/icing/transform/normalizer-factory.h
+++ b/icing/transform/normalizer-factory.h
@@ -16,12 +16,9 @@
#define ICING_TRANSFORM_NORMALIZER_FACTORY_H_
#include <memory>
-#include <string_view>
#include "icing/text_classifier/lib3/utils/base/statusor.h"
#include "icing/absl_ports/canonical_errors.h"
-#include "icing/transform/icu-normalizer.h"
-#include "icing/transform/none-normalizer.h"
#include "icing/transform/normalizer.h"
namespace icing {
@@ -29,11 +26,6 @@ namespace lib {
namespace normalizer_factory {
-enum NormalizerType {
- ICU4C, // Normalizes using the ICU library.
- NONE, // Doesn't perform normalization. Not for use in production.
-};
-
// Creates a normalizer. max_term_byte_size enforces the max size of text after
// normalization, text will be truncated if exceeds the max size.
//
@@ -42,19 +34,7 @@ enum NormalizerType {
// INVALID_ARGUMENT if max_term_byte_size <= 0
// INTERNAL_ERROR on errors
libtextclassifier3::StatusOr<std::unique_ptr<Normalizer>> Create(
- NormalizerType type, int max_term_byte_size) {
- if (max_term_byte_size <= 0) {
- return absl_ports::InvalidArgumentError(
- "max_term_byte_size must be greater than zero.");
- }
-
- switch (type) {
- case ICU4C:
- return IcuNormalizer::Create(max_term_byte_size);
- case NONE:
- return std::make_unique<NoneNormalizer>(max_term_byte_size);
- }
-}
+ int max_term_byte_size);
} // namespace normalizer_factory
diff --git a/icing/transform/normalizer.h b/icing/transform/normalizer.h
index 817f530..4cbfa63 100644
--- a/icing/transform/normalizer.h
+++ b/icing/transform/normalizer.h
@@ -28,8 +28,7 @@ namespace lib {
//
// Example use:
// ICING_ASSIGN_OR_RETURN(auto normalizer,
-// normalizer_factory::Create(normalizer_factory::NormalizerType::ICU4C,
-// /*max_term_byte_size=*/5);
+// normalizer_factory::Create(/*max_term_byte_size=*/5);
//
// std::string normalized_text = normalizer->NormalizeText("HELLO!");
// ICING_LOG(INFO) << normalized_text; // prints "hello"
diff --git a/icing/transform/simple/none-normalizer-factory.cc b/icing/transform/simple/none-normalizer-factory.cc
new file mode 100644
index 0000000..6b35270
--- /dev/null
+++ b/icing/transform/simple/none-normalizer-factory.cc
@@ -0,0 +1,53 @@
+// Copyright (C) 2019 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef ICING_TRANSFORM_SIMPLE_NONE_NORMALIZER_FACTORY_H_
+#define ICING_TRANSFORM_SIMPLE_NONE_NORMALIZER_FACTORY_H_
+
+#include <memory>
+#include <string_view>
+
+#include "icing/text_classifier/lib3/utils/base/statusor.h"
+#include "icing/absl_ports/canonical_errors.h"
+#include "icing/transform/normalizer.h"
+#include "icing/transform/simple/none-normalizer.h"
+
+namespace icing {
+namespace lib {
+
+namespace normalizer_factory {
+
+// Creates a dummy normalizer. The term is not normalized, but
+// the text will be truncated to max_term_byte_size if it exceeds the max size.
+//
+// Returns:
+// A normalizer on success
+// INVALID_ARGUMENT if max_term_byte_size <= 0
+// INTERNAL_ERROR on errors
+libtextclassifier3::StatusOr<std::unique_ptr<Normalizer>> Create(
+ int max_term_byte_size) {
+ if (max_term_byte_size <= 0) {
+ return absl_ports::InvalidArgumentError(
+ "max_term_byte_size must be greater than zero.");
+ }
+
+ return std::make_unique<NoneNormalizer>(max_term_byte_size);
+}
+
+} // namespace normalizer_factory
+
+} // namespace lib
+} // namespace icing
+
+#endif // ICING_TRANSFORM_SIMPLE_NONE_NORMALIZER_FACTORY_H_
diff --git a/icing/transform/none-normalizer.h b/icing/transform/simple/none-normalizer.h
index b734bef..47085e1 100644
--- a/icing/transform/none-normalizer.h
+++ b/icing/transform/simple/none-normalizer.h
@@ -12,8 +12,8 @@
// See the License for the specific language governing permissions and
// limitations under the License.
-#ifndef ICING_TRANSFORM_NONE_NORMALIZER_H_
-#define ICING_TRANSFORM_NONE_NORMALIZER_H_
+#ifndef ICING_TRANSFORM_SIMPLE_NONE_NORMALIZER_H_
+#define ICING_TRANSFORM_SIMPLE_NONE_NORMALIZER_H_
#include <string>
#include <string_view>
@@ -30,7 +30,7 @@ namespace lib {
// max_term_byte_size.
class NoneNormalizer : public Normalizer {
public:
- NoneNormalizer(int max_term_byte_size)
+ explicit NoneNormalizer(int max_term_byte_size)
: max_term_byte_size_(max_term_byte_size){};
std::string NormalizeTerm(std::string_view term) const override {
@@ -48,4 +48,4 @@ class NoneNormalizer : public Normalizer {
} // namespace lib
} // namespace icing
-#endif // ICING_TRANSFORM_NONE_NORMALIZER_H_
+#endif // ICING_TRANSFORM_SIMPLE_NONE_NORMALIZER_H_
diff --git a/icing/transform/none-normalizer_test.cc b/icing/transform/simple/none-normalizer_test.cc
index e322258..e074828 100644
--- a/icing/transform/none-normalizer_test.cc
+++ b/icing/transform/simple/none-normalizer_test.cc
@@ -27,25 +27,20 @@ namespace {
using ::testing::Eq;
TEST(NoneNormalizerTest, Creation) {
- EXPECT_THAT(
- normalizer_factory::Create(normalizer_factory::NormalizerType::NONE,
- /*max_term_byte_size=*/5),
- IsOk());
- EXPECT_THAT(
- normalizer_factory::Create(normalizer_factory::NormalizerType::NONE,
- /*max_term_byte_size=*/0),
- StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
- EXPECT_THAT(
- normalizer_factory::Create(normalizer_factory::NormalizerType::NONE,
- /*max_term_byte_size=*/-1),
- StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
+ EXPECT_THAT(normalizer_factory::Create(
+ /*max_term_byte_size=*/5),
+ IsOk());
+ EXPECT_THAT(normalizer_factory::Create(
+ /*max_term_byte_size=*/0),
+ StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
+ EXPECT_THAT(normalizer_factory::Create(
+ /*max_term_byte_size=*/-1),
+ StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
}
TEST(IcuNormalizerTest, NoNormalizationDone) {
- ICING_ASSERT_OK_AND_ASSIGN(
- auto normalizer,
- normalizer_factory::Create(normalizer_factory::NormalizerType::NONE,
- /*max_term_byte_size=*/1000));
+ ICING_ASSERT_OK_AND_ASSIGN(auto normalizer, normalizer_factory::Create(
+ /*max_term_byte_size=*/1000));
EXPECT_THAT(normalizer->NormalizeTerm(""), Eq(""));
EXPECT_THAT(normalizer->NormalizeTerm("hello world"), Eq("hello world"));
@@ -63,10 +58,8 @@ TEST(IcuNormalizerTest, NoNormalizationDone) {
}
TEST(NoneNormalizerTest, Truncate) {
- ICING_ASSERT_OK_AND_ASSIGN(
- auto normalizer,
- normalizer_factory::Create(normalizer_factory::NormalizerType::NONE,
- /*max_term_byte_size=*/5));
+ ICING_ASSERT_OK_AND_ASSIGN(auto normalizer, normalizer_factory::Create(
+ /*max_term_byte_size=*/5));
// Won't be truncated
EXPECT_THAT(normalizer->NormalizeTerm("hi"), Eq("hi"));
diff --git a/icing/util/document-validator.cc b/icing/util/document-validator.cc
index 5b588e7..36b84f8 100644
--- a/icing/util/document-validator.cc
+++ b/icing/util/document-validator.cc
@@ -72,11 +72,9 @@ libtextclassifier3::Status DocumentValidator::Validate(
const SchemaTypeConfigProto* type_config =
std::move(type_config_or).ValueOrDie();
- int32_t num_required_properties_expected = 0;
int32_t num_required_properties_actual = 0;
- PropertyConfigMap property_config_map;
- SchemaUtil::BuildPropertyConfigMap(*type_config, &property_config_map,
- &num_required_properties_expected);
+ SchemaUtil::ParsedPropertyConfigs parsed_property_configs =
+ SchemaUtil::ParsePropertyConfigs(*type_config);
std::unordered_set<std::string_view> unique_properties;
for (const PropertyProto& property : document.properties()) {
@@ -93,8 +91,9 @@ libtextclassifier3::Status DocumentValidator::Validate(
document.namespace_(), ", ", document.uri(), ")."));
}
- const auto& property_iter = property_config_map.find(property.name());
- if (property_iter == property_config_map.end()) {
+ const auto& property_iter =
+ parsed_property_configs.property_config_map.find(property.name());
+ if (property_iter == parsed_property_configs.property_config_map.end()) {
return absl_ports::NotFoundError(absl_ports::StrCat(
"Property config '", property.name(), "' not found for key: (",
document.namespace_(), ", ", document.uri(), ")."));
@@ -165,7 +164,8 @@ libtextclassifier3::Status DocumentValidator::Validate(
}
}
}
- if (num_required_properties_actual < num_required_properties_expected) {
+ if (num_required_properties_actual <
+ parsed_property_configs.num_required_properties) {
return absl_ports::InvalidArgumentError(
absl_ports::StrCat("One or more required fields missing for key: (",
document.namespace_(), ", ", document.uri(), ")."));
diff --git a/icing/util/i18n-utils.cc b/icing/util/i18n-utils.cc
index 2345339..9cf992f 100644
--- a/icing/util/i18n-utils.cc
+++ b/icing/util/i18n-utils.cc
@@ -17,6 +17,17 @@
#include <cctype>
#include <string_view>
+#include "icing/text_classifier/lib3/utils/base/statusor.h"
+#include "icing/absl_ports/canonical_errors.h"
+#include "icing/absl_ports/str_cat.h"
+#include "icing/util/logging.h"
+#include "unicode/uchar.h"
+#include "unicode/umachine.h"
+#include "unicode/ustring.h"
+#include "unicode/utf16.h"
+#include "unicode/utf8.h"
+#include "unicode/utypes.h"
+
namespace icing {
namespace lib {
namespace i18n_utils {
@@ -31,12 +42,84 @@ const std::string ascii_icu_punctuation = "!\"#%&'*,./:;?@\\_-([{}])";
} // namespace
+libtextclassifier3::StatusOr<std::string> Utf16ToUtf8(
+ const std::u16string& utf16_string) {
+ std::string utf8_string;
+ // Allocates the maximum possible UTF8 string length:
+ // 3 UTF-8 bytes per UTF16 code unit, plus one for the terminating NUL.
+ //
+ // NOTE: we need to call resize() but not reserve() because values can't be
+ // set at positions after length().
+ utf8_string.resize(utf16_string.length() * 3 + 1);
+
+ int result_length = 0;
+ UErrorCode status = U_ZERO_ERROR;
+ u_strToUTF8(&utf8_string[0], utf8_string.length(), &result_length,
+ utf16_string.data(), utf16_string.length(), &status);
+ // Corrects the length
+ utf8_string.resize(result_length);
+
+ if (U_FAILURE(status)) {
+ return absl_ports::InternalError("Failed to convert UTF16 string to UTF8");
+ }
+ return utf8_string;
+}
+
+libtextclassifier3::StatusOr<std::u16string> Utf8ToUtf16(
+ std::string_view utf8_string) {
+ std::u16string utf16_result;
+ // The UTF16 string won't be longer than its UTF8 format
+ //
+ // NOTE: we need to call resize() but not reserve() because values can't be
+ // set at positions after length().
+ utf16_result.resize(utf8_string.length());
+
+ int result_length = 0;
+ UErrorCode status = U_ZERO_ERROR;
+ u_strFromUTF8(&utf16_result[0], utf16_result.length(), &result_length,
+ utf8_string.data(), utf8_string.length(), &status);
+ // Corrects the length
+ utf16_result.resize(result_length);
+
+ if (U_FAILURE(status)) {
+ return absl_ports::InternalError(absl_ports::StrCat(
+ "Failed to convert UTF8 string '", utf8_string, "' to UTF16"));
+ }
+ return utf16_result;
+}
+
UChar32 GetUChar32At(const char* data, int length, int position) {
- // We don't handle Unicode, i.e. anything more than 1 byte.
- return data[position];
+ UChar32 uchar32;
+ U8_NEXT_OR_FFFD(data, position, length, uchar32);
+ return uchar32;
}
-bool IsAscii(char c) { return (c & 0x80) == 0; }
+void SafeTruncateUtf8(std::string* str, int truncate_to_length) {
+ if (str == nullptr || truncate_to_length >= str->length()) {
+ return;
+ }
+
+ while (truncate_to_length > 0) {
+ if (IsLeadUtf8Byte(str->at(truncate_to_length))) {
+ str->resize(truncate_to_length);
+ return;
+ }
+ truncate_to_length--;
+ }
+
+ // Truncates to an empty string
+ str->resize(0);
+}
+
+bool IsAscii(char c) { return U8_IS_SINGLE((uint8_t)c); }
+
+bool IsAscii(UChar32 c) { return U8_LENGTH(c) == 1; }
+
+int GetUtf8Length(UChar32 c) { return U8_LENGTH(c); }
+
+int GetUtf16Length(UChar32 c) { return U16_LENGTH(c); }
+
+bool IsLeadUtf8Byte(char c) { return IsAscii(c) || U8_IS_LEAD((uint8_t)c); }
bool IsPunctuationAt(std::string_view input, int position, int* char_len_out) {
if (IsAscii(input[position])) {
@@ -45,18 +128,43 @@ bool IsPunctuationAt(std::string_view input, int position, int* char_len_out) {
}
return ascii_icu_punctuation.find(input[position]) != std::string::npos;
}
-
- // If it's not ASCII, we can't process Unicode so we don't know.
- return false;
+ UChar32 c = GetUChar32At(input.data(), input.length(), position);
+ if (char_len_out != nullptr) {
+ *char_len_out = U8_LENGTH(c);
+ }
+ return u_ispunct(c);
}
bool IsWhitespaceAt(std::string_view input, int position) {
if (IsAscii(input[position])) {
return std::isspace(input[position]);
}
+ UChar32 c = GetUChar32At(input.data(), input.length(), position);
+ return u_isUWhiteSpace(c);
+}
+
+bool IsAlphabeticAt(std::string_view input, int position) {
+ if (IsAscii(input[position])) {
+ return std::isalpha(input[position]);
+ }
+ UChar32 c = GetUChar32At(input.data(), input.length(), position);
+ return u_isUAlphabetic(c);
+}
- // If it's not ASCII, we can't process Unicode so we don't know.
- return false;
+void AppendUchar32ToUtf8(std::string* utf8_string, UChar32 uchar) {
+ uint8_t utf8_buffer[4]; // U8_APPEND writes 0 to 4 bytes
+
+ int utf8_index = 0;
+ UBool has_error = FALSE;
+
+ // utf8_index is advanced to the end of the contents if successful
+ U8_APPEND(utf8_buffer, utf8_index, sizeof(utf8_buffer), uchar, has_error);
+
+ if (has_error) {
+ ICING_LOG(WARNING) << "Error appending UChar32 to the UTF8 string.";
+ return;
+ }
+ utf8_string->append(reinterpret_cast<char*>(utf8_buffer), utf8_index);
}
} // namespace i18n_utils
diff --git a/icing/util/i18n-utils.h b/icing/util/i18n-utils.h
index 141b9af..e103bab 100644
--- a/icing/util/i18n-utils.h
+++ b/icing/util/i18n-utils.h
@@ -18,43 +18,60 @@
#include <string>
#include <string_view>
+#include "icing/text_classifier/lib3/utils/base/statusor.h"
+#include "unicode/umachine.h"
+
namespace icing {
namespace lib {
-// These are included for uses when we don't have access to ICU.
-//
-// Defined in ICU;
-// https://unicode-org.github.io/icu-docs/apidoc/released/icu4c/umachine_8h.html#a09fff5c3b5a5b015324dc3ec3cf92809
-using UChar32 = int32_t;
-
-// Defined in ICU:
-// https://unicode-org.github.io/icu-docs/apidoc/released/icu4c/utf8_8h.html#aa2298b48749d9f45772c8f5a6885464a
-#define U8_MAX_LENGTH 4
-
-// Defined in ICU:
-// https://unicode-org.github.io/icu-docs/apidoc/released/icu4c/uloc_8h.html#aa55404d3c725af4e05e65e5b40a6e13d
-#define ULOC_US "en_US"
-
// Internationalization utils that use standard utilities or custom code. Does
-// not require any special dependencies, i.e. for use when the library is NOT
-// guaranteed to have access to ICU.
-//
-// Note: This does not handle Unicode.
-//
-// TODO(cassiewang): Figure out if we want to keep this file as a non-ICU
-// solution long-term, or if we'll do something along the lines of reverse-jni,
-// etc.
+// not require any special dependencies, such as data files for ICU.
namespace i18n_utils {
// An invalid value defined by Unicode.
static constexpr UChar32 kInvalidUChar32 = 0xFFFD;
+// Converts a UTF16 string to a UTF8 string.
+//
+// Returns:
+// A UTF8 string on success
+// INTERNAL_ERROR on any failures
+libtextclassifier3::StatusOr<std::string> Utf16ToUtf8(
+ const std::u16string& utf16_string);
+
+// Converts a UTF8 string to a UTF16 string.
+//
+// Returns:
+// A UTF16 string on success
+// INTERNAL_ERROR on any failures
+libtextclassifier3::StatusOr<std::u16string> Utf8ToUtf16(
+ std::string_view utf8_string);
+
// Returns the char at the given position.
UChar32 GetUChar32At(const char* data, int length, int position);
+// Safely truncates a UTF8 string so that multi-byte UTF8 characters are not cut
+// in the middle. The string will be truncated in place.
+void SafeTruncateUtf8(std::string* str, int truncate_to_length);
+
// Checks if the single char is within ASCII range.
bool IsAscii(char c);
+// Checks if the Unicode char is within ASCII range.
+bool IsAscii(UChar32 c);
+
+// Returns how many code units (char) are used for the UTF-8 encoding of this
+// Unicode character. Returns 0 if not valid.
+int GetUtf8Length(UChar32 c);
+
+// Returns how many code units (char16_t) are used for the UTF-16 encoding of
+// this Unicode character. Returns 0 if not valid.
+int GetUtf16Length(UChar32 c);
+
+// Checks if the single char is the first byte of a UTF8 character, note
+// that a single ASCII char is also considered a lead byte.
+bool IsLeadUtf8Byte(char c);
+
// Checks if the character at position is punctuation. Assigns the length of the
// character at position to *char_len_out if the character at position is valid
// punctuation and char_len_out is not null.
@@ -64,6 +81,11 @@ bool IsPunctuationAt(std::string_view input, int position,
// Checks if the character at position is a whitespace.
bool IsWhitespaceAt(std::string_view input, int position);
+// Checks if the character at position is a whitespace.
+bool IsAlphabeticAt(std::string_view input, int position);
+
+void AppendUchar32ToUtf8(std::string* utf8_string, UChar32 uchar);
+
} // namespace i18n_utils
} // namespace lib
} // namespace icing
diff --git a/icing/util/i18n-utils_test.cc b/icing/util/i18n-utils_test.cc
new file mode 100644
index 0000000..a1e8d4e
--- /dev/null
+++ b/icing/util/i18n-utils_test.cc
@@ -0,0 +1,141 @@
+// Copyright (C) 2019 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "icing/util/i18n-utils.h"
+
+#include <memory>
+
+#include "gmock/gmock.h"
+#include "gtest/gtest.h"
+#include "unicode/uchar.h"
+
+namespace icing {
+namespace lib {
+namespace {
+
+using ::testing::Eq;
+
+TEST(IcuI18nUtilsTest, IsPunctuationAtSameAsIcuIsPunct) {
+ // Iterate through ASCII values
+ for (int i = 0; i <= 127; ++i) {
+ char ascii = i;
+
+ std::string ascii_string = "";
+ ascii_string.push_back(ascii);
+
+ EXPECT_EQ(i18n_utils::IsPunctuationAt(ascii_string, /*position=*/0),
+
+ u_ispunct(ascii));
+ }
+}
+
+TEST(IcuI18nUtilsTest, IsAlphabeticAt) {
+ // Test alphabetic and non-alphabetic ascii characters
+ constexpr std::string_view kSomeAscii = "iJ?9";
+ EXPECT_TRUE(i18n_utils::IsAlphabeticAt(kSomeAscii, /*position=*/0)); // 'i'
+ EXPECT_TRUE(i18n_utils::IsAlphabeticAt(kSomeAscii, /*position=*/1)); // 'J'
+ EXPECT_FALSE(i18n_utils::IsAlphabeticAt(kSomeAscii, /*position=*/2)); // '?'
+ EXPECT_FALSE(i18n_utils::IsAlphabeticAt(kSomeAscii, /*position=*/3)); // '9'
+
+ constexpr std::string_view kSomeNonAscii = "👏ñ①カ";
+ EXPECT_FALSE(
+ i18n_utils::IsAlphabeticAt(kSomeNonAscii, /*position=*/0)); // '👏'
+ EXPECT_EQ(i18n_utils::GetUtf8Length(i18n_utils::GetUChar32At(
+ kSomeNonAscii.data(), kSomeNonAscii.length(), 0)),
+ 4);
+ EXPECT_TRUE(
+ i18n_utils::IsAlphabeticAt(kSomeNonAscii, /*position=*/4)); // 'ñ'
+ EXPECT_EQ(i18n_utils::GetUtf8Length(i18n_utils::GetUChar32At(
+ kSomeNonAscii.data(), kSomeNonAscii.length(), 4)),
+ 2);
+ EXPECT_FALSE(
+ i18n_utils::IsAlphabeticAt(kSomeNonAscii, /*position=*/6)); // '①'
+ EXPECT_EQ(i18n_utils::GetUtf8Length(i18n_utils::GetUChar32At(
+ kSomeNonAscii.data(), kSomeNonAscii.length(), 6)),
+ 3);
+ EXPECT_TRUE(
+ i18n_utils::IsAlphabeticAt(kSomeNonAscii, /*position=*/9)); // 'カ'
+ EXPECT_EQ(i18n_utils::GetUtf8Length(i18n_utils::GetUChar32At(
+ kSomeNonAscii.data(), kSomeNonAscii.length(), 9)),
+ 3);
+}
+
+TEST(IcuI18nUtilsTest, GetUtf8Length) {
+ // Test alphabetic and non-alphabetic ascii characters
+ constexpr std::string_view kSomeAscii = "iJ?9";
+ EXPECT_EQ(i18n_utils::GetUtf8Length(i18n_utils::GetUChar32At(
+ kSomeAscii.data(), kSomeAscii.length(), 0)),
+ 1); // 'i'
+ EXPECT_EQ(i18n_utils::GetUtf8Length(i18n_utils::GetUChar32At(
+ kSomeAscii.data(), kSomeAscii.length(), 1)),
+ 1); // 'J'
+ EXPECT_EQ(i18n_utils::GetUtf8Length(i18n_utils::GetUChar32At(
+ kSomeAscii.data(), kSomeAscii.length(), 2)),
+ 1); // '?'
+ EXPECT_EQ(i18n_utils::GetUtf8Length(i18n_utils::GetUChar32At(
+ kSomeAscii.data(), kSomeAscii.length(), 3)),
+ 1); // '9'
+
+ constexpr std::string_view kSomeNonAscii = "👏ñ①カ";
+ EXPECT_EQ(i18n_utils::GetUtf8Length(i18n_utils::GetUChar32At(
+ kSomeNonAscii.data(), kSomeNonAscii.length(), 0)),
+ 4); // '👏'
+ EXPECT_EQ(i18n_utils::GetUtf8Length(i18n_utils::GetUChar32At(
+ kSomeNonAscii.data(), kSomeNonAscii.length(), 4)),
+ 2); // 'ñ'
+ EXPECT_EQ(i18n_utils::GetUtf8Length(i18n_utils::GetUChar32At(
+ kSomeNonAscii.data(), kSomeNonAscii.length(), 6)),
+ 3); // '①'
+ EXPECT_EQ(i18n_utils::GetUtf8Length(i18n_utils::GetUChar32At(
+ kSomeNonAscii.data(), kSomeNonAscii.length(), 9)),
+ 3); // 'カ'
+}
+
+TEST(IcuI18nUtilsTest, SafeTruncate) {
+ // Test alphabetic and non-alphabetic ascii characters
+ constexpr std::string_view kSomeAscii = "iJ?9";
+ std::string truncated(kSomeAscii);
+ i18n_utils::SafeTruncateUtf8(&truncated, kSomeAscii.length() + 1);
+ EXPECT_THAT(truncated, Eq("iJ?9"));
+ truncated = kSomeAscii;
+ i18n_utils::SafeTruncateUtf8(&truncated, kSomeAscii.length());
+ EXPECT_THAT(truncated, Eq("iJ?9"));
+ truncated = kSomeAscii;
+ i18n_utils::SafeTruncateUtf8(&truncated, kSomeAscii.length() - 1);
+ EXPECT_THAT(truncated, Eq("iJ?"));
+
+ constexpr std::string_view kSomeNonAscii = "👏ñ①カ";
+ truncated = kSomeNonAscii;
+ i18n_utils::SafeTruncateUtf8(&truncated, kSomeNonAscii.length() + 1);
+ EXPECT_THAT(truncated, Eq("👏ñ①カ"));
+ truncated = kSomeNonAscii;
+ i18n_utils::SafeTruncateUtf8(&truncated, kSomeNonAscii.length());
+ EXPECT_THAT(truncated, Eq("👏ñ①カ"));
+ truncated = kSomeNonAscii;
+ i18n_utils::SafeTruncateUtf8(&truncated, kSomeNonAscii.length() - 1);
+ EXPECT_THAT(truncated, Eq("👏ñ①"));
+ truncated = kSomeNonAscii;
+ i18n_utils::SafeTruncateUtf8(&truncated, kSomeNonAscii.length() - 2);
+ EXPECT_THAT(truncated, Eq("👏ñ①"));
+ truncated = kSomeNonAscii;
+ i18n_utils::SafeTruncateUtf8(&truncated, kSomeNonAscii.length() - 3);
+ EXPECT_THAT(truncated, Eq("👏ñ①"));
+ truncated = kSomeNonAscii;
+ i18n_utils::SafeTruncateUtf8(&truncated, kSomeNonAscii.length() - 4);
+ EXPECT_THAT(truncated, Eq("👏ñ"));
+}
+
+} // namespace
+} // namespace lib
+} // namespace icing
diff --git a/icing/util/icu-i18n-utils.cc b/icing/util/icu-i18n-utils.cc
deleted file mode 100644
index 89e4eec..0000000
--- a/icing/util/icu-i18n-utils.cc
+++ /dev/null
@@ -1,176 +0,0 @@
-// Copyright (C) 2019 Google LLC
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "icing/util/icu-i18n-utils.h"
-
-#include <cctype>
-#include <string>
-#include <string_view>
-
-#include "icing/text_classifier/lib3/utils/base/statusor.h"
-#include "icing/absl_ports/canonical_errors.h"
-#include "icing/absl_ports/str_cat.h"
-#include "unicode/umachine.h"
-#include "unicode/unorm2.h"
-#include "unicode/ustring.h"
-#include "unicode/utf8.h"
-
-namespace icing {
-namespace lib {
-namespace icu_i18n_utils {
-
-namespace {
-
-// All ASCII punctuation that's also in a Unicode Punctuation category
-// (https://www.fileformat.info/info/unicode/category/index.htm). The set of
-// characters that are regarded as punctuation is not the same for std::ispunct
-// and u_ispunct.
-const std::string ascii_icu_punctuation = "!\"#%&'*,./:;?@\\_-([{}])";
-
-} // namespace
-
-libtextclassifier3::StatusOr<std::string> Utf16ToUtf8(
- const std::u16string& utf16_string) {
- std::string utf8_string;
- // Allocates the maximum possible UTF8 string length:
- // 3 UTF-8 bytes per UTF16 code unit, plus one for the terminating NUL.
- //
- // NOTE: we need to call resize() but not reserve() because values can't be
- // set at positions after length().
- utf8_string.resize(utf16_string.length() * 3 + 1);
-
- int result_length = 0;
- UErrorCode status = U_ZERO_ERROR;
- u_strToUTF8(&utf8_string[0], utf8_string.length(), &result_length,
- utf16_string.data(), utf16_string.length(), &status);
- // Corrects the length
- utf8_string.resize(result_length);
-
- if (U_FAILURE(status)) {
- return absl_ports::InternalError("Failed to convert UTF16 string to UTF8");
- }
- return utf8_string;
-}
-
-libtextclassifier3::StatusOr<std::u16string> Utf8ToUtf16(
- std::string_view utf8_string) {
- std::u16string utf16_result;
- // The UTF16 string won't be longer than its UTF8 format
- //
- // NOTE: we need to call resize() but not reserve() because values can't be
- // set at positions after length().
- utf16_result.resize(utf8_string.length());
-
- int result_length = 0;
- UErrorCode status = U_ZERO_ERROR;
- u_strFromUTF8(&utf16_result[0], utf16_result.length(), &result_length,
- utf8_string.data(), utf8_string.length(), &status);
- // Corrects the length
- utf16_result.resize(result_length);
-
- if (U_FAILURE(status)) {
- return absl_ports::InternalError(absl_ports::StrCat(
- "Failed to convert UTF8 string '", utf8_string, "' to UTF16"));
- }
- return utf16_result;
-}
-
-UChar32 GetUChar32At(const char* data, int length, int position) {
- UChar32 uchar32;
- U8_NEXT_OR_FFFD(data, position, length, uchar32);
- return uchar32;
-}
-
-void SafeTruncateUtf8(std::string* str, int truncate_to_length) {
- if (str == nullptr || truncate_to_length >= str->length()) {
- return;
- }
-
- while (truncate_to_length > 0) {
- if (IsLeadUtf8Byte(str->at(truncate_to_length))) {
- str->resize(truncate_to_length);
- return;
- }
- truncate_to_length--;
- }
-
- // Truncates to an empty string
- str->resize(0);
-}
-
-bool IsAscii(char c) { return U8_IS_SINGLE((uint8_t)c); }
-
-bool IsAscii(UChar32 c) { return U8_LENGTH(c) == 1; }
-
-int GetUtf8Length(UChar32 c) { return U8_LENGTH(c); }
-
-bool IsLeadUtf8Byte(char c) { return IsAscii(c) || U8_IS_LEAD((uint8_t)c); }
-
-bool IsPunctuationAt(std::string_view input, int position, int* char_len_out) {
- if (IsAscii(input[position])) {
- if (char_len_out != nullptr) {
- *char_len_out = 1;
- }
- return ascii_icu_punctuation.find(input[position]) != std::string::npos;
- }
- UChar32 c = GetUChar32At(input.data(), input.length(), position);
- if (char_len_out != nullptr) {
- *char_len_out = U8_LENGTH(c);
- }
- return u_ispunct(c);
-}
-
-bool IsWhitespaceAt(std::string_view input, int position) {
- if (IsAscii(input[position])) {
- return std::isspace(input[position]);
- }
- UChar32 c = GetUChar32At(input.data(), input.length(), position);
- return u_isUWhiteSpace(c);
-}
-
-bool DiacriticCharToAscii(const UNormalizer2* normalizer2, UChar32 uchar32_in,
- char* char_out) {
- if (IsAscii(uchar32_in)) {
- // The Unicode character is within ASCII range
- if (char_out != nullptr) {
- *char_out = uchar32_in;
- }
- return true;
- }
-
- // Maximum number of pieces a Unicode character can be decomposed into.
- // TODO(samzheng) figure out if this number is proper.
- constexpr int kDecompositionBufferCapacity = 5;
-
- // A buffer used to store Unicode decomposition mappings of only one
- // character.
- UChar decomposition_buffer[kDecompositionBufferCapacity];
-
- // Decomposes the Unicode character, trying to get an ASCII char and some
- // diacritic chars.
- UErrorCode status = U_ZERO_ERROR;
- if (unorm2_getDecomposition(normalizer2, uchar32_in, &decomposition_buffer[0],
- kDecompositionBufferCapacity, &status) > 0 &&
- !U_FAILURE(status) && icu_i18n_utils::IsAscii(decomposition_buffer[0])) {
- if (char_out != nullptr) {
- *char_out = decomposition_buffer[0];
- }
- return true;
- }
- return false;
-}
-
-} // namespace icu_i18n_utils
-} // namespace lib
-} // namespace icing
diff --git a/icing/util/icu-i18n-utils.h b/icing/util/icu-i18n-utils.h
deleted file mode 100644
index 4d29cf0..0000000
--- a/icing/util/icu-i18n-utils.h
+++ /dev/null
@@ -1,105 +0,0 @@
-// Copyright (C) 2019 Google LLC
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#ifndef ICING_UTIL_ICU_I18N_UTILS_H_
-#define ICING_UTIL_ICU_I18N_UTILS_H_
-
-#include <string>
-#include <string_view>
-
-#include "icing/text_classifier/lib3/utils/base/statusor.h"
-#include "unicode/umachine.h"
-#include "unicode/unorm2.h"
-
-// Rely on this to transitively have access to U8_MAX_LENGTH, so all users can
-// depend on either icu-i18n-utils or i18n-utils.
-#include "unicode/utf8.h"
-
-// Rely on this to transitively have access to ULOC_US, so all users can depend
-// on either icu-i18n-utils or i18n-utils.
-#include "unicode/uloc.h"
-
-namespace icing {
-namespace lib {
-
-// Internationalization utils that use ICU methods under the hood. For use when
-// the library is guaranteed to have access to ICU.
-namespace icu_i18n_utils {
-
-// An invalid value defined by Unicode.
-static constexpr UChar32 kInvalidUChar32 = 0xFFFD;
-
-// Converts a UTF16 string to a UTF8 string.
-//
-// Returns:
-// A UTF8 string on success
-// INTERNAL_ERROR on any failures
-libtextclassifier3::StatusOr<std::string> Utf16ToUtf8(
- const std::u16string& utf16_string);
-
-// Converts a UTF8 string to a UTF16 string.
-//
-// Returns:
-// A UTF16 string on success
-// INTERNAL_ERROR on any failures
-libtextclassifier3::StatusOr<std::u16string> Utf8ToUtf16(
- std::string_view utf8_string);
-
-// Returns the Unicode char at the given position. If anything wrong happens, an
-// invalid value 0xFFFD is returned.
-UChar32 GetUChar32At(const char* data, int length, int position);
-
-// Safely truncates a UTF8 string so that multi-byte UTF8 characters are not cut
-// in the middle. The string will be truncated in place.
-void SafeTruncateUtf8(std::string* str, int truncate_to_length);
-
-// Checks if the single char is within ASCII range.
-bool IsAscii(char c);
-
-// Checks if the Unicode char is within ASCII range.
-bool IsAscii(UChar32 c);
-
-// Returns how many code units (bytes) are used for the UTF-8 encoding of this
-// Unicode character. Returns 0 if not valid.
-int GetUtf8Length(UChar32 c);
-
-// Checks if the single char is the first byte of a UTF8 character, note
-// that a single ASCII char is also considered a lead byte.
-bool IsLeadUtf8Byte(char c);
-
-// Checks if the character at position is punctuation. Assigns the length of the
-// character at position to *char_len_out if the character at position is valid
-// punctuation and char_len_out is not null.
-bool IsPunctuationAt(std::string_view input, int position,
- int* char_len_out = nullptr);
-
-// Checks if the character at position is a whitespace.
-bool IsWhitespaceAt(std::string_view input, int position);
-
-// Transforms a Unicode character with diacritics to its counterpart in ASCII
-// range. E.g. "ü" -> "u". Result will be set to char_out. Returns true if
-// the transformation is successful.
-//
-// NOTE: According to our convention this function should have returned
-// StatusOr<char>. However, this function is performance-sensitive because is
-// could be called on every Latin character in normalization, so we make it
-// return a bool here to save a bit more time and memory.
-bool DiacriticCharToAscii(const UNormalizer2* normalizer2, UChar32 uchar32_in,
- char* char_out);
-
-} // namespace icu_i18n_utils
-} // namespace lib
-} // namespace icing
-
-#endif // ICING_UTIL_ICU_I18N_UTILS_H_