diff options
author | Tim Barron <tjbarron@google.com> | 2020-06-05 13:55:31 -0700 |
---|---|---|
committer | Tim Barron <tjbarron@google.com> | 2020-06-05 14:04:31 -0700 |
commit | a4a63ec8e7e70912ef04019e7dc9f3c3ecf2eabf (patch) | |
tree | 090955adb6f2abfc09f5275d6bab35a2c0d74198 /icing | |
parent | 79321d1f286ac650cc99fcf795a67c5dde8c0597 (diff) | |
download | icing-a4a63ec8e7e70912ef04019e7dc9f3c3ecf2eabf.tar.gz |
Copy over changes made to Google3 codebase in Icing.
Change-Id: Ia36edb0a1b085e249dabfc220a5b72418063604f
Diffstat (limited to 'icing')
102 files changed, 7414 insertions, 1342 deletions
diff --git a/icing/file/file-backed-proto-log.h b/icing/file/file-backed-proto-log.h index 0b36e18..62943b8 100644 --- a/icing/file/file-backed-proto-log.h +++ b/icing/file/file-backed-proto-log.h @@ -210,13 +210,23 @@ class FileBackedProtoLog { // INTERNAL_ERROR on IO error libtextclassifier3::StatusOr<ProtoT> ReadProto(int64_t file_offset) const; - // Calculates and returns the disk usage in bytes. + // Calculates and returns the disk usage in bytes. Rounds up to the nearest + // block size. // // Returns: // Disk usage on success // INTERNAL_ERROR on IO error libtextclassifier3::StatusOr<int64_t> GetDiskUsage() const; + // Returns the file size of all the elements held in the log. File size is in + // bytes. This excludes the size of any internal metadata of the log, e.g. the + // log's header. + // + // Returns: + // File size on success + // INTERNAL_ERROR on IO error + libtextclassifier3::StatusOr<int64_t> GetElementsFileSize() const; + // An iterator helping to find offsets of all the protos in file. 
// Example usage: // @@ -736,6 +746,17 @@ libtextclassifier3::StatusOr<int64_t> FileBackedProtoLog<ProtoT>::GetDiskUsage() } template <typename ProtoT> +libtextclassifier3::StatusOr<int64_t> +FileBackedProtoLog<ProtoT>::GetElementsFileSize() const { + int64_t total_file_size = filesystem_->GetFileSize(file_path_.c_str()); + if (total_file_size == Filesystem::kBadFileSize) { + return absl_ports::InternalError( + "Failed to get file size of elments in the proto log"); + } + return total_file_size - sizeof(Header); +} + +template <typename ProtoT> FileBackedProtoLog<ProtoT>::Iterator::Iterator(const Filesystem& filesystem, const std::string& file_path, int64_t initial_offset) diff --git a/icing/file/file-backed-vector.h b/icing/file/file-backed-vector.h index f13b67b..27d03b2 100644 --- a/icing/file/file-backed-vector.h +++ b/icing/file/file-backed-vector.h @@ -194,13 +194,23 @@ class FileBackedVector { // INTERNAL on I/O error libtextclassifier3::Status PersistToDisk(); - // Calculates and returns the disk usage in bytes. + // Calculates and returns the disk usage in bytes. Rounds up to the nearest + // block size. // // Returns: // Disk usage on success // INTERNAL_ERROR on IO error libtextclassifier3::StatusOr<int64_t> GetDiskUsage() const; + // Returns the file size of the all the elements held in the vector. File size + // is in bytes. This excludes the size of any internal metadata of the vector, + // e.g. the vector's header. + // + // Returns: + // File size on success + // INTERNAL_ERROR on IO error + libtextclassifier3::StatusOr<int64_t> GetElementsFileSize() const; + // Accessors. 
const T* array() const { return reinterpret_cast<const T*>(mmapped_file_->region()); @@ -705,6 +715,17 @@ libtextclassifier3::StatusOr<int64_t> FileBackedVector<T>::GetDiskUsage() return size; } +template <typename T> +libtextclassifier3::StatusOr<int64_t> FileBackedVector<T>::GetElementsFileSize() + const { + int64_t total_file_size = filesystem_->GetFileSize(file_path_.c_str()); + if (total_file_size == Filesystem::kBadFileSize) { + return absl_ports::InternalError( + "Failed to get file size of elements in the file-backed vector"); + } + return total_file_size - sizeof(Header); +} + } // namespace lib } // namespace icing diff --git a/icing/icu-data-file-helper.cc b/icing/helpers/icu/icu-data-file-helper.cc index 9741dbb..5cf6a1d 100644 --- a/icing/icu-data-file-helper.cc +++ b/icing/helpers/icu/icu-data-file-helper.cc @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "icing/icu-data-file-helper.h" +#include "icing/helpers/icu/icu-data-file-helper.h" #include <sys/mman.h> diff --git a/icing/icu-data-file-helper.h b/icing/helpers/icu/icu-data-file-helper.h index e92491d..90f5bc7 100644 --- a/icing/icu-data-file-helper.h +++ b/icing/helpers/icu/icu-data-file-helper.h @@ -12,8 +12,8 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#ifndef ICING_ICU_DATA_FILE_HELPER -#define ICING_ICU_DATA_FILE_HELPER +#ifndef ICING_HELPERS_ICU_ICU_DATA_FILE_HELPER +#define ICING_HELPERS_ICU_ICU_DATA_FILE_HELPER #include "icing/text_classifier/lib3/utils/base/status.h" @@ -40,4 +40,4 @@ libtextclassifier3::Status SetUpICUDataFile( } // namespace lib } // namespace icing -#endif // ICING_ICU_DATA_FILE_HELPER +#endif // ICING_HELPERS_ICU_ICU_DATA_FILE_HELPER diff --git a/icing/icing-search-engine.cc b/icing/icing-search-engine.cc index 6dc535c..01a2922 100644 --- a/icing/icing-search-engine.cc +++ b/icing/icing-search-engine.cc @@ -194,19 +194,22 @@ void TransformStatus(const libtextclassifier3::Status& internal_status, } // namespace -IcingSearchEngine::IcingSearchEngine(const IcingSearchEngineOptions& options) +IcingSearchEngine::IcingSearchEngine(const IcingSearchEngineOptions& options, + std::unique_ptr<const JniCache> jni_cache) : IcingSearchEngine(options, std::make_unique<Filesystem>(), - std::make_unique<Clock>()) {} + std::make_unique<Clock>(), std::move(jni_cache)) {} IcingSearchEngine::IcingSearchEngine( IcingSearchEngineOptions options, - std::unique_ptr<const Filesystem> filesystem, std::unique_ptr<Clock> clock) + std::unique_ptr<const Filesystem> filesystem, std::unique_ptr<Clock> clock, + std::unique_ptr<const JniCache> jni_cache) : options_(std::move(options)), filesystem_(std::move(filesystem)), icing_filesystem_(std::make_unique<IcingFilesystem>()), clock_(std::move(clock)), result_state_manager_(performance_configuration_.max_num_hits_per_query, - performance_configuration_.max_num_cache_results) { + performance_configuration_.max_num_cache_results), + jni_cache_(std::move(jni_cache)) { ICING_VLOG(1) << "Creating IcingSearchEngine in dir: " << options_.base_dir(); } @@ -220,23 +223,25 @@ IcingSearchEngine::~IcingSearchEngine() { } InitializeResultProto IcingSearchEngine::Initialize() { + // This method does both read and write so we need a writer lock. 
Using two + // locks (reader and writer) has the chance to be interrupted during + // switching. + absl_ports::unique_lock l(&mutex_); + return InternalInitialize(); +} + +InitializeResultProto IcingSearchEngine::InternalInitialize() { ICING_VLOG(1) << "Initializing IcingSearchEngine in dir: " << options_.base_dir(); InitializeResultProto result_proto; StatusProto* result_status = result_proto.mutable_status(); - if (initialized_) { // Already initialized. result_status->set_code(StatusProto::OK); return result_proto; } - // This method does both read and write so we need a writer lock. Using two - // locks (reader and writer) has the chance to be interrupted during - // switching. - absl_ports::unique_lock l(&mutex_); - // Releases result / query cache if any result_state_manager_.InvalidateAllResultStates(); @@ -269,14 +274,14 @@ libtextclassifier3::Status IcingSearchEngine::InitializeMembers() { ICING_RETURN_IF_ERROR(InitializeSchemaStore()); ICING_RETURN_IF_ERROR(InitializeDocumentStore()); - TC3_ASSIGN_OR_RETURN( - language_segmenter_, - language_segmenter_factory::Create(language_segmenter_factory::ICU4C)); + // TODO(b/156383798) : Resolve how to specify the locale. 
+ language_segmenter_factory::SegmenterOptions segmenter_options( + ULOC_US, jni_cache_.get()); + TC3_ASSIGN_OR_RETURN(language_segmenter_, language_segmenter_factory::Create( + std::move(segmenter_options))); - TC3_ASSIGN_OR_RETURN( - normalizer_, - normalizer_factory::Create(normalizer_factory::NormalizerType::ICU4C, - options_.max_token_length())); + TC3_ASSIGN_OR_RETURN(normalizer_, + normalizer_factory::Create(options_.max_token_length())); ICING_RETURN_IF_ERROR(InitializeIndex()); @@ -416,14 +421,19 @@ SetSchemaResultProto IcingSearchEngine::SetSchema( SetSchemaResultProto result_proto; StatusProto* result_status = result_proto.mutable_status(); + absl_ports::unique_lock l(&mutex_); + if (!initialized_) { + result_status->set_code(StatusProto::FAILED_PRECONDITION); + result_status->set_message("IcingSearchEngine has not been initialized!"); + return result_proto; + } + libtextclassifier3::Status status = SchemaUtil::Validate(new_schema); if (!status.ok()) { TransformStatus(status, result_status); return result_proto; } - absl_ports::unique_lock l(&mutex_); - auto lost_previous_schema_or = LostPreviousSchema(); if (!lost_previous_schema_or.ok()) { TransformStatus(lost_previous_schema_or.status(), result_status); @@ -498,6 +508,11 @@ GetSchemaResultProto IcingSearchEngine::GetSchema() { StatusProto* result_status = result_proto.mutable_status(); absl_ports::shared_lock l(&mutex_); + if (!initialized_) { + result_status->set_code(StatusProto::FAILED_PRECONDITION); + result_status->set_message("IcingSearchEngine has not been initialized!"); + return result_proto; + } auto schema_or = schema_store_->GetSchema(); if (!schema_or.ok()) { @@ -516,6 +531,11 @@ GetSchemaTypeResultProto IcingSearchEngine::GetSchemaType( StatusProto* result_status = result_proto.mutable_status(); absl_ports::shared_lock l(&mutex_); + if (!initialized_) { + result_status->set_code(StatusProto::FAILED_PRECONDITION); + result_status->set_message("IcingSearchEngine has not been 
initialized!"); + return result_proto; + } auto type_config_or = schema_store_->GetSchemaTypeConfig(schema_type); if (!type_config_or.ok()) { @@ -542,6 +562,11 @@ PutResultProto IcingSearchEngine::Put(DocumentProto&& document) { // the schema file to validate, and the schema could be changed in // SetSchema() which is protected by the same mutex. absl_ports::unique_lock l(&mutex_); + if (!initialized_) { + result_status->set_code(StatusProto::FAILED_PRECONDITION); + result_status->set_message("IcingSearchEngine has not been initialized!"); + return result_proto; + } auto document_id_or = document_store_->Put(document); if (!document_id_or.ok()) { @@ -576,6 +601,11 @@ GetResultProto IcingSearchEngine::Get(const std::string_view name_space, StatusProto* result_status = result_proto.mutable_status(); absl_ports::shared_lock l(&mutex_); + if (!initialized_) { + result_status->set_code(StatusProto::FAILED_PRECONDITION); + result_status->set_message("IcingSearchEngine has not been initialized!"); + return result_proto; + } auto document_or = document_store_->Get(name_space, uri); if (!document_or.ok()) { @@ -596,6 +626,11 @@ DeleteResultProto IcingSearchEngine::Delete(const std::string_view name_space, StatusProto* result_status = result_proto.mutable_status(); absl_ports::unique_lock l(&mutex_); + if (!initialized_) { + result_status->set_code(StatusProto::FAILED_PRECONDITION); + result_status->set_message("IcingSearchEngine has not been initialized!"); + return result_proto; + } // TODO(b/144458732): Implement a more robust version of TC_RETURN_IF_ERROR // that can support error logging. 
@@ -616,14 +651,20 @@ DeleteByNamespaceResultProto IcingSearchEngine::DeleteByNamespace( const std::string_view name_space) { ICING_VLOG(1) << "Deleting namespace from doc store"; + DeleteByNamespaceResultProto delete_result; + StatusProto* result_status = delete_result.mutable_status(); absl_ports::unique_lock l(&mutex_); + if (!initialized_) { + result_status->set_code(StatusProto::FAILED_PRECONDITION); + result_status->set_message("IcingSearchEngine has not been initialized!"); + return delete_result; + } // TODO(b/144458732): Implement a more robust version of TC_RETURN_IF_ERROR // that can support error logging. libtextclassifier3::Status status = document_store_->DeleteByNamespace(name_space); - DeleteByNamespaceResultProto delete_result; - TransformStatus(status, delete_result.mutable_status()); + TransformStatus(status, result_status); if (!status.ok()) { ICING_LOG(ERROR) << status.error_message() << "Failed to delete Namespace: " << name_space; @@ -636,14 +677,20 @@ DeleteBySchemaTypeResultProto IcingSearchEngine::DeleteBySchemaType( const std::string_view schema_type) { ICING_VLOG(1) << "Deleting type from doc store"; + DeleteBySchemaTypeResultProto delete_result; + StatusProto* result_status = delete_result.mutable_status(); absl_ports::unique_lock l(&mutex_); + if (!initialized_) { + result_status->set_code(StatusProto::FAILED_PRECONDITION); + result_status->set_message("IcingSearchEngine has not been initialized!"); + return delete_result; + } // TODO(b/144458732): Implement a more robust version of TC_RETURN_IF_ERROR // that can support error logging. 
libtextclassifier3::Status status = document_store_->DeleteBySchemaType(schema_type); - DeleteBySchemaTypeResultProto delete_result; - TransformStatus(status, delete_result.mutable_status()); + TransformStatus(status, result_status); if (!status.ok()) { ICING_LOG(ERROR) << status.error_message() << "Failed to delete SchemaType: " << schema_type; @@ -659,6 +706,11 @@ PersistToDiskResultProto IcingSearchEngine::PersistToDisk() { StatusProto* result_status = result_proto.mutable_status(); absl_ports::unique_lock l(&mutex_); + if (!initialized_) { + result_status->set_code(StatusProto::FAILED_PRECONDITION); + result_status->set_message("IcingSearchEngine has not been initialized!"); + return result_proto; + } auto status = InternalPersistToDisk(); TransformStatus(status, result_status); @@ -678,6 +730,11 @@ OptimizeResultProto IcingSearchEngine::Optimize() { StatusProto* result_status = result_proto.mutable_status(); absl_ports::unique_lock l(&mutex_); + if (!initialized_) { + result_status->set_code(StatusProto::FAILED_PRECONDITION); + result_status->set_message("IcingSearchEngine has not been initialized!"); + return result_proto; + } // Releases result / query cache if any result_state_manager_.InvalidateAllResultStates(); @@ -729,6 +786,54 @@ OptimizeResultProto IcingSearchEngine::Optimize() { return result_proto; } +GetOptimizeInfoResultProto IcingSearchEngine::GetOptimizeInfo() { + ICING_VLOG(1) << "Getting optimize info from IcingSearchEngine"; + + GetOptimizeInfoResultProto result_proto; + StatusProto* result_status = result_proto.mutable_status(); + + absl_ports::shared_lock l(&mutex_); + if (!initialized_) { + result_status->set_code(StatusProto::FAILED_PRECONDITION); + result_status->set_message("IcingSearchEngine has not been initialized!"); + return result_proto; + } + + // Get stats from DocumentStore + auto doc_store_optimize_info_or = document_store_->GetOptimizeInfo(); + if (!doc_store_optimize_info_or.ok()) { + 
TransformStatus(doc_store_optimize_info_or.status(), result_status); + return result_proto; + } + DocumentStore::OptimizeInfo doc_store_optimize_info = + doc_store_optimize_info_or.ValueOrDie(); + result_proto.set_optimizable_docs(doc_store_optimize_info.optimizable_docs); + + if (doc_store_optimize_info.optimizable_docs == 0) { + // Can return early since there's nothing to calculate on the index side + result_proto.set_estimated_optimizable_bytes(0); + result_status->set_code(StatusProto::OK); + return result_proto; + } + + // Get stats from Index. + auto index_elements_size_or = index_->GetElementsSize(); + if (!index_elements_size_or.ok()) { + TransformStatus(index_elements_size_or.status(), result_status); + return result_proto; + } + int64_t index_elements_size = index_elements_size_or.ValueOrDie(); + + // Sum up the optimizable sizes from DocumentStore and Index + result_proto.set_estimated_optimizable_bytes( + index_elements_size * doc_store_optimize_info.optimizable_docs / + doc_store_optimize_info.total_docs + + doc_store_optimize_info.estimated_optimizable_bytes); + + result_status->set_code(StatusProto::OK); + return result_proto; +} + libtextclassifier3::Status IcingSearchEngine::InternalPersistToDisk() { ICING_RETURN_IF_ERROR(schema_store_->PersistToDisk()); ICING_RETURN_IF_ERROR(document_store_->PersistToDisk()); @@ -808,6 +913,13 @@ SearchResultProto IcingSearchEngine::Search( const ResultSpecProto& result_spec) { SearchResultProto result_proto; StatusProto* result_status = result_proto.mutable_status(); + // TODO(b/146008613) Explore ideas to make this function read-only. 
+ absl_ports::unique_lock l(&mutex_); + if (!initialized_) { + result_status->set_code(StatusProto::FAILED_PRECONDITION); + result_status->set_message("IcingSearchEngine has not been initialized!"); + return result_proto; + } libtextclassifier3::Status status = ValidateResultSpec(result_spec); if (!status.ok()) { @@ -820,9 +932,6 @@ SearchResultProto IcingSearchEngine::Search( return result_proto; } - // TODO(b/146008613) Explore ideas to make this function read-only. - absl_ports::unique_lock l(&mutex_); - // Gets unordered results from query processor auto query_processor_or = QueryProcessor::Create( index_.get(), language_segmenter_.get(), normalizer_.get(), @@ -917,6 +1026,11 @@ SearchResultProto IcingSearchEngine::GetNextPage(uint64_t next_page_token) { // ResultStateManager has its own writer lock, so here we only need a reader // lock for other components. absl_ports::shared_lock l(&mutex_); + if (!initialized_) { + result_status->set_code(StatusProto::FAILED_PRECONDITION); + result_status->set_message("IcingSearchEngine has not been initialized!"); + return result_proto; + } libtextclassifier3::StatusOr<PageResultState> page_result_state_or = result_state_manager_.GetNextPage(next_page_token); @@ -969,6 +1083,11 @@ SearchResultProto IcingSearchEngine::GetNextPage(uint64_t next_page_token) { } void IcingSearchEngine::InvalidateNextPageToken(uint64_t next_page_token) { + absl_ports::shared_lock l(&mutex_); + if (!initialized_) { + ICING_LOG(ERROR) << "IcingSearchEngine has not been initialized!"; + return; + } result_state_manager_.InvalidateResultState(next_page_token); } @@ -1138,8 +1257,9 @@ ResetResultProto IcingSearchEngine::Reset() { return result_proto; } + absl_ports::unique_lock l(&mutex_); initialized_ = false; - if (Initialize().status().code() != StatusProto::OK) { + if (InternalInitialize().status().code() != StatusProto::OK) { // We shouldn't hit the following Initialize errors: // NOT_FOUND: all data was cleared, we aren't expecting anything // 
DATA_LOSS: all data was cleared, we aren't expecting anything diff --git a/icing/icing-search-engine.h b/icing/icing-search-engine.h index 196f243..746b5b4 100644 --- a/icing/icing-search-engine.h +++ b/icing/icing-search-engine.h @@ -20,6 +20,7 @@ #include <string> #include <string_view> +#include "icing/jni/jni-cache.h" #include "icing/text_classifier/lib3/utils/base/status.h" #include "icing/text_classifier/lib3/utils/base/statusor.h" #include "icing/absl_ports/mutex.h" @@ -60,7 +61,12 @@ class IcingSearchEngine { uint32_t checksum; }; - explicit IcingSearchEngine(const IcingSearchEngineOptions& options); + // Note: It is only required to provide a pointer to a valid instance of + // JniCache if this instance needs to perform reverse-jni calls. Users on + // Linux and iOS should always provide a nullptr. + explicit IcingSearchEngine( + const IcingSearchEngineOptions& options, + std::unique_ptr<const JniCache> jni_cache = nullptr); // Calculates integrity checks and persists files to disk. ~IcingSearchEngine(); @@ -117,14 +123,17 @@ class IcingSearchEngine { // So, callers should only have to call this if the schema changed. // However, calling it multiple times with the same schema is a no-op. // - // On any error, Icing will keep using the older schema. + // On some errors, Icing will keep using the older schema, but on + // INTERNAL_ERROR, it is undefined to continue using Icing. // // Returns: // OK on success // INVALID_ARGUMENT if 'new_schema' is invalid - // FAILED_PRECONDITION if 'new_schema' is incompatible + // FAILED_PRECONDITION if 'new_schema' is incompatible, or IcingSearchEngine + // has not been initialized yet. // INTERNAL_ERROR if Icing failed to store the new schema or upgrade - // existing data based on the new schema. + // existing data based on the new schema. Using Icing beyond this error is + // undefined and may cause crashes. 
// // TODO(cassiewang) Figure out, document (and maybe even enforce) the best // way ordering of calls between Initialize() and SetSchema(), both when @@ -149,6 +158,7 @@ class IcingSearchEngine { // Returns: // SchemaProto on success // NOT_FOUND if a schema has not been set yet + // FAILED_PRECONDITION IcingSearchEngine has not been initialized yet. // INTERNAL_ERROR on IO error GetSchemaResultProto GetSchema() ICING_LOCKS_EXCLUDED(mutex_); @@ -156,7 +166,8 @@ class IcingSearchEngine { // // Returns: // SchemaTypeConfigProto on success - // FAILED_PRECONDITION if a schema has not been set yet + // FAILED_PRECONDITION if a schema has not been set yet, IcingSearchEngine + // has not been initialized yet. // NOT_FOUND if there is no SchemaTypeConfig of schema_type in the // SchemaProto // INTERNAL_ERROR on IO error @@ -169,7 +180,8 @@ class IcingSearchEngine { // // Returns: // OK on success - // FAILED_PRECONDITION if a schema has not been set yet + // FAILED_PRECONDITION if a schema has not been set yet, IcingSearchEngine + // has not been initialized yet. 
// NOT_FOUND if there is no SchemaTypeConfig in the SchemaProto that matches // the document's schema // INTERNAL_ERROR on IO error @@ -189,6 +201,7 @@ class IcingSearchEngine { // Returns: // The document found on success // NOT_FOUND if the key doesn't exist or doc has been deleted + // FAILED_PRECONDITION IcingSearchEngine has not been initialized yet // INTERNAL_ERROR on IO error GetResultProto Get(std::string_view name_space, std::string_view uri); @@ -202,6 +215,7 @@ class IcingSearchEngine { // Returns: // OK on success // NOT_FOUND if no document exists with namespace, uri + // FAILED_PRECONDITION IcingSearchEngine has not been initialized yet // INTERNAL_ERROR on IO error DeleteResultProto Delete(std::string_view name_space, std::string_view uri) ICING_LOCKS_EXCLUDED(mutex_); @@ -216,6 +230,7 @@ class IcingSearchEngine { // Returns: // OK on success // NOT_FOUND if namespace doesn't exist + // FAILED_PRECONDITION IcingSearchEngine has not been initialized yet // INTERNAL_ERROR on IO error DeleteByNamespaceResultProto DeleteByNamespace(std::string_view name_space) ICING_LOCKS_EXCLUDED(mutex_); @@ -230,6 +245,7 @@ class IcingSearchEngine { // Returns: // OK on success // NOT_FOUND if schema type doesn't exist + // FAILED_PRECONDITION IcingSearchEngine has not been initialized yet // INTERNAL_ERROR on IO error DeleteBySchemaTypeResultProto DeleteBySchemaType(std::string_view schema_type) ICING_LOCKS_EXCLUDED(mutex_); @@ -246,6 +262,7 @@ class IcingSearchEngine { // OK with results on success // INVALID_ARGUMENT if any of specs is invalid // ABORTED if failed to perform search but existing data is not affected + // FAILED_PRECONDITION IcingSearchEngine has not been initialized yet // INTERNAL_ERROR on any other errors SearchResultProto Search(const SearchSpecProto& search_spec, const ScoringSpecProto& scoring_spec, @@ -258,6 +275,7 @@ class IcingSearchEngine { // Returns a SearchResultProto with status: // OK with results on success // ABORTED if failed to get 
results but existing data is not affected + // FAILED_PRECONDITION IcingSearchEngine has not been initialized yet // INTERNAL_ERROR on any other errors SearchResultProto GetNextPage(uint64_t next_page_token) ICING_LOCKS_EXCLUDED(mutex_); @@ -276,6 +294,7 @@ class IcingSearchEngine { // // Returns: // OK on success + // FAILED_PRECONDITION IcingSearchEngine has not been initialized yet // INTERNAL on I/O error PersistToDiskResultProto PersistToDisk() ICING_LOCKS_EXCLUDED(mutex_); @@ -284,25 +303,35 @@ class IcingSearchEngine { // resource-efficient. This method purely optimizes the internal files and // has no functional impact on what gets accepted/returned. // - // NOTE: This method should be called about once every 24 hours when the - // device is idle and charging. It can also be called when the system needs - // to free up extra disk-space. - // // WARNING: This method is CPU and IO intensive and depending on the // contents stored, it can take from a few seconds to a few minutes. // This call also blocks all read/write operations on Icing. // + // SUGGESTION: Assuming the client has no restrictions on their side, it's + // recommended to call this method about once every 24 hours when the + // device is idle and charging. It can also be called when the system needs + // to free up extra disk-space. + // // Returns: // OK on success // ABORTED_ERROR if optimization is aborted due to non-fatal errors before // actual modifications are made. // DATA_LOSS_ERROR on errors that could potentially cause data loss, // IcingSearchEngine is still functioning. - // INTERNAL_ERROR on any IO errors or other unrecoverable errors. Icing - // could be in an inconsistent state and might not be usable. + // INTERNAL_ERROR on any IO errors or other unrecoverable errors. Continued + // use of Icing is undefined. // Clients could clear and reinitialize IcingSearchEngine. 
+ // FAILED_PRECONDITION IcingSearchEngine has not been initialized yet OptimizeResultProto Optimize() ICING_LOCKS_EXCLUDED(mutex_); + // Returns potential size and document savings if Optimize were called. + // + // Returns: + // OK on success + // FAILED_PRECONDITION if IcingSearchEngine has not been initialized yet + // INTERNAL_ERROR on IO error + GetOptimizeInfoResultProto GetOptimizeInfo() ICING_LOCKS_EXCLUDED(mutex_); + // Clears all data from Icing and re-initializes. Clients DO NOT need to call // Initialize again. // @@ -319,13 +348,14 @@ class IcingSearchEngine { protected: IcingSearchEngine(IcingSearchEngineOptions options, std::unique_ptr<const Filesystem> filesystem, - std::unique_ptr<Clock> clock); + std::unique_ptr<Clock> clock, + std::unique_ptr<const JniCache> jni_cache = nullptr); private: const IcingSearchEngineOptions options_; const std::unique_ptr<const Filesystem> filesystem_; const std::unique_ptr<const IcingFilesystem> icing_filesystem_; - bool initialized_ = false; + bool initialized_ ICING_GUARDED_BY(mutex_) = false; // Abstraction for accessing time values. std::unique_ptr<Clock> clock_; @@ -355,6 +385,9 @@ class IcingSearchEngine { // Storage for all hits of content from the document store. std::unique_ptr<Index> index_ ICING_GUARDED_BY(mutex_); + // Pointer to JNI class references + const std::unique_ptr<const JniCache> jni_cache_; + // Helper method to do the actual work to persist data to disk. We need this // separate method so that other public methods don't need to call // PersistToDisk(). Public methods calling each other may cause deadlock @@ -362,6 +395,12 @@ class IcingSearchEngine { libtextclassifier3::Status InternalPersistToDisk() ICING_EXCLUSIVE_LOCKS_REQUIRED(mutex_); + // Helper method to the actual work to Initialize. We need this separate + // method so that other public methods don't need to call Initialize(). Public + // methods calling each other may cause deadlock issues. 
+ InitializeResultProto InternalInitialize() + ICING_EXCLUSIVE_LOCKS_REQUIRED(mutex_); + // Helper method to initialize member variables. // // Returns: diff --git a/icing/icing-search-engine_fuzz_test.cc b/icing/icing-search-engine_fuzz_test.cc index 632fd01..d31f836 100644 --- a/icing/icing-search-engine_fuzz_test.cc +++ b/icing/icing-search-engine_fuzz_test.cc @@ -18,8 +18,8 @@ #include "icing/text_classifier/lib3/utils/base/status.h" #include "icing/text_classifier/lib3/utils/base/statusor.h" #include "icing/document-builder.h" +#include "icing/helpers/icu/icu-data-file-helper.h" #include "icing/icing-search-engine.h" -#include "icing/icu-data-file-helper.h" #include "icing/proto/document.pb.h" #include "icing/proto/initialize.pb.h" #include "icing/proto/scoring.pb.h" diff --git a/icing/icing-search-engine_test.cc b/icing/icing-search-engine_test.cc index 17795a3..baa469e 100644 --- a/icing/icing-search-engine_test.cc +++ b/icing/icing-search-engine_test.cc @@ -26,7 +26,7 @@ #include "icing/document-builder.h" #include "icing/file/filesystem.h" #include "icing/file/mock-filesystem.h" -#include "icing/icu-data-file-helper.h" +#include "icing/helpers/icu/icu-data-file-helper.h" #include "icing/portable/equals-proto.h" #include "icing/proto/document.pb.h" #include "icing/proto/initialize.pb.h" @@ -1367,6 +1367,72 @@ TEST_F(IcingSearchEngineTest, OptimizationShouldDeleteTemporaryDirectory) { EXPECT_FALSE(filesystem()->FileExists(tmp_file.c_str())); } +TEST_F(IcingSearchEngineTest, GetOptimizeInfoHasCorrectStats) { + DocumentProto document1 = CreateMessageDocument("namespace", "uri1"); + DocumentProto document2 = DocumentBuilder() + .SetKey("namespace", "uri2") + .SetSchema("Message") + .AddStringProperty("body", "message body") + .SetCreationTimestampMs(100) + .SetTtlMs(500) + .Build(); + + auto fake_clock = std::make_unique<FakeClock>(); + fake_clock->SetSystemTimeMilliseconds(1000); + + TestIcingSearchEngine icing(GetDefaultIcingOptions(), + 
std::make_unique<Filesystem>(), + std::move(fake_clock)); + ASSERT_THAT(icing.Initialize().status().code(), Eq(StatusProto::OK)); + + // Just initialized, nothing is optimizable yet. + GetOptimizeInfoResultProto optimize_info = icing.GetOptimizeInfo(); + EXPECT_THAT(optimize_info.status().code(), Eq(StatusProto::OK)); + EXPECT_THAT(optimize_info.optimizable_docs(), Eq(0)); + EXPECT_THAT(optimize_info.estimated_optimizable_bytes(), Eq(0)); + + ASSERT_THAT(icing.SetSchema(CreateMessageSchema()).status().code(), + Eq(StatusProto::OK)); + ASSERT_THAT(icing.Put(document1).status().code(), Eq(StatusProto::OK)); + + // Only have active documents, nothing is optimizable yet. + optimize_info = icing.GetOptimizeInfo(); + EXPECT_THAT(optimize_info.status().code(), Eq(StatusProto::OK)); + EXPECT_THAT(optimize_info.optimizable_docs(), Eq(0)); + EXPECT_THAT(optimize_info.estimated_optimizable_bytes(), Eq(0)); + + // Deletes document1 + ASSERT_THAT(icing.Delete("namespace", "uri1").status().code(), + Eq(StatusProto::OK)); + + optimize_info = icing.GetOptimizeInfo(); + EXPECT_THAT(optimize_info.status().code(), Eq(StatusProto::OK)); + EXPECT_THAT(optimize_info.optimizable_docs(), Eq(1)); + EXPECT_THAT(optimize_info.estimated_optimizable_bytes(), Gt(0)); + int64_t first_estimated_optimizable_bytes = + optimize_info.estimated_optimizable_bytes(); + + // Add a second document, but it'll be expired since the time (1000) is + // greater than the document's creation timestamp (100) + the document's ttl + // (500) + ASSERT_THAT(icing.Put(document2).status().code(), Eq(StatusProto::OK)); + + optimize_info = icing.GetOptimizeInfo(); + EXPECT_THAT(optimize_info.status().code(), Eq(StatusProto::OK)); + EXPECT_THAT(optimize_info.optimizable_docs(), Eq(2)); + EXPECT_THAT(optimize_info.estimated_optimizable_bytes(), + Gt(first_estimated_optimizable_bytes)); + + // Optimize + ASSERT_THAT(icing.Optimize().status().code(), Eq(StatusProto::OK)); + + // Nothing is optimizable now that everything has 
been optimized away. + optimize_info = icing.GetOptimizeInfo(); + EXPECT_THAT(optimize_info.status().code(), Eq(StatusProto::OK)); + EXPECT_THAT(optimize_info.optimizable_docs(), Eq(0)); + EXPECT_THAT(optimize_info.estimated_optimizable_bytes(), Eq(0)); +} + TEST_F(IcingSearchEngineTest, GetAndPutShouldWorkAfterOptimization) { DocumentProto document1 = CreateMessageDocument("namespace", "uri1"); DocumentProto document2 = CreateMessageDocument("namespace", "uri2"); @@ -1861,7 +1927,7 @@ TEST_F(IcingSearchEngineTest, SearchIncludesDocumentsBeforeTtl) { document; // Time just has to be less than the document's creation timestamp (100) + the - // schema's ttl (500) + // document's ttl (500) auto fake_clock = std::make_unique<FakeClock>(); fake_clock->SetSystemTimeMilliseconds(400); @@ -1908,7 +1974,7 @@ TEST_F(IcingSearchEngineTest, SearchDoesntIncludeDocumentsPastTtl) { expected_search_result_proto.mutable_status()->set_code(StatusProto::OK); // Time just has to be greater than the document's creation timestamp (100) + - // the schema's ttl (500) + // the document's ttl (500) auto fake_clock = std::make_unique<FakeClock>(); fake_clock->SetSystemTimeMilliseconds(700); @@ -3150,6 +3216,49 @@ TEST_F(IcingSearchEngineTest, SnippetSectionRestrict) { IsEmpty()); } +TEST_F(IcingSearchEngineTest, UninitializedInstanceFailsSafely) { + IcingSearchEngine icing(GetDefaultIcingOptions()); + + SchemaProto email_schema = CreateMessageSchema(); + EXPECT_THAT(icing.SetSchema(email_schema).status().code(), + Eq(StatusProto::FAILED_PRECONDITION)); + EXPECT_THAT(icing.GetSchema().status().code(), + Eq(StatusProto::FAILED_PRECONDITION)); + EXPECT_THAT( + icing.GetSchemaType(email_schema.types(0).schema_type()).status().code(), + Eq(StatusProto::FAILED_PRECONDITION)); + + DocumentProto doc = CreateMessageDocument("namespace", "uri"); + EXPECT_THAT(icing.Put(doc).status().code(), + Eq(StatusProto::FAILED_PRECONDITION)); + EXPECT_THAT(icing.Get(doc.namespace_(), doc.uri()).status().code(), + 
Eq(StatusProto::FAILED_PRECONDITION)); + EXPECT_THAT(icing.Delete(doc.namespace_(), doc.uri()).status().code(), + Eq(StatusProto::FAILED_PRECONDITION)); + EXPECT_THAT(icing.DeleteByNamespace(doc.namespace_()).status().code(), + Eq(StatusProto::FAILED_PRECONDITION)); + EXPECT_THAT(icing.DeleteBySchemaType(email_schema.types(0).schema_type()) + .status() + .code(), + Eq(StatusProto::FAILED_PRECONDITION)); + + SearchSpecProto search_spec = SearchSpecProto::default_instance(); + ScoringSpecProto scoring_spec = ScoringSpecProto::default_instance(); + ResultSpecProto result_spec = ResultSpecProto::default_instance(); + EXPECT_THAT( + icing.Search(search_spec, scoring_spec, result_spec).status().code(), + Eq(StatusProto::FAILED_PRECONDITION)); + constexpr int kSomePageToken = 12; + EXPECT_THAT(icing.GetNextPage(kSomePageToken).status().code(), + Eq(StatusProto::FAILED_PRECONDITION)); + icing.InvalidateNextPageToken(kSomePageToken); // Verify this doesn't crash. + + EXPECT_THAT(icing.PersistToDisk().status().code(), + Eq(StatusProto::FAILED_PRECONDITION)); + EXPECT_THAT(icing.Optimize().status().code(), + Eq(StatusProto::FAILED_PRECONDITION)); +} + } // namespace } // namespace lib } // namespace icing diff --git a/icing/index/index-processor_benchmark.cc b/icing/index/index-processor_benchmark.cc index 835478d..00d116f 100644 --- a/icing/index/index-processor_benchmark.cc +++ b/icing/index/index-processor_benchmark.cc @@ -16,7 +16,7 @@ #include "gmock/gmock.h" #include "icing/document-builder.h" #include "icing/file/filesystem.h" -#include "icing/icu-data-file-helper.h" +#include "icing/helpers/icu/icu-data-file-helper.h" #include "icing/index/index-processor.h" #include "icing/index/index.h" #include "icing/legacy/core/icing-string-util.h" @@ -140,7 +140,7 @@ std::unique_ptr<Index> CreateIndex(const IcingFilesystem& filesystem, std::unique_ptr<Normalizer> CreateNormalizer() { return normalizer_factory::Create( - normalizer_factory::NormalizerType::ICU4C, + 
/*max_term_byte_size=*/std::numeric_limits<int>::max()) .ValueOrDie(); } @@ -193,8 +193,7 @@ void BM_IndexDocumentWithOneProperty(benchmark::State& state) { std::unique_ptr<Index> index = CreateIndex(filesystem, index_dir); std::unique_ptr<LanguageSegmenter> language_segmenter = - language_segmenter_factory::Create(language_segmenter_factory::ICU4C) - .ValueOrDie(); + language_segmenter_factory::Create().ValueOrDie(); std::unique_ptr<Normalizer> normalizer = CreateNormalizer(); std::unique_ptr<SchemaStore> schema_store = CreateSchemaStore(); std::unique_ptr<IndexProcessor> index_processor = @@ -241,8 +240,7 @@ void BM_IndexDocumentWithTenProperties(benchmark::State& state) { std::unique_ptr<Index> index = CreateIndex(filesystem, index_dir); std::unique_ptr<LanguageSegmenter> language_segmenter = - language_segmenter_factory::Create(language_segmenter_factory::ICU4C) - .ValueOrDie(); + language_segmenter_factory::Create().ValueOrDie(); std::unique_ptr<Normalizer> normalizer = CreateNormalizer(); std::unique_ptr<SchemaStore> schema_store = CreateSchemaStore(); std::unique_ptr<IndexProcessor> index_processor = @@ -290,8 +288,7 @@ void BM_IndexDocumentWithDiacriticLetters(benchmark::State& state) { std::unique_ptr<Index> index = CreateIndex(filesystem, index_dir); std::unique_ptr<LanguageSegmenter> language_segmenter = - language_segmenter_factory::Create(language_segmenter_factory::ICU4C) - .ValueOrDie(); + language_segmenter_factory::Create().ValueOrDie(); std::unique_ptr<Normalizer> normalizer = CreateNormalizer(); std::unique_ptr<SchemaStore> schema_store = CreateSchemaStore(); std::unique_ptr<IndexProcessor> index_processor = @@ -339,8 +336,7 @@ void BM_IndexDocumentWithHiragana(benchmark::State& state) { std::unique_ptr<Index> index = CreateIndex(filesystem, index_dir); std::unique_ptr<LanguageSegmenter> language_segmenter = - language_segmenter_factory::Create(language_segmenter_factory::ICU4C) - .ValueOrDie(); + 
language_segmenter_factory::Create().ValueOrDie(); std::unique_ptr<Normalizer> normalizer = CreateNormalizer(); std::unique_ptr<SchemaStore> schema_store = CreateSchemaStore(); std::unique_ptr<IndexProcessor> index_processor = diff --git a/icing/index/index-processor_test.cc b/icing/index/index-processor_test.cc index 126ea29..8dfb9c2 100644 --- a/icing/index/index-processor_test.cc +++ b/icing/index/index-processor_test.cc @@ -27,7 +27,7 @@ #include "icing/absl_ports/str_cat.h" #include "icing/document-builder.h" #include "icing/file/filesystem.h" -#include "icing/icu-data-file-helper.h" +#include "icing/helpers/icu/icu-data-file-helper.h" #include "icing/index/hit/doc-hit-info.h" #include "icing/index/index.h" #include "icing/index/iterator/doc-hit-info-iterator.h" @@ -91,14 +91,13 @@ class IndexProcessorTest : public Test { ICING_ASSERT_OK_AND_ASSIGN(index_, Index::Create(options, &icing_filesystem_)); - ICING_ASSERT_OK_AND_ASSIGN( - lang_segmenter_, - language_segmenter_factory::Create(language_segmenter_factory::ICU4C)); + ICING_ASSERT_OK_AND_ASSIGN(lang_segmenter_, + language_segmenter_factory::Create()); ICING_ASSERT_OK_AND_ASSIGN( normalizer_, normalizer_factory::Create( - normalizer_factory::NormalizerType::ICU4C, + /*max_term_byte_size=*/std::numeric_limits<int32_t>::max())); ICING_ASSERT_OK_AND_ASSIGN( @@ -415,9 +414,8 @@ TEST_F(IndexProcessorTest, TooLongTokens) { IndexProcessor::Options options; options.max_tokens_per_document = 1000; - ICING_ASSERT_OK_AND_ASSIGN( - std::unique_ptr<Normalizer> normalizer, - normalizer_factory::Create(normalizer_factory::NormalizerType::ICU4C, + ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<Normalizer> normalizer, + normalizer_factory::Create( /*max_term_byte_size=*/4)); ICING_ASSERT_OK_AND_ASSIGN( diff --git a/icing/index/index.h b/icing/index/index.h index f287081..f30c8ad 100644 --- a/icing/index/index.h +++ b/icing/index/index.h @@ -113,6 +113,17 @@ class Index { lite_index_->GetDebugInfo(verbosity, out); } + // 
Returns the byte size of the all the elements held in the index. This + // excludes the size of any internal metadata of the index, e.g. the index's + // header. + // + // Returns: + // Byte size on success + // INTERNAL_ERROR on IO error + libtextclassifier3::StatusOr<int64_t> GetElementsSize() const { + return lite_index_->GetElementsSize(); + } + // Create an iterator to iterate through all doc hit infos in the index that // match the term. section_id_mask can be set to ignore hits from sections not // listed in the mask. Eg. section_id_mask = 1U << 3; would only return hits diff --git a/icing/index/index_test.cc b/icing/index/index_test.cc index ff29135..070e82a 100644 --- a/icing/index/index_test.cc +++ b/icing/index/index_test.cc @@ -45,6 +45,7 @@ namespace { using ::testing::ElementsAre; using ::testing::Eq; +using ::testing::Gt; using ::testing::IsEmpty; using ::testing::IsTrue; using ::testing::NiceMock; @@ -621,12 +622,13 @@ TEST_F(IndexTest, FindTermByPrefixShouldReturnTermsInOneNamespace) { EXPECT_THAT(index_->FindTermsByPrefix(/*prefix=*/"f", /*namespace_ids=*/{0}, /*num_to_return=*/10), IsOkAndHolds(UnorderedElementsAre(EqualsTermMetadata("fo", 1), - EqualsTermMetadata("foo", 1)))); + EqualsTermMetadata("foo", 1)))); // namespace with id 1 has 1 result. 
- EXPECT_THAT(index_->FindTermsByPrefix(/*prefix=*/"f", /*namespace_ids=*/{1}, - /*num_to_return=*/10), - IsOkAndHolds(UnorderedElementsAre(EqualsTermMetadata("fool", 1)))); + EXPECT_THAT( + index_->FindTermsByPrefix(/*prefix=*/"f", /*namespace_ids=*/{1}, + /*num_to_return=*/10), + IsOkAndHolds(UnorderedElementsAre(EqualsTermMetadata("fool", 1)))); } TEST_F(IndexTest, FindTermByPrefixShouldReturnTermsInMultipleNamespaces) { @@ -650,7 +652,7 @@ TEST_F(IndexTest, FindTermByPrefixShouldReturnTermsInMultipleNamespaces) { index_->FindTermsByPrefix(/*prefix=*/"f", /*namespace_ids=*/{1, 2}, /*num_to_return=*/10), IsOkAndHolds(UnorderedElementsAre(EqualsTermMetadata("foo", 1), - EqualsTermMetadata("fool", 1)))); + EqualsTermMetadata("fool", 1)))); } TEST_F(IndexTest, FindTermByPrefixShouldReturnTermsInAllNamespaces) { @@ -672,9 +674,9 @@ TEST_F(IndexTest, FindTermByPrefixShouldReturnTermsInAllNamespaces) { // Should return "fo", "foo" and "fool" across all namespaces. EXPECT_THAT(index_->FindTermsByPrefix(/*prefix=*/"f", /*namespace_ids=*/{}, /*num_to_return=*/10), - IsOkAndHolds(UnorderedElementsAre(EqualsTermMetadata("fo", 1), - EqualsTermMetadata("foo", 1), - EqualsTermMetadata("fool", 1)))); + IsOkAndHolds(UnorderedElementsAre( + EqualsTermMetadata("fo", 1), EqualsTermMetadata("foo", 1), + EqualsTermMetadata("fool", 1)))); } TEST_F(IndexTest, FindTermByPrefixShouldReturnCorrectHitCount) { @@ -690,10 +692,22 @@ TEST_F(IndexTest, FindTermByPrefixShouldReturnCorrectHitCount) { EXPECT_THAT(edit2.AddHit("fool"), IsOk()); // 'foo' has 1 hit, 'fool' has 2 hits. 
- EXPECT_THAT(index_->FindTermsByPrefix(/*prefix=*/"f", /*namespace_ids=*/{0}, - /*num_to_return=*/10), - IsOkAndHolds(UnorderedElementsAre(EqualsTermMetadata("foo", 1), - EqualsTermMetadata("fool", 2)))); + EXPECT_THAT( + index_->FindTermsByPrefix(/*prefix=*/"f", /*namespace_ids=*/{0}, + /*num_to_return=*/10), + IsOkAndHolds(UnorderedElementsAre(EqualsTermMetadata("foo", 1), + EqualsTermMetadata("fool", 2)))); +} + +TEST_F(IndexTest, GetElementsSize) { + // Check empty index. + EXPECT_THAT(index_->GetElementsSize(), IsOkAndHolds(Eq(0))); + + // Add an element. + Index::Editor edit = index_->Edit( + kDocumentId0, kSectionId2, TermMatchType::EXACT_ONLY, /*namespace_id=*/0); + EXPECT_THAT(edit.AddHit("foo"), IsOk()); + EXPECT_THAT(index_->GetElementsSize(), IsOkAndHolds(Gt(0))); } } // namespace diff --git a/icing/index/lite-index.cc b/icing/index/lite-index.cc index c9f68b5..489c53d 100644 --- a/icing/index/lite-index.cc +++ b/icing/index/lite-index.cc @@ -391,6 +391,29 @@ void LiteIndex::GetDebugInfo(int verbosity, std::string* out) const { lexicon_.GetDebugInfo(verbosity, out); } +libtextclassifier3::StatusOr<int64_t> LiteIndex::GetElementsSize() const { + int64_t header_and_hit_buffer_file_size = + filesystem_->GetFileSize(hit_buffer_fd_.get()); + + if (header_and_hit_buffer_file_size == Filesystem::kBadFileSize) { + return absl_ports::InternalError( + "Failed to get element size of the LiteIndex's header and hit buffer"); + } + + int64_t lexicon_disk_usage = lexicon_.GetElementsSize(); + if (lexicon_disk_usage == IcingFilesystem::kBadFileSize) { + return absl_ports::InternalError( + "Failed to get element size of LiteIndex's lexicon"); + } + + // On initialization, we grow the file to a padded size first. 
So this size + // won't count towards the size taken up by elements + size_t header_padded_size = IcingMMapper::page_aligned_size(header_size()); + + return header_and_hit_buffer_file_size - header_padded_size + + lexicon_disk_usage; +} + uint32_t LiteIndex::Seek(uint32_t term_id) { // Make searchable by sorting by hit buffer. uint32_t sort_len = header_->cur_size() - header_->searchable_end(); diff --git a/icing/index/lite-index.h b/icing/index/lite-index.h index 6d01f42..b60a947 100644 --- a/icing/index/lite-index.h +++ b/icing/index/lite-index.h @@ -205,6 +205,14 @@ class LiteIndex { // verbosity > 0, more detailed debug information from the lexicon. void GetDebugInfo(int verbosity, std::string* out) const; + // Returns the byte size of all the elements held in the index. This excludes + // the size of any internal metadata of the index, e.g. the index's header. + // + // Returns: + // Byte size on success + // INTERNAL_ERROR on IO error + libtextclassifier3::StatusOr<int64_t> GetElementsSize() const; + private: static IcingDynamicTrie::RuntimeOptions MakeTrieRuntimeOptions(); @@ -228,15 +236,29 @@ class LiteIndex { // hit buffer if term_id is not present. uint32_t Seek(uint32_t term_id); + // File descriptor that points to where the header and hit buffer are written + // to. ScopedFd hit_buffer_fd_; + // Mmapped region past the header that stores the hits. IcingArrayStorage hit_buffer_; + + // Crc checksum of the hits, excludes the header. uint32_t hit_buffer_crc_; + + // Trie that maps indexed terms to their term id IcingDynamicTrie lexicon_; + // TODO(b/140437260): Port over to MemoryMappedFile + // Memory mapped region of the underlying file that reflects the header. IcingMMapper header_mmap_; + + // Wrapper around the mmapped header that contains stats on the lite index. std::unique_ptr<IcingLiteIndex_Header> header_; + + // Options used to initialize the LiteIndex. 
const Options options_; + // TODO(b/139087650) Move to icing::Filesystem const IcingFilesystem* const filesystem_; }; diff --git a/icing/icing-search-engine-jni.cc b/icing/jni/icing-search-engine-jni.cc index 109f717..b1b5420 100644 --- a/icing/icing-search-engine-jni.cc +++ b/icing/jni/icing-search-engine-jni.cc @@ -16,6 +16,7 @@ #include <string> +#include "icing/jni/jni-cache.h" #include <google/protobuf/message_lite.h> #include "icing/absl_ports/status_imports.h" #include "icing/icing-search-engine.h" @@ -26,6 +27,7 @@ #include "icing/proto/schema.pb.h" #include "icing/proto/scoring.pb.h" #include "icing/proto/search.pb.h" +#include "icing/util/status-macros.h" namespace { bool ParseProtoFromJniByteArray(JNIEnv* env, jbyteArray bytes, @@ -85,8 +87,12 @@ Java_com_google_android_icing_IcingSearchEngine_nativeCreate( return 0; } + std::unique_ptr<const icing::lib::JniCache> jni_cache; +#ifdef ICING_REVERSE_JNI_SEGMENTATION + ICING_ASSIGN_OR_RETURN(jni_cache, icing::lib::JniCache::Create(env), 0); +#endif // ICING_REVERSE_JNI_SEGMENTATION icing::lib::IcingSearchEngine* icing = - new icing::lib::IcingSearchEngine(options); + new icing::lib::IcingSearchEngine(options, std::move(jni_cache)); return reinterpret_cast<jlong>(icing); } @@ -282,6 +288,18 @@ Java_com_google_android_icing_IcingSearchEngine_nativeOptimize( } JNIEXPORT jbyteArray JNICALL +Java_com_google_android_icing_IcingSearchEngine_nativeGetOptimizeInfo( + JNIEnv* env, jclass clazz, jlong native_pointer) { + icing::lib::IcingSearchEngine* icing = + GetIcingSearchEnginePointer(native_pointer); + + icing::lib::GetOptimizeInfoResultProto get_optimize_info_result_proto = + icing->GetOptimizeInfo(); + + return SerializeProtoToJniByteArray(env, get_optimize_info_result_proto); +} + +JNIEXPORT jbyteArray JNICALL Java_com_google_android_icing_IcingSearchEngine_nativeReset( JNIEnv* env, jclass clazz, jlong native_pointer) { icing::lib::IcingSearchEngine* icing = diff --git a/icing/jni/jni-cache.cc 
b/icing/jni/jni-cache.cc new file mode 100644 index 0000000..a186222 --- /dev/null +++ b/icing/jni/jni-cache.cc @@ -0,0 +1,216 @@ +// Copyright (C) 2019 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "icing/jni/jni-cache.h" + +#include "icing/text_classifier/lib3/utils/java/jni-base.h" +#include "icing/text_classifier/lib3/utils/java/jni-helper.h" +#include "icing/absl_ports/canonical_errors.h" +#include "icing/util/logging.h" +#include "icing/util/status-macros.h" + +namespace icing { +namespace lib { + +JniCache::JniCache(JavaVM* jvm) + : jvm(jvm), + string_class(nullptr, jvm), + string_utf8(nullptr, jvm), + locale_class(nullptr, jvm), + locale_us(nullptr, jvm), + breakiterator_class(nullptr, jvm) {} + +// The macros below are intended to reduce the boilerplate in Create and avoid +// easily introduced copy/paste errors. 
+#define ICING_GET_CLASS_OR_RETURN_NULL(FIELD, NAME) \ + { \ + ICING_ASSIGN_OR_RETURN( \ + libtextclassifier3::ScopedLocalRef<jclass> clazz, \ + libtextclassifier3::JniHelper::FindClass(env, NAME), nullptr); \ + result->FIELD##_class = \ + libtextclassifier3::MakeGlobalRef(clazz.get(), env, jvm); \ + if (result->FIELD##_class == nullptr) { \ + ICING_LOG(ERROR) << "Error finding class: " << NAME; \ + return nullptr; \ + } \ + } + +#define ICING_GET_OPTIONAL_CLASS(FIELD, NAME) \ + { \ + libtextclassifier3::StatusOr<libtextclassifier3::ScopedLocalRef<jclass>> \ + status_or_class = libtextclassifier3::JniHelper::FindClass(env, NAME); \ + if (status_or_class.ok()) { \ + result->FIELD##_class = libtextclassifier3::MakeGlobalRef( \ + std::move(status_or_class).ValueOrDie().get(), env, jvm); \ + } \ + } + +#define ICING_GET_METHOD(CLASS, FIELD, NAME, SIGNATURE) \ + result->CLASS##_##FIELD = \ + env->GetMethodID(result->CLASS##_class.get(), NAME, SIGNATURE); \ + if (!result->CLASS##_##FIELD) { \ + ICING_LOG(WARNING) << __FILE__ << ":" << __LINE__ \ + << "Error finding method: " << NAME; \ + return absl_ports::AbortedError("Unable to get Java method."); \ + } + +#define ICING_GET_OPTIONAL_STATIC_METHOD(CLASS, FIELD, NAME, SIGNATURE) \ + if (result->CLASS##_class != nullptr) { \ + result->CLASS##_##FIELD = \ + env->GetStaticMethodID(result->CLASS##_class.get(), NAME, SIGNATURE); \ + env->ExceptionClear(); \ + } + +#define ICING_GET_STATIC_METHOD(CLASS, FIELD, NAME, SIGNATURE) \ + result->CLASS##_##FIELD = \ + env->GetStaticMethodID(result->CLASS##_class.get(), NAME, SIGNATURE); \ + if (!result->CLASS##_##FIELD) { \ + ICING_LOG(WARNING) << __FILE__ << ":" << __LINE__ \ + << "Error finding method: " << NAME; \ + return absl_ports::AbortedError("Unable to get Java static method."); \ + } + +#define ICING_GET_STATIC_OBJECT_FIELD_OR_RETURN_NULL(CLASS, FIELD, NAME, \ + SIGNATURE) \ + { \ + const jfieldID CLASS##_##FIELD##_field = \ + 
env->GetStaticFieldID(result->CLASS##_class.get(), NAME, SIGNATURE); \ + if (!CLASS##_##FIELD##_field) { \ + ICING_LOG(WARNING) << __FILE__ << ":" << __LINE__ \ + << "Error finding field id: " << NAME; \ + return absl_ports::AbortedError("Unable to get Java field id."); \ + } \ + ICING_ASSIGN_OR_RETURN( \ + libtextclassifier3::ScopedLocalRef<jobject> static_object, \ + libtextclassifier3::JniHelper::GetStaticObjectField( \ + env, result->CLASS##_class.get(), CLASS##_##FIELD##_field), \ + nullptr); \ + result->CLASS##_##FIELD = \ + libtextclassifier3::MakeGlobalRef(static_object.get(), env, jvm); \ + if (result->CLASS##_##FIELD == nullptr) { \ + ICING_LOG(ERROR) << "Error finding field: " << NAME; \ + return nullptr; \ + } \ + } + +#define ICING_GET_STATIC_INT_FIELD(CLASS, FIELD, NAME) \ + const jfieldID CLASS##_##FIELD##_field = \ + env->GetStaticFieldID(result->CLASS##_class.get(), NAME, "I"); \ + << "Error finding field id: " << NAME; \ + if (!CLASS##_##FIELD##_field) { \ + ICING_LOG(WARNING) << __FILE__ << ":" << __LINE__ \ + << "Error finding field id: " << NAME; \ + return absl_ports::AbortedError( \ + "Unable to get Java static int field id."); \ + } \ + result->CLASS##_##FIELD = env->GetStaticIntField( \ + result->CLASS##_class.get(), CLASS##_##FIELD##_field); \ + if (!result->CLASS##_##FIELD) { \ + ICING_LOG(WARNING) << __FILE__ << ":" << __LINE__ \ + << "Error finding field: " << NAME; \ + return absl_ports::AbortedError("Unable to get Java static int field."); \ + } + +libtextclassifier3::StatusOr<std::unique_ptr<JniCache>> JniCache::Create( + JNIEnv* env) { + if (env == nullptr) { + return nullptr; + } + JavaVM* jvm = nullptr; + if (JNI_OK != env->GetJavaVM(&jvm) || jvm == nullptr) { + return nullptr; + } + std::unique_ptr<JniCache> result(new JniCache(jvm)); + + // String + ICING_GET_CLASS_OR_RETURN_NULL(string, "java/lang/String"); + ICING_GET_METHOD(string, constructor, "<init>", "([BLjava/lang/String;)V"); + ICING_GET_METHOD(string, code_point_count, 
"codePointCount", "(II)I"); + ICING_GET_METHOD(string, length, "length", "()I"); + ICING_ASSIGN_OR_RETURN( + libtextclassifier3::ScopedLocalRef<jstring> result_string, + libtextclassifier3::JniHelper::NewStringUTF(env, "UTF-8"), nullptr); + result->string_utf8 = + libtextclassifier3::MakeGlobalRef(result_string.get(), env, jvm); + if (result->string_utf8 == nullptr) { + return nullptr; + } + + // Locale + ICING_GET_CLASS_OR_RETURN_NULL(locale, "java/util/Locale"); + ICING_GET_STATIC_OBJECT_FIELD_OR_RETURN_NULL(locale, us, "US", + "Ljava/util/Locale;"); + ICING_GET_METHOD(locale, constructor, "<init>", "(Ljava/lang/String;)V"); + ICING_GET_OPTIONAL_STATIC_METHOD(locale, for_language_tag, "forLanguageTag", + "(Ljava/lang/String;)Ljava/util/Locale;"); + + // BreakIteratorBatcher + ICING_GET_CLASS_OR_RETURN_NULL( + breakiterator, + "com/google/android/libraries/mdi/search/BreakIteratorBatcher"); + ICING_GET_METHOD(breakiterator, constructor, "<init>", + "(Ljava/util/Locale;)V"); + ICING_GET_METHOD(breakiterator, settext, "setText", "(Ljava/lang/String;)V"); + ICING_GET_METHOD(breakiterator, next, "next", "(I)[I"); + ICING_GET_METHOD(breakiterator, first, "first", "()I"); + ICING_GET_METHOD(breakiterator, following, "following", "(I)I"); + ICING_GET_METHOD(breakiterator, preceding, "preceding", "(I)I"); + + return result; +} + +#undef ICING_GET_STATIC_INT_FIELD +#undef ICING_GET_STATIC_OBJECT_FIELD_OR_RETURN_NULL +#undef ICING_GET_STATIC_METHOD +#undef ICING_GET_METHOD +#undef ICING_GET_CLASS_OR_RETURN_NULL +#undef ICING_GET_OPTIONAL_CLASS + +JNIEnv* JniCache::GetEnv() const { + void* env; + if (JNI_OK == jvm->GetEnv(&env, JNI_VERSION_1_4)) { + return reinterpret_cast<JNIEnv*>(env); + } else { + ICING_LOG(ERROR) << "Icing JniCache used on unattached thread"; + return nullptr; + } +} + +bool JniCache::ExceptionCheckAndClear() const { + return libtextclassifier3::JniExceptionCheckAndClear(GetEnv()); +} + 
+libtextclassifier3::StatusOr<libtextclassifier3::ScopedLocalRef<jstring>> +JniCache::ConvertToJavaString(const char* utf8_text, + const int utf8_text_size_bytes) const { + // Create java byte array. + JNIEnv* jenv = GetEnv(); + ICING_ASSIGN_OR_RETURN( + libtextclassifier3::ScopedLocalRef<jbyteArray> text_java_utf8, + libtextclassifier3::JniHelper::NewByteArray(jenv, utf8_text_size_bytes)); + + jenv->SetByteArrayRegion(text_java_utf8.get(), 0, utf8_text_size_bytes, + reinterpret_cast<const jbyte*>(utf8_text)); + + // Create the string with a UTF-8 charset. + ICING_ASSIGN_OR_RETURN(libtextclassifier3::ScopedLocalRef<jstring> result, + libtextclassifier3::JniHelper::NewObject<jstring>( + jenv, string_class.get(), string_constructor, + text_java_utf8.get(), string_utf8.get())); + + return result; +} + +} // namespace lib +} // namespace icing diff --git a/icing/jni/jni-cache.h b/icing/jni/jni-cache.h new file mode 100644 index 0000000..a5f16c7 --- /dev/null +++ b/icing/jni/jni-cache.h @@ -0,0 +1,78 @@ +// Copyright (C) 2019 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef ICING_JNI_JNI_CACHE_H_ +#define ICING_JNI_JNI_CACHE_H_ + +#include <jni.h> + +#include "icing/text_classifier/lib3/utils/base/statusor.h" +#include "icing/text_classifier/lib3/utils/java/jni-base.h" + +namespace icing { +namespace lib { + +// A helper class to cache class and method pointers for calls from JNI to Java. 
+// (for implementations such as Java ICU that need to make calls from C++ to +// Java) +struct JniCache { + static libtextclassifier3::StatusOr<std::unique_ptr<JniCache>> Create( + JNIEnv* env); + + // Returns the correct JNIEnv of the current thread. This allows multiple + // threads, each accessing the same instance of JniCache, to retrieve their + // unique JNIEnv pointers. + JNIEnv* GetEnv() const; + + // Returns true if there are any pending exceptions from the execution of JNI + // calls. Also clears the exception if any existed. + bool ExceptionCheckAndClear() const; + + JavaVM* jvm = nullptr; + + // java.lang.String + libtextclassifier3::ScopedGlobalRef<jclass> string_class; + jmethodID string_constructor = nullptr; + jmethodID string_code_point_count = nullptr; + jmethodID string_length = nullptr; + libtextclassifier3::ScopedGlobalRef<jstring> string_utf8; + + // java.util.Locale + libtextclassifier3::ScopedGlobalRef<jclass> locale_class; + libtextclassifier3::ScopedGlobalRef<jobject> locale_us; + jmethodID locale_constructor = nullptr; + jmethodID locale_for_language_tag = nullptr; + + // BreakIteratorBatcher + libtextclassifier3::ScopedGlobalRef<jclass> breakiterator_class; + jmethodID breakiterator_constructor = nullptr; + jmethodID breakiterator_settext = nullptr; + jmethodID breakiterator_next = nullptr; + jmethodID breakiterator_first = nullptr; + jmethodID breakiterator_following = nullptr; + jmethodID breakiterator_preceding = nullptr; + + // Helper to convert lib3 UnicodeText to Java strings. 
+ libtextclassifier3::StatusOr<libtextclassifier3::ScopedLocalRef<jstring>> + ConvertToJavaString(const char* utf8_text, + const int utf8_text_size_bytes) const; + + private: + explicit JniCache(JavaVM* jvm); +}; + +} // namespace lib +} // namespace icing + +#endif // ICING_JNI_JNI_CACHE_H_ diff --git a/icing/jni/jni.lds b/icing/jni/jni.lds new file mode 100644 index 0000000..401682a --- /dev/null +++ b/icing/jni/jni.lds @@ -0,0 +1,10 @@ +VERS_1.0 { + # Export JNI symbols. + global: + Java_*; + JNI_OnLoad; + + # Hide everything else + local: + *; +}; diff --git a/icing/jni/reverse-jni-break-iterator.cc b/icing/jni/reverse-jni-break-iterator.cc new file mode 100644 index 0000000..2a589c6 --- /dev/null +++ b/icing/jni/reverse-jni-break-iterator.cc @@ -0,0 +1,187 @@ +// Copyright (C) 2019 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "icing/jni/reverse-jni-break-iterator.h" + +#include <math.h> + +#include <cassert> +#include <cctype> +#include <map> + +#include "icing/jni/jni-cache.h" +#include "icing/text_classifier/lib3/utils/base/statusor.h" +#include "icing/text_classifier/lib3/utils/java/jni-base.h" +#include "icing/text_classifier/lib3/utils/java/jni-helper.h" +#include "icing/absl_ports/canonical_errors.h" +#include "icing/util/status-macros.h" +#include <jni.h> + +namespace icing { +namespace lib { + +namespace { +// Chosen based on results in go/reverse-jni-benchmarks +static constexpr int kBatchSize = 100; +} // namespace + +// ----------------------------------------------------------------------------- +// Implementations that call out to JVM. Behold the beauty. +// ----------------------------------------------------------------------------- +libtextclassifier3::StatusOr<std::unique_ptr<ReverseJniBreakIterator>> +ReverseJniBreakIterator::Create(const JniCache* jni_cache, + std::string_view text, + std::string_view locale) { + if (jni_cache == nullptr) { + return absl_ports::InvalidArgumentError( + "Create must be called with a valid JniCache pointer!"); + } + + ICING_ASSIGN_OR_RETURN( + libtextclassifier3::ScopedLocalRef<jstring> java_text, + jni_cache->ConvertToJavaString(text.data(), text.length())); + if (java_text.get() == nullptr) { + return absl_ports::AbortedError("Failed to create Java String from input."); + } + + ICING_ASSIGN_OR_RETURN( + libtextclassifier3::ScopedLocalRef<jstring> java_locale_string, + jni_cache->ConvertToJavaString(locale.data(), locale.length())); + if (java_locale_string.get() == nullptr) { + return absl_ports::AbortedError( + "Failed to create Java String from locale."); + } + + JNIEnv* jenv = jni_cache->GetEnv(); + ICING_ASSIGN_OR_RETURN( + libtextclassifier3::ScopedLocalRef<jobject> java_locale, + libtextclassifier3::JniHelper::NewObject( + jenv, jni_cache->locale_class.get(), jni_cache->locale_constructor, + 
java_locale_string.get())); + if (java_locale.get() == nullptr) { + return absl_ports::AbortedError( + "Failed to create Java Locale from locale."); + } + + ICING_ASSIGN_OR_RETURN( + libtextclassifier3::ScopedLocalRef<jobject> local_iterator_batcher, + libtextclassifier3::JniHelper::NewObject( + jenv, jni_cache->breakiterator_class.get(), + jni_cache->breakiterator_constructor, java_locale.get())); + libtextclassifier3::ScopedGlobalRef<jobject> iterator_batcher = + libtextclassifier3::MakeGlobalRef(local_iterator_batcher.get(), jenv, + jni_cache->jvm); + if (iterator_batcher.get() == nullptr) { + return absl_ports::AbortedError( + "Failed to create Java BreakIteratorBatcher."); + } + + ICING_RETURN_IF_ERROR(libtextclassifier3::JniHelper::CallVoidMethod( + jenv, iterator_batcher.get(), jni_cache->breakiterator_settext, + java_text.get())); + return std::unique_ptr<ReverseJniBreakIterator>( + new ReverseJniBreakIterator(jni_cache, std::move(iterator_batcher))); +} + +ReverseJniBreakIterator::ReverseJniBreakIterator( + const JniCache* jni_cache, + libtextclassifier3::ScopedGlobalRef<jobject> iterator_batcher) + : jni_cache_(jni_cache), + iterator_batcher_(std::move(iterator_batcher)), + is_done_(false), + is_almost_done_(false) {} + +int ReverseJniBreakIterator::Next() { + if (is_done_) { + return ReverseJniBreakIterator::kDone; + } + if (break_indices_cache_.empty()) { + if (FetchNextBatch() == ReverseJniBreakIterator::kDone) { + // Either there were no more results or an error occurred. Either way, + // mark ourselves as done and return. 
+ is_done_ = true; + return ReverseJniBreakIterator::kDone; + } + is_almost_done_ = break_indices_cache_.size() < kBatchSize; + } + int break_index = break_indices_cache_.front(); + break_indices_cache_.pop(); + is_done_ = is_almost_done_ && break_indices_cache_.empty(); + return break_index; +} + +int ReverseJniBreakIterator::First() { + const int first_index = jni_cache_->GetEnv()->CallIntMethod( + iterator_batcher_.get(), jni_cache_->breakiterator_first); + if (jni_cache_->ExceptionCheckAndClear()) { + return ReverseJniBreakIterator::kDone; + } + ClearCache(); + return first_index; +} + +int ReverseJniBreakIterator::Preceding(int offset) { + const int preceding_index = jni_cache_->GetEnv()->CallIntMethod( + iterator_batcher_.get(), jni_cache_->breakiterator_preceding, offset); + if (jni_cache_->ExceptionCheckAndClear()) { + return ReverseJniBreakIterator::kDone; + } + ClearCache(); + return preceding_index; +} + +int ReverseJniBreakIterator::Following(int offset) { + const int following_index = jni_cache_->GetEnv()->CallIntMethod( + iterator_batcher_.get(), jni_cache_->breakiterator_following, offset); + if (jni_cache_->ExceptionCheckAndClear()) { + return ReverseJniBreakIterator::kDone; + } + ClearCache(); + return following_index; +} + +int ReverseJniBreakIterator::FetchNextBatch() { + ICING_ASSIGN_OR_RETURN( + libtextclassifier3::ScopedLocalRef<jintArray> break_indices, + libtextclassifier3::JniHelper::CallObjectMethod<jintArray>( + jni_cache_->GetEnv(), iterator_batcher_.get(), + jni_cache_->breakiterator_next, kBatchSize), + ReverseJniBreakIterator::kDone); + if (break_indices == nullptr || jni_cache_->ExceptionCheckAndClear()) { + return ReverseJniBreakIterator::kDone; + } + jint num_indices = jni_cache_->GetEnv()->GetArrayLength(break_indices.get()); + if (num_indices == 0) { + return ReverseJniBreakIterator::kDone; + } + jint* break_indices_arr = + static_cast<jint*>(jni_cache_->GetEnv()->GetPrimitiveArrayCritical( + break_indices.get(), nullptr)); + for 
(int i = 0; i < num_indices; ++i) { + break_indices_cache_.push(break_indices_arr[i]); + } + jni_cache_->GetEnv()->ReleasePrimitiveArrayCritical(break_indices.get(), + break_indices_arr, + /*mode=*/0); + return num_indices; +} + +void ReverseJniBreakIterator::ClearCache() { + break_indices_cache_ = std::queue<int>(); + is_done_ = false; + is_almost_done_ = false; +} + +} // namespace lib +} // namespace icing diff --git a/icing/jni/reverse-jni-break-iterator.h b/icing/jni/reverse-jni-break-iterator.h new file mode 100644 index 0000000..c1f05f4 --- /dev/null +++ b/icing/jni/reverse-jni-break-iterator.h @@ -0,0 +1,124 @@ +// Copyright (C) 2019 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef ICING_JNI_REVERSE_JNI_BREAK_ITERATOR_H_ +#define ICING_JNI_REVERSE_JNI_BREAK_ITERATOR_H_ + +#include <jni.h> + +#include <queue> +#include <string> + +#include "icing/jni/jni-cache.h" +#include "icing/text_classifier/lib3/utils/java/jni-base.h" + +namespace icing { +namespace lib { + +// A class that handles the cross-JNI interactions with BreakIteratorBatcher and +// hides the batching element to provide an interface akin to +// java.text.BreakIterator. 
+// +// Example: +// std::string text = "我每天走路去上班。"; +// ASSERT_THAT(text, SizeIs(27)); +// std::unique_ptr<ReverseJniBreakIterator> itr = +// ReverseJniBreakIterator::Create(jni_cache, text, locale); +// std::vector<int> nexts; +// int next = itr->Next(); +// while (next != ReverseJniBreakIterator::kDone) { +// nexts.push_back(next); +// next = itr->Next(); +// } +// EXPECT_THAT(nexts, ElementsAre(1, 3, 5, 6, 8)); +class ReverseJniBreakIterator { + public: + static constexpr int kDone = -1; + + // Creates a ReverseJniBreakiterator with the given text and locale. + // + // Returns: + // A ReverseJniBreakIterator on success + // INVALID_ARGUMENT if jni_cache isn't a valid JniCache pointer + // INTERNAL if unable to create any of the required Java objects + static libtextclassifier3::StatusOr<std::unique_ptr<ReverseJniBreakIterator>> + Create(const JniCache* jni_cache, std::string_view text, + std::string_view locale); + + // Returns the UTF-16 boundary following the current boundary. If the current + // boundary is the last text boundary, it returns + // ReverseJniBreakIterator::kDONE. + // + // NOTE: The 'boundary' refers to the UTF-16 boundary - NOT the UTF-8 + // boundary. Callers interested in the UTF-8 boundary are required to maintain + // whatever state is necessary to translate from UTF-16 to UTF-8 boundaries. + int Next(); + + // Returns the first UTF-16 boundary. The iterator's current position is set + // to the first text boundary and any cached data is cleared. + int First(); + + // Returns the position of the first UTF-16 boundary preceding the UTF-16 + // offset. If there is no boundary preceding the specified offset, then + // ReverseJniBreakIterator::kDone is returned. + // + // The iterator's current position is set to the segment whose boundary was + // returned and any cached data is cleared. + int Preceding(int offset); + + // Returns the position of the first UTF-16 boundary following the UTF-16 + // offset. 
If there is no boundary following the specified offset, then + // ReverseJniBreakIterator::kDone is returned. + // + // The iterator's current position is set to the segment whose boundary + // was returned and any cached data is cleared. + int Following(int offset); + + private: + ReverseJniBreakIterator( + const JniCache* jni_cache, + libtextclassifier3::ScopedGlobalRef<jobject> iterator_batcher); + + // Fetches the results of up to kBatchSize next calls and stores them in + // break_indices_cache_. Returns the number of results or kDone if no more + // results could be fetched. + int FetchNextBatch(); + + // Empties the cache and sets is_done_ and is_almost_done_ to false. + void ClearCache(); + + // Keeps track of references to Java classes and methods. Does NOT own. + const JniCache* jni_cache_; + + // The reference to the actual instance of BreakIteratorBatcher that + // this class interacts with. + libtextclassifier3::ScopedGlobalRef<jobject> iterator_batcher_; + + // The cache holding the most recent batch of return values from + // BreakIteratorBatcher#next. + std::queue<int> break_indices_cache_; + + bool is_done_; + + // The last batch was incomplete (< kBatchSize results were returned). The + // next call to BreakIteratorBatcher#next is guaranteed to return an + // empty array. Once the results from the last batch are evicted from + // break_indices_cache, ReverseJniBreakIterator will transition to is_done_. + bool is_almost_done_; +}; + +} // namespace lib +} // namespace icing + +#endif // ICING_JNI_REVERSE_JNI_BREAK_ITERATOR_H_ diff --git a/icing/legacy/index/icing-dynamic-trie.cc b/icing/legacy/index/icing-dynamic-trie.cc index 960d003..ee3d3a2 100644 --- a/icing/legacy/index/icing-dynamic-trie.cc +++ b/icing/legacy/index/icing-dynamic-trie.cc @@ -11,9 +11,6 @@ // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. 
- -// Copyright 2011 Google Inc. All Rights Reserved. -// Author: ulas@google.com (Ulas Kirazci) // // We store the trie in three areas: nodes, nexts and suffixes. // @@ -84,7 +81,7 @@ #include "icing/legacy/index/icing-filesystem.h" #include "icing/legacy/index/icing-flash-bitmap.h" #include "icing/legacy/index/icing-mmapper.h" -#include "icing/util/icu-i18n-utils.h" +#include "icing/util/i18n-utils.h" #include "icing/util/logging.h" #include "icing/util/math-util.h" @@ -250,6 +247,11 @@ class IcingDynamicTrie::IcingDynamicTrieStorage { const IcingFilesystem &filesystem); bool Sync(); uint64_t GetDiskUsage() const; + + // Returns the size of the elements held in the trie. This excludes the size + // of any internal metadata of the trie, e.g. the trie's header. + uint64_t GetElementsFileSize() const; + void Warm(); void Clear(); @@ -696,6 +698,18 @@ uint64_t IcingDynamicTrie::IcingDynamicTrieStorage::GetDiskUsage() const { return total; } +uint64_t IcingDynamicTrie::IcingDynamicTrieStorage::GetElementsFileSize() + const { + // Trie files themselves, exclude size of the header. These arrays are dense, + // not sparse, so use file size for more accurate numbers. + uint64_t total = 0; + for (int i = 0; i < NUM_ARRAY_TYPES; i++) { + IcingFilesystem::IncrementByOrSetInvalid( + filesystem_->GetFileSize(array_fds_[i].get()), &total); + } + return total; +} + IcingDynamicTrie::Node *IcingDynamicTrie::IcingDynamicTrieStorage::AllocNode() { if (nodes_left() == 0) { ICING_LOG(FATAL) << "No allocated nodes left"; @@ -1154,6 +1168,30 @@ uint64_t IcingDynamicTrie::GetDiskUsage() const { return total; } +uint64_t IcingDynamicTrie::GetElementsSize() const { + uint64_t total = 0; + + // Bitmaps are sparsely populated, so disk usage is more accurate for those. + // Property bitmaps. + IcingFilesystem::IncrementByOrSetInvalid(deleted_bitmap_->GetDiskUsage(), + &total); + // The deleted bitmap is always initially grown to kGrowSize, whether there + // are elements or not. 
So even if there are no elements in the trie, we'll + // still have the bitmap of size kGrowSize, so subtract that from the size of + // the trie's elements. + total -= IcingFlashBitmap::kGrowSize; + + for (auto &bitmap : property_bitmaps_) { + if (bitmap == nullptr) continue; + IcingFilesystem::IncrementByOrSetInvalid(bitmap->GetDiskUsage(), &total); + } + + // Storage. We can use file size here since the storage files aren't sparse. + IcingFilesystem::IncrementByOrSetInvalid(storage_->GetElementsFileSize(), + &total); + return total; +} + std::unique_ptr<IcingFlashBitmap> IcingDynamicTrie::OpenAndInitBitmap( const std::string &filename, bool verify, const IcingFilesystem *filesystem) { @@ -1868,7 +1906,7 @@ void IcingDynamicTrie::Utf8Iterator::LeftBranchToUtf8End() { // If we start with non-ascii, take all left branches while there is // a continuation byte. - if (!icu_i18n_utils::IsAscii(cur_[cur_len_ - 1])) { + if (!i18n_utils::IsAscii(cur_[cur_len_ - 1])) { while (!node->is_leaf()) { if (cur_len_ >= U8_MAX_LENGTH) break; @@ -1877,8 +1915,8 @@ void IcingDynamicTrie::Utf8Iterator::LeftBranchToUtf8End() { if (branch_end_->child->val() == 0) { // Check if we already have a valid cur_. 
cur_[cur_len_] = 0; - UChar32 uchar32 = icu_i18n_utils::GetUChar32At(cur_, cur_len_, 0); - if (uchar32 == icu_i18n_utils::kInvalidUChar32 && + UChar32 uchar32 = i18n_utils::GetUChar32At(cur_, cur_len_, 0); + if (uchar32 == i18n_utils::kInvalidUChar32 && node->log2_num_children() > 0) { branch_end_->child++; } else { diff --git a/icing/legacy/index/icing-dynamic-trie.h b/icing/legacy/index/icing-dynamic-trie.h index 6b39c56..7136ef8 100644 --- a/icing/legacy/index/icing-dynamic-trie.h +++ b/icing/legacy/index/icing-dynamic-trie.h @@ -48,7 +48,8 @@ #include "icing/legacy/index/icing-mmapper.h" #include "icing/legacy/index/icing-storage.h" #include "icing/legacy/index/proto/icing-dynamic-trie-header.pb.h" -#include "icing/util/icu-i18n-utils.h" +#include "icing/util/i18n-utils.h" +#include "unicode/utf8.h" namespace icing { namespace lib { @@ -265,6 +266,10 @@ class IcingDynamicTrie : public IIcingStorage { bool Remove() override; uint64_t GetDiskUsage() const override; + // Returns the size of the elements held in the trie. This excludes the size + // of any internal metadata of the trie, e.g. the trie's header. + uint64_t GetElementsSize() const; + // REQUIRED: For all functions below is_initialized() == true. // Number of keys in trie. diff --git a/icing/legacy/index/icing-flash-bitmap.h b/icing/legacy/index/icing-flash-bitmap.h index 9abd369..3b3521a 100644 --- a/icing/legacy/index/icing-flash-bitmap.h +++ b/icing/legacy/index/icing-flash-bitmap.h @@ -11,9 +11,6 @@ // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. - -// Copyright 2012 Google Inc. All Rights Reserved. -// Author: ulas@google.com (Ulas Kirazci) // // A disk-backed bitmap. 
// diff --git a/icing/proto/document.proto b/icing/proto/document.proto index 0a8b6f8..bed33b0 100644 --- a/icing/proto/document.proto +++ b/icing/proto/document.proto @@ -20,6 +20,7 @@ import "icing/proto/status.proto"; option java_package = "com.google.android.icing.proto"; option java_multiple_files = true; +option objc_class_prefix = "ICNG"; // Defines a unit of data understood by the IcingSearchEngine. // Next tag: 9 @@ -108,6 +109,7 @@ message PutResultProto { message GetResultProto { // Status code can be one of: // OK + // FAILED_PRECONDITION // NOT_FOUND // INTERNAL // @@ -127,6 +129,7 @@ message GetResultProto { message DeleteResultProto { // Status code can be one of: // OK + // FAILED_PRECONDITION // NOT_FOUND // INTERNAL // @@ -142,6 +145,7 @@ message DeleteResultProto { message DeleteByNamespaceResultProto { // Status code can be one of: // OK + // FAILED_PRECONDITION // NOT_FOUND // INTERNAL // @@ -157,6 +161,7 @@ message DeleteByNamespaceResultProto { message DeleteBySchemaTypeResultProto { // Status code can be one of: // OK + // FAILED_PRECONDITION // NOT_FOUND // INTERNAL // diff --git a/icing/proto/document_wrapper.proto b/icing/proto/document_wrapper.proto index 0666e72..e8eb992 100644 --- a/icing/proto/document_wrapper.proto +++ b/icing/proto/document_wrapper.proto @@ -21,6 +21,8 @@ import "icing/proto/document.proto"; option java_package = "com.google.android.icing.proto"; option java_multiple_files = true; +option objc_class_prefix = "ICNG"; + // DocumentWrapper as a wrapper of the user-facing DocumentProto is meant to // be used by icing team internally. 
It stores the original document provided // by library users and metadata of the document which shouldn't be exposed to diff --git a/icing/proto/initialize.proto b/icing/proto/initialize.proto index 813cdb5..eac88e6 100644 --- a/icing/proto/initialize.proto +++ b/icing/proto/initialize.proto @@ -21,6 +21,8 @@ import "icing/proto/status.proto"; option java_package = "com.google.android.icing.proto"; option java_multiple_files = true; +option objc_class_prefix = "ICNG"; + // Next tag: 5 message IcingSearchEngineOptions { // Directory to persist files for Icing. Required. diff --git a/icing/proto/optimize.proto b/icing/proto/optimize.proto index 2bf28e8..1baa64c 100644 --- a/icing/proto/optimize.proto +++ b/icing/proto/optimize.proto @@ -20,12 +20,14 @@ import "icing/proto/status.proto"; option java_package = "com.google.android.icing.proto"; option java_multiple_files = true; +option objc_class_prefix = "ICNG"; // Result of a call to IcingSearchEngine.Optimize // Next tag: 2 message OptimizeResultProto { // Status code can be one of: // OK + // FAILED_PRECONDITION // WARNING_DATA_LOSS // ABORTED // INTERNAL @@ -36,3 +38,23 @@ message OptimizeResultProto { // TODO(b/147699081): Add a field to indicate lost_schema and lost_documents. // go/icing-library-apis. } + +// Result of a call to IcingSearchEngine.GetOptimizeInfo +// Next tag: 4 +message GetOptimizeInfoResultProto { + // Status code can be one of: + // OK + // FAILED_PRECONDITION + // INTERNAL + // + // See status.proto for more details. + optional StatusProto status = 1; + + // Documents that have expired or been deleted, but are still taking up space + // in IcingSearchEngine. + optional int64 optimizable_docs = 2; + + // Estimated bytes that could be recovered. The exact size per document isn't + // tracked, so this is based off an average document size. 
+ optional int64 estimated_optimizable_bytes = 3; +} diff --git a/icing/proto/persist.proto b/icing/proto/persist.proto index 5b5a737..77cf987 100644 --- a/icing/proto/persist.proto +++ b/icing/proto/persist.proto @@ -20,12 +20,14 @@ import "icing/proto/status.proto"; option java_package = "com.google.android.icing.proto"; option java_multiple_files = true; +option objc_class_prefix = "ICNG"; // Result of a call to IcingSearchEngine.Persist // Next tag: 2 message PersistToDiskResultProto { // Status code can be one of: // OK + // FAILED_PRECONDITION // INTERNAL // // See status.proto for more details. diff --git a/icing/proto/reset.proto b/icing/proto/reset.proto index 9a7fa9a..5e8b9f5 100644 --- a/icing/proto/reset.proto +++ b/icing/proto/reset.proto @@ -21,6 +21,8 @@ import "icing/proto/status.proto"; option java_package = "com.google.android.icing.proto"; option java_multiple_files = true; +option objc_class_prefix = "ICNG"; + // Result of a call to IcingSearchEngine.Reset // Next tag: 2 message ResetResultProto { diff --git a/icing/proto/schema.proto b/icing/proto/schema.proto index cabccaa..3a7ee5d 100644 --- a/icing/proto/schema.proto +++ b/icing/proto/schema.proto @@ -21,6 +21,7 @@ import "icing/proto/term.proto"; option java_package = "com.google.android.icing.proto"; option java_multiple_files = true; +option objc_class_prefix = "ICNG"; // Defines the schema that every Document of a specific "type" should adhere // to. 
These can be considered as definitions of rich structured types for @@ -204,6 +205,7 @@ message SetSchemaResultProto { message GetSchemaResultProto { // Status code can be one of: // OK + // FAILED_PRECONDITION // NOT_FOUND // INTERNAL // diff --git a/icing/proto/scoring.proto b/icing/proto/scoring.proto index ad536b4..667ff4f 100644 --- a/icing/proto/scoring.proto +++ b/icing/proto/scoring.proto @@ -19,6 +19,8 @@ package icing.lib; option java_package = "com.google.android.icing.proto"; option java_multiple_files = true; +option objc_class_prefix = "ICNG"; + // Encapsulates the configurations on how Icing should score and rank the search // results. // Next tag: 3 @@ -26,9 +28,8 @@ message ScoringSpecProto { // OPTIONAL: Indicates how the search results will be ranked. message RankingStrategy { enum Code { - // No ranking strategy specified, documents will be returned in the - // default order that the most recent document inserted into Icing comes - // first. + // No ranking strategy specified, documents may be returned in an + // arbitrary order. NONE = 0; // Ranked by user-provided document scores. diff --git a/icing/proto/search.proto b/icing/proto/search.proto index 085575a..8ea5036 100644 --- a/icing/proto/search.proto +++ b/icing/proto/search.proto @@ -22,6 +22,7 @@ import "icing/proto/term.proto"; option java_package = "com.google.android.icing.proto"; option java_multiple_files = true; +option objc_class_prefix = "ICNG"; // Client-supplied specifications on what documents to retrieve. 
// Next tag: 5 @@ -148,6 +149,7 @@ message SnippetProto { message SearchResultProto { // Status code can be one of: // OK + // FAILED_PRECONDITION // INVALID_ARGUMENT // ABORTED // INTERNAL diff --git a/icing/proto/status.proto b/icing/proto/status.proto index 418b2e8..2733a15 100644 --- a/icing/proto/status.proto +++ b/icing/proto/status.proto @@ -19,6 +19,8 @@ package icing.lib; option java_package = "com.google.android.icing.proto"; option java_multiple_files = true; +option objc_class_prefix = "ICNG"; + // Canonical status to indicate the results of API calls. // Next tag: 3 message StatusProto { diff --git a/icing/proto/term.proto b/icing/proto/term.proto index 30cd1bc..adf2ad6 100644 --- a/icing/proto/term.proto +++ b/icing/proto/term.proto @@ -19,6 +19,8 @@ package icing.lib; option java_package = "com.google.android.icing.proto"; option java_multiple_files = true; +option objc_class_prefix = "ICNG"; + // Encapsulates the configurations on how Icing should query/index these terms. 
// Next tag: 0 message TermMatchType { diff --git a/icing/query/query-processor_benchmark.cc b/icing/query/query-processor_benchmark.cc index 5775e83..000bf3a 100644 --- a/icing/query/query-processor_benchmark.cc +++ b/icing/query/query-processor_benchmark.cc @@ -16,7 +16,7 @@ #include "gmock/gmock.h" #include "third_party/absl/flags/flag.h" #include "icing/document-builder.h" -#include "icing/icu-data-file-helper.h" +#include "icing/helpers/icu/icu-data-file-helper.h" #include "icing/index/index.h" #include "icing/proto/term.pb.h" #include "icing/query/query-processor.h" @@ -80,7 +80,7 @@ std::unique_ptr<Index> CreateIndex(const IcingFilesystem& filesystem, std::unique_ptr<Normalizer> CreateNormalizer() { return normalizer_factory::Create( - normalizer_factory::NormalizerType::ICU4C, + /*max_term_byte_size=*/std::numeric_limits<int>::max()) .ValueOrDie(); } @@ -108,8 +108,7 @@ void BM_QueryOneTerm(benchmark::State& state) { std::unique_ptr<Index> index = CreateIndex(icing_filesystem, index_dir); std::unique_ptr<LanguageSegmenter> language_segmenter = - language_segmenter_factory::Create(language_segmenter_factory::ICU4C) - .ValueOrDie(); + language_segmenter_factory::Create().ValueOrDie(); std::unique_ptr<Normalizer> normalizer = CreateNormalizer(); FakeClock fake_clock; @@ -221,8 +220,7 @@ void BM_QueryFiveTerms(benchmark::State& state) { std::unique_ptr<Index> index = CreateIndex(icing_filesystem, index_dir); std::unique_ptr<LanguageSegmenter> language_segmenter = - language_segmenter_factory::Create(language_segmenter_factory::ICU4C) - .ValueOrDie(); + language_segmenter_factory::Create().ValueOrDie(); std::unique_ptr<Normalizer> normalizer = CreateNormalizer(); FakeClock fake_clock; @@ -352,8 +350,7 @@ void BM_QueryDiacriticTerm(benchmark::State& state) { std::unique_ptr<Index> index = CreateIndex(icing_filesystem, index_dir); std::unique_ptr<LanguageSegmenter> language_segmenter = - language_segmenter_factory::Create(language_segmenter_factory::ICU4C) - 
.ValueOrDie(); + language_segmenter_factory::Create().ValueOrDie(); std::unique_ptr<Normalizer> normalizer = CreateNormalizer(); FakeClock fake_clock; @@ -468,8 +465,7 @@ void BM_QueryHiragana(benchmark::State& state) { std::unique_ptr<Index> index = CreateIndex(icing_filesystem, index_dir); std::unique_ptr<LanguageSegmenter> language_segmenter = - language_segmenter_factory::Create(language_segmenter_factory::ICU4C) - .ValueOrDie(); + language_segmenter_factory::Create().ValueOrDie(); std::unique_ptr<Normalizer> normalizer = CreateNormalizer(); FakeClock fake_clock; diff --git a/icing/query/query-processor_test.cc b/icing/query/query-processor_test.cc index 99a552e..7dfc326 100644 --- a/icing/query/query-processor_test.cc +++ b/icing/query/query-processor_test.cc @@ -22,7 +22,7 @@ #include "gtest/gtest.h" #include "icing/document-builder.h" #include "icing/file/filesystem.h" -#include "icing/icu-data-file-helper.h" +#include "icing/helpers/icu/icu-data-file-helper.h" #include "icing/index/hit/doc-hit-info.h" #include "icing/index/index.h" #include "icing/index/iterator/doc-hit-info-iterator-test-util.h" @@ -102,14 +102,11 @@ class QueryProcessorTest : public Test { ICING_ASSERT_OK_AND_ASSIGN(index_, Index::Create(options, &icing_filesystem_)); - ICING_ASSERT_OK_AND_ASSIGN( - language_segmenter_, - language_segmenter_factory::Create(language_segmenter_factory::ICU4C)); + ICING_ASSERT_OK_AND_ASSIGN(language_segmenter_, + language_segmenter_factory::Create()); - ICING_ASSERT_OK_AND_ASSIGN( - normalizer_, - normalizer_factory::Create(normalizer_factory::NormalizerType::ICU4C, - /*max_term_byte_size=*/1000)); + ICING_ASSERT_OK_AND_ASSIGN(normalizer_, normalizer_factory::Create( + /*max_term_byte_size=*/1000)); } libtextclassifier3::Status AddTokenToIndex( diff --git a/icing/result/result-retriever_test.cc b/icing/result/result-retriever_test.cc index cfce6e2..36dbfd9 100644 --- a/icing/result/result-retriever_test.cc +++ b/icing/result/result-retriever_test.cc @@ -20,7 
+20,7 @@ #include "gtest/gtest.h" #include "icing/document-builder.h" #include "icing/file/mock-filesystem.h" -#include "icing/icu-data-file-helper.h" +#include "icing/helpers/icu/icu-data-file-helper.h" #include "icing/portable/equals-proto.h" #include "icing/proto/document.pb.h" #include "icing/proto/schema.pb.h" @@ -59,16 +59,13 @@ class ResultRetrieverTest : public testing::Test { // File generated via icu_data_file rule in //icing/BUILD. icu_data_file_helper::SetUpICUDataFile( GetTestFilePath("icing/icu.dat"))); - ICING_ASSERT_OK_AND_ASSIGN( - language_segmenter_, - language_segmenter_factory::Create(language_segmenter_factory::ICU4C)); + ICING_ASSERT_OK_AND_ASSIGN(language_segmenter_, + language_segmenter_factory::Create()); ICING_ASSERT_OK_AND_ASSIGN(schema_store_, SchemaStore::Create(&filesystem_, test_dir_)); - ICING_ASSERT_OK_AND_ASSIGN( - normalizer_, - normalizer_factory::Create(normalizer_factory::NormalizerType::ICU4C, - /*max_term_byte_size=*/10000)); + ICING_ASSERT_OK_AND_ASSIGN(normalizer_, normalizer_factory::Create( + /*max_term_byte_size=*/10000)); SchemaProto schema; auto type_config = schema.add_types(); diff --git a/icing/result/snippet-retriever.cc b/icing/result/snippet-retriever.cc index faf9e18..09d0f7a 100644 --- a/icing/result/snippet-retriever.cc +++ b/icing/result/snippet-retriever.cc @@ -35,7 +35,7 @@ #include "icing/tokenization/tokenizer-factory.h" #include "icing/tokenization/tokenizer.h" #include "icing/transform/normalizer.h" -#include "icing/util/icu-i18n-utils.h" +#include "icing/util/i18n-utils.h" #include "icing/util/status-macros.h" namespace icing { @@ -126,19 +126,18 @@ libtextclassifier3::StatusOr<std::unique_ptr<TokenMatcher>> CreateTokenMatcher( // Returns true if token matches any of the terms in query terms according to // the provided match type. 
- +// // Returns: // the position of the window start if successful // INTERNAL_ERROR - if a tokenizer error is encountered libtextclassifier3::StatusOr<int> DetermineWindowStart( const ResultSpecProto::SnippetSpecProto& snippet_spec, std::string_view value, int match_mid, Tokenizer::Iterator* iterator) { - int window_start_min = - std::max((match_mid - snippet_spec.max_window_bytes() / 2), 0); - if (window_start_min == 0) { + int window_start_min = (match_mid - snippet_spec.max_window_bytes() / 2) - 1; + if (window_start_min < 0) { return 0; } - if (!iterator->ResetToTokenAfter(window_start_min - 1)) { + if (!iterator->ResetToTokenAfter(window_start_min)) { return absl_ports::InternalError( "Couldn't reset tokenizer to determine snippet window!"); } @@ -152,8 +151,7 @@ int IncludeTrailingPunctuation(std::string_view value, int window_end_exclusive, int window_end_max_exclusive) { while (window_end_exclusive < window_end_max_exclusive) { int char_len = 0; - if (!icu_i18n_utils::IsPunctuationAt(value, window_end_exclusive, - &char_len)) { + if (!i18n_utils::IsPunctuationAt(value, window_end_exclusive, &char_len)) { break; } if (window_end_exclusive + char_len > window_end_max_exclusive) { @@ -174,10 +172,9 @@ libtextclassifier3::StatusOr<int> DetermineWindowEnd( const ResultSpecProto::SnippetSpecProto& snippet_spec, std::string_view value, int match_mid, Tokenizer::Iterator* iterator) { int window_end_max_exclusive = - std::min((match_mid + snippet_spec.max_window_bytes() / 2), - static_cast<int>(value.length())); - if (window_end_max_exclusive == value.length()) { - return window_end_max_exclusive; + match_mid + snippet_spec.max_window_bytes() / 2; + if (window_end_max_exclusive >= value.length()) { + return value.length(); } if (!iterator->ResetToTokenBefore(window_end_max_exclusive)) { return absl_ports::InternalError( @@ -228,8 +225,11 @@ libtextclassifier3::StatusOr<SnippetMatchProto> RetrieveMatch( iterator)); 
snippet_match.set_window_bytes(window_end_exclusive - window_start); - // Reset the iterator back to the original position. - if (!iterator->ResetToTokenAfter(match_pos - 1)) { + // DetermineWindowStart/End may change the position of the iterator. So, + // reset the iterator back to the original position. + bool success = (match_pos > 0) ? iterator->ResetToTokenAfter(match_pos - 1) + : iterator->ResetToStart(); + if (!success) { return absl_ports::InternalError( "Couldn't reset tokenizer to determine snippet window!"); } diff --git a/icing/result/snippet-retriever_test.cc b/icing/result/snippet-retriever_test.cc index 7037ede..3b3bf61 100644 --- a/icing/result/snippet-retriever_test.cc +++ b/icing/result/snippet-retriever_test.cc @@ -22,7 +22,7 @@ #include "gtest/gtest.h" #include "icing/document-builder.h" #include "icing/file/mock-filesystem.h" -#include "icing/icu-data-file-helper.h" +#include "icing/helpers/icu/icu-data-file-helper.h" #include "icing/portable/equals-proto.h" #include "icing/proto/document.pb.h" #include "icing/proto/schema.pb.h" @@ -60,9 +60,8 @@ class SnippetRetrieverTest : public testing::Test { // File generated via icu_data_file rule in //icing/BUILD. 
icu_data_file_helper::SetUpICUDataFile( GetTestFilePath("icing/icu.dat"))); - ICING_ASSERT_OK_AND_ASSIGN( - language_segmenter_, - language_segmenter_factory::Create(language_segmenter_factory::ICU4C)); + ICING_ASSERT_OK_AND_ASSIGN(language_segmenter_, + language_segmenter_factory::Create()); // Setup the schema ICING_ASSERT_OK_AND_ASSIGN(schema_store_, @@ -88,10 +87,8 @@ class SnippetRetrieverTest : public testing::Test { IndexingConfig::TokenizerType::PLAIN); ICING_ASSERT_OK(schema_store_->SetSchema(schema)); - ICING_ASSERT_OK_AND_ASSIGN( - normalizer_, - normalizer_factory::Create(normalizer_factory::NormalizerType::ICU4C, - /*max_term_byte_size=*/10000)); + ICING_ASSERT_OK_AND_ASSIGN(normalizer_, normalizer_factory::Create( + /*max_term_byte_size=*/10000)); ICING_ASSERT_OK_AND_ASSIGN( snippet_retriever_, SnippetRetriever::Create(schema_store_.get(), language_segmenter_.get(), diff --git a/icing/schema/schema-util.cc b/icing/schema/schema-util.cc index df5a820..7413d73 100644 --- a/icing/schema/schema-util.cc +++ b/icing/schema/schema-util.cc @@ -256,23 +256,29 @@ void SchemaUtil::BuildTypeConfigMap( } } -void SchemaUtil::BuildPropertyConfigMap( - const SchemaTypeConfigProto& type_config, - std::unordered_map<std::string_view, const PropertyConfigProto*>* - property_config_map, - int32_t* num_required_properties) { +SchemaUtil::ParsedPropertyConfigs SchemaUtil::ParsePropertyConfigs( + const SchemaTypeConfigProto& type_config) { + ParsedPropertyConfigs parsed_property_configs; + // TODO(samzheng): consider caching property_config_map for some properties, // e.g. using LRU cache. Or changing schema.proto to use go/protomap. 
- *num_required_properties = 0; - property_config_map->clear(); for (const PropertyConfigProto& property_config : type_config.properties()) { - property_config_map->emplace(property_config.property_name(), - &property_config); + parsed_property_configs.property_config_map.emplace( + property_config.property_name(), &property_config); if (property_config.cardinality() == PropertyConfigProto::Cardinality::REQUIRED) { - (*num_required_properties)++; + parsed_property_configs.num_required_properties++; + } + + // A non-default term_match_type indicates that this property is meant to be + // indexed. + if (property_config.indexing_config().term_match_type() != + TermMatchType::UNKNOWN) { + parsed_property_configs.num_indexed_properties++; } } + + return parsed_property_configs; } const SchemaUtil::SchemaDelta SchemaUtil::ComputeCompatibilityDelta( @@ -298,22 +304,21 @@ const SchemaUtil::SchemaDelta SchemaUtil::ComputeCompatibilityDelta( continue; } - std::unordered_map<std::string_view, const PropertyConfigProto*> - new_property_map; - int32_t new_required_properties = 0; - BuildPropertyConfigMap(new_schema_type_and_config->second, - &new_property_map, &new_required_properties); + ParsedPropertyConfigs new_parsed_property_configs = + ParsePropertyConfigs(new_schema_type_and_config->second); // We only need to check the old, existing properties to see if they're // compatible since we'll have old data that may be invalidated or need to - // be reindexed. New properties don't have any data that would be - // invalidated or incompatible, so we blanket accept all new properties. + // be reindexed. 
int32_t old_required_properties = 0; + int32_t old_indexed_properties = 0; for (const auto& old_property_config : old_type_config.properties()) { auto new_property_name_and_config = - new_property_map.find(old_property_config.property_name()); + new_parsed_property_configs.property_config_map.find( + old_property_config.property_name()); - if (new_property_name_and_config == new_property_map.end()) { + if (new_property_name_and_config == + new_parsed_property_configs.property_config_map.end()) { // Didn't find the old property ICING_VLOG(1) << absl_ports::StrCat("Previously defined property type ", old_type_config.schema_type(), ".", @@ -340,6 +345,13 @@ const SchemaUtil::SchemaDelta SchemaUtil::ComputeCompatibilityDelta( ++old_required_properties; } + // A non-default term_match_type indicates that this property is meant to + // be indexed. + if (old_property_config.indexing_config().term_match_type() != + TermMatchType::UNKNOWN) { + ++old_indexed_properties; + } + // Any change in the indexed property requires a reindexing if (!IsTermMatchTypeCompatible(old_property_config.indexing_config(), new_property_config->indexing_config())) { @@ -352,7 +364,8 @@ const SchemaUtil::SchemaDelta SchemaUtil::ComputeCompatibilityDelta( // guaranteed from our previous checks that all the old properties are also // present in the new property config, so we can do a simple int comparison // here to detect new required properties. - if (new_required_properties > old_required_properties) { + if (new_parsed_property_configs.num_required_properties > + old_required_properties) { ICING_VLOG(1) << absl_ports::StrCat( "New schema ", old_type_config.schema_type(), " has REQUIRED properties that are not " @@ -360,6 +373,18 @@ const SchemaUtil::SchemaDelta SchemaUtil::ComputeCompatibilityDelta( schema_delta.schema_types_incompatible.insert( old_type_config.schema_type()); } + + // If we've gained any new indexed properties, then the section ids may + // change. 
Since the section ids are stored in the index, we'll need to + // reindex everything. + if (new_parsed_property_configs.num_indexed_properties > + old_indexed_properties) { + ICING_VLOG(1) << absl_ports::StrCat( + "Set of indexed properties in schema type '", + old_type_config.schema_type(), + "' has changed, required reindexing."); + schema_delta.index_incompatible = true; + } } return schema_delta; diff --git a/icing/schema/schema-util.h b/icing/schema/schema-util.h index c547ad2..d65dd10 100644 --- a/icing/schema/schema-util.h +++ b/icing/schema/schema-util.h @@ -54,6 +54,18 @@ class SchemaUtil { } }; + struct ParsedPropertyConfigs { + // Mapping of property name to PropertyConfigProto + std::unordered_map<std::string_view, const PropertyConfigProto*> + property_config_map; + + // Total number of properties that have an indexing config + int32_t num_indexed_properties = 0; + + // Total number of properties that were REQUIRED + int32_t num_required_properties = 0; + }; + // This function validates: // 1. SchemaTypeConfigProto.schema_type's must be unique // 2. Properties within one SchemaTypeConfigProto must be unique @@ -81,14 +93,10 @@ class SchemaUtil { static void BuildTypeConfigMap(const SchemaProto& schema, TypeConfigMap* type_config_map); - // Calculate and return a hash map of (property name -> property config) - // from the given type config. The number of required properties will be - // assigned to output param num_required_properties. - static void BuildPropertyConfigMap( - const SchemaTypeConfigProto& type_config, - std::unordered_map<std::string_view, const PropertyConfigProto*>* - property_config_map, - int32_t* num_required_properties); + // Parses the given type_config and returns a struct of easily-parseable + // information about the properties. + static ParsedPropertyConfigs ParsePropertyConfigs( + const SchemaTypeConfigProto& type_config); // Computes the delta between the old and new schema. 
There are a few // differences that'll be reported: diff --git a/icing/schema/schema-util_test.cc b/icing/schema/schema-util_test.cc index 64473b8..a3ab96f 100644 --- a/icing/schema/schema-util_test.cc +++ b/icing/schema/schema-util_test.cc @@ -502,6 +502,40 @@ TEST_F(SchemaUtilTest, ChangingIndexedPropertiesMakesIndexIncompatible) { Eq(schema_delta)); } +TEST_F(SchemaUtilTest, AddingNewIndexedPropertyMakesIndexIncompatible) { + // Configure old schema + SchemaProto old_schema; + auto old_type = old_schema.add_types(); + *old_type = CreateSchemaTypeConfig(kEmailType, kPersonType); + + auto old_property = old_type->add_properties(); + old_property->set_property_name("Property"); + old_property->set_data_type(PropertyConfigProto::DataType::STRING); + old_property->set_cardinality(PropertyConfigProto::Cardinality::OPTIONAL); + + // Configure new schema + SchemaProto new_schema; + auto new_type = new_schema.add_types(); + *new_type = CreateSchemaTypeConfig(kEmailType, kPersonType); + + auto new_property = new_type->add_properties(); + new_property->set_property_name("Property"); + new_property->set_data_type(PropertyConfigProto::DataType::STRING); + new_property->set_cardinality(PropertyConfigProto::Cardinality::OPTIONAL); + + new_property = new_type->add_properties(); + new_property->set_property_name("NewIndexedProperty"); + new_property->set_data_type(PropertyConfigProto::DataType::STRING); + new_property->set_cardinality(PropertyConfigProto::Cardinality::OPTIONAL); + new_property->mutable_indexing_config()->set_term_match_type( + TermMatchType::EXACT_ONLY); + + SchemaUtil::SchemaDelta schema_delta; + schema_delta.index_incompatible = true; + EXPECT_THAT(SchemaUtil::ComputeCompatibilityDelta(old_schema, new_schema), + Eq(schema_delta)); +} + TEST_F(SchemaUtilTest, AddingTypeIsCompatible) { // Can add a new type, existing data isn't incompatible, since none of them // are of this new schema type diff --git a/icing/store/document-store.cc 
b/icing/store/document-store.cc index e2457d0..ae8360b 100644 --- a/icing/store/document-store.cc +++ b/icing/store/document-store.cc @@ -1235,6 +1235,59 @@ libtextclassifier3::Status DocumentStore::OptimizeInto( return libtextclassifier3::Status::OK; } +libtextclassifier3::StatusOr<DocumentStore::OptimizeInfo> +DocumentStore::GetOptimizeInfo() const { + OptimizeInfo optimize_info; + + // Figure out our ratio of optimizable/total docs. + int32_t num_documents = document_id_mapper_->num_elements(); + for (DocumentId document_id = kMinDocumentId; document_id < num_documents; + ++document_id) { + if (!DoesDocumentExist(document_id)) { + ++optimize_info.optimizable_docs; + } + + ++optimize_info.total_docs; + } + + if (optimize_info.total_docs == 0) { + // Can exit early since there's nothing to calculate. + return optimize_info; + } + + // Get the total element size. + // + // We use file size instead of disk usage here because the files are not + // sparse, so it's more accurate. Disk usage rounds up to the nearest block + // size. + ICING_ASSIGN_OR_RETURN(const int64_t document_log_file_size, + document_log_->GetElementsFileSize()); + ICING_ASSIGN_OR_RETURN(const int64_t document_id_mapper_file_size, + document_id_mapper_->GetElementsFileSize()); + ICING_ASSIGN_OR_RETURN(const int64_t score_cache_file_size, + score_cache_->GetElementsFileSize()); + ICING_ASSIGN_OR_RETURN(const int64_t filter_cache_file_size, + filter_cache_->GetElementsFileSize()); + + // We use a combined disk usage and file size for the KeyMapper because it's + // backed by a trie, which has some sparse property bitmaps. + ICING_ASSIGN_OR_RETURN(const int64_t document_key_mapper_size, + document_key_mapper_->GetElementsSize()); + + // We don't include the namespace mapper because it's not clear if we could + // recover any space even if Optimize were called. Deleting 100s of documents + // could still leave a few documents of a namespace, and then there would be + // no change. 
+ + int64_t total_size = document_log_file_size + document_key_mapper_size + + document_id_mapper_file_size + score_cache_file_size + + filter_cache_file_size; + + optimize_info.estimated_optimizable_bytes = + total_size * optimize_info.optimizable_docs / optimize_info.total_docs; + return optimize_info; +} + libtextclassifier3::Status DocumentStore::UpdateDocumentAssociatedScoreCache( DocumentId document_id, const DocumentAssociatedScoreData& score_data) { return score_cache_->Set(document_id, score_data); diff --git a/icing/store/document-store.h b/icing/store/document-store.h index 891b199..3547214 100644 --- a/icing/store/document-store.h +++ b/icing/store/document-store.h @@ -54,6 +54,20 @@ class DocumentStore { uint32_t checksum; }; + struct OptimizeInfo { + // The estimated size in bytes of the optimizable docs. We don't track the + // size of each document, so we estimate by taking the size of the entire + // DocumentStore and dividing that by the total number of documents we have. + // So we end up with an average document size. + int64_t estimated_optimizable_bytes = 0; + + // Number of total documents the DocumentStore tracks. + int32_t total_docs = 0; + + // Number of optimizable (deleted + expired) docs the DocumentStore tracks. + int32_t optimizable_docs = 0; + }; + // Not copyable DocumentStore(const DocumentStore&) = delete; DocumentStore& operator=(const DocumentStore&) = delete; @@ -208,7 +222,8 @@ class DocumentStore { // INTERNAL on I/O error libtextclassifier3::Status PersistToDisk(); - // Calculates and returns the disk usage in bytes. + // Calculates and returns the disk usage in bytes. Rounds up to the nearest + // block size. // // Returns: // Disk usage on success @@ -273,6 +288,15 @@ class DocumentStore { // INTERNAL_ERROR on IO error libtextclassifier3::Status OptimizeInto(const std::string& new_directory); + // Calculates status for a potential Optimize call. Includes how many docs + // there are vs how many would be optimized away. 
And also includes an + // estimated size gains, in bytes, if Optimize were called. + // + // Returns: + // OptimizeInfo on success + // INTERNAL_ERROR on IO error + libtextclassifier3::StatusOr<OptimizeInfo> GetOptimizeInfo() const; + // Computes the combined checksum of the document store - includes the ground // truth and all derived files. // diff --git a/icing/store/document-store_test.cc b/icing/store/document-store_test.cc index 5ec062f..f59d2e2 100644 --- a/icing/store/document-store_test.cc +++ b/icing/store/document-store_test.cc @@ -1966,5 +1966,53 @@ TEST_F(DocumentStoreTest, IsOkAndHolds(EqualsProto(message_document))); } +TEST_F(DocumentStoreTest, GetOptimizeInfo) { + ICING_ASSERT_OK_AND_ASSIGN( + std::unique_ptr<DocumentStore> document_store, + DocumentStore::Create(&filesystem_, document_store_dir_, &fake_clock_, + schema_store_.get())); + + // Nothing should be optimizable yet + ICING_ASSERT_OK_AND_ASSIGN(DocumentStore::OptimizeInfo optimize_info, + document_store->GetOptimizeInfo()); + EXPECT_THAT(optimize_info.total_docs, Eq(0)); + EXPECT_THAT(optimize_info.optimizable_docs, Eq(0)); + EXPECT_THAT(optimize_info.estimated_optimizable_bytes, Eq(0)); + + ICING_EXPECT_OK(document_store->Put(DocumentProto(test_document1_))); + + // Adding a document, still nothing is optimizable + ICING_ASSERT_OK_AND_ASSIGN(optimize_info, document_store->GetOptimizeInfo()); + EXPECT_THAT(optimize_info.total_docs, Eq(1)); + EXPECT_THAT(optimize_info.optimizable_docs, Eq(0)); + EXPECT_THAT(optimize_info.estimated_optimizable_bytes, Eq(0)); + + // Delete a document. 
Now something is optimizable + ICING_EXPECT_OK(document_store->Delete(test_document1_.namespace_(), + test_document1_.uri())); + ICING_ASSERT_OK_AND_ASSIGN(optimize_info, document_store->GetOptimizeInfo()); + EXPECT_THAT(optimize_info.total_docs, Eq(1)); + EXPECT_THAT(optimize_info.optimizable_docs, Eq(1)); + EXPECT_THAT(optimize_info.estimated_optimizable_bytes, Gt(0)); + + // Optimize it into a different directory, should bring us back to nothing + // since all documents were optimized away. + std::string optimized_dir = document_store_dir_ + "_optimize"; + EXPECT_TRUE(filesystem_.DeleteDirectoryRecursively(optimized_dir.c_str())); + EXPECT_TRUE(filesystem_.CreateDirectoryRecursively(optimized_dir.c_str())); + ICING_ASSERT_OK(document_store->OptimizeInto(optimized_dir)); + document_store.reset(); + ICING_ASSERT_OK_AND_ASSIGN( + std::unique_ptr<DocumentStore> optimized_document_store, + DocumentStore::Create(&filesystem_, optimized_dir, &fake_clock_, + schema_store_.get())); + + ICING_ASSERT_OK_AND_ASSIGN(optimize_info, + optimized_document_store->GetOptimizeInfo()); + EXPECT_THAT(optimize_info.total_docs, Eq(0)); + EXPECT_THAT(optimize_info.optimizable_docs, Eq(0)); + EXPECT_THAT(optimize_info.estimated_optimizable_bytes, Eq(0)); +} + } // namespace lib } // namespace icing diff --git a/icing/store/key-mapper.h b/icing/store/key-mapper.h index b01a8f1..a85b00d 100644 --- a/icing/store/key-mapper.h +++ b/icing/store/key-mapper.h @@ -99,13 +99,23 @@ class KeyMapper { // INTERNAL on I/O error libtextclassifier3::Status PersistToDisk(); - // Calculates and returns the disk usage in bytes. + // Calculates and returns the disk usage in bytes. Rounds up to the nearest + // block size. // // Returns: // Disk usage on success // INTERNAL_ERROR on IO error libtextclassifier3::StatusOr<int64_t> GetDiskUsage() const; + // Returns the size of the elements held in the key mapper. This excludes the + // size of any internal metadata of the key mapper, e.g. 
the key mapper's + // header. + // + // Returns: + // File size on success + // INTERNAL_ERROR on IO error + libtextclassifier3::StatusOr<int64_t> GetElementsSize() const; + // Computes and returns the checksum of the header and contents. Crc32 ComputeChecksum(); @@ -261,6 +271,16 @@ libtextclassifier3::StatusOr<int64_t> KeyMapper<T>::GetDiskUsage() const { } template <typename T> +libtextclassifier3::StatusOr<int64_t> KeyMapper<T>::GetElementsSize() const { + int64_t size = trie_.GetElementsSize(); + if (size == IcingFilesystem::kBadFileSize || size < 0) { + return absl_ports::InternalError( + "Failed to get disk usage of elements in the key mapper"); + } + return size; +} + +template <typename T> Crc32 KeyMapper<T>::ComputeChecksum() { return Crc32(trie_.UpdateCrc()); } diff --git a/icing/testing/logging-event-listener.cc b/icing/testing/logging-event-listener.cc new file mode 100644 index 0000000..4b42825 --- /dev/null +++ b/icing/testing/logging-event-listener.cc @@ -0,0 +1,121 @@ +// Copyright (C) 2019 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "icing/testing/logging-event-listener.h" + +#include "icing/legacy/core/icing-string-util.h" +#include "icing/util/logging.h" + +namespace icing { +namespace lib { + +void LoggingEventListener::OnTestProgramStart( + const testing::UnitTest& /* unit_test */) {} + +void LoggingEventListener::OnTestIterationStart( + const testing::UnitTest& unit_test, int iteration) { + ICING_LOG(INFO) << "[==========] Running " << unit_test.test_to_run_count() + << " test(s) from " << unit_test.test_case_to_run_count() + << " test case(s)"; +} + +void LoggingEventListener::OnEnvironmentsSetUpStart( + const testing::UnitTest& unit_test) { + ICING_LOG(INFO) << "[----------] Global test environment set-up."; +} + +void LoggingEventListener::OnEnvironmentsSetUpEnd( + const testing::UnitTest& /* unit_test */) {} + +void LoggingEventListener::OnTestCaseStart(const testing::TestCase& test_case) { + std::string param_text; + if (test_case.type_param()) { + param_text = IcingStringUtil::StringPrintf(", where TypeParam = %s", + test_case.type_param()); + } + ICING_LOG(INFO) << "[----------] " << test_case.test_to_run_count() + << " test(s) from " << test_case.name() << param_text; +} + +void LoggingEventListener::OnTestStart(const testing::TestInfo& test_info) { + ICING_LOG(INFO) << "[ RUN ] " << test_info.test_case_name() << "." + << test_info.name(); +} + +void LoggingEventListener::OnTestPartResult( + const testing::TestPartResult& test_part_result) { + if (test_part_result.type() != testing::TestPartResult::kSuccess) { + ICING_LOG(ERROR) << test_part_result.file_name() << ":" + << test_part_result.line_number() << ": Failure " + << test_part_result.message(); + } +} + +void LoggingEventListener::OnTestEnd(const testing::TestInfo& test_info) { + if (test_info.result()->Passed()) { + ICING_LOG(INFO) << "[ OK ] " << test_info.test_case_name() << "." + << test_info.name(); + } else { + ICING_LOG(ERROR) << "[ FAILED ] " << test_info.test_case_name() << "." 
+ << test_info.name(); + } +} + +void LoggingEventListener::OnTestCaseEnd(const testing::TestCase& test_case) { + ICING_LOG(INFO) << "[----------] " << test_case.test_to_run_count() + << " test(s) from " << test_case.name() << " (" + << test_case.elapsed_time() << " ms total)"; +} + +void LoggingEventListener::OnEnvironmentsTearDownStart( + const testing::UnitTest& unit_test) { + ICING_LOG(INFO) << "[----------] Global test environment tear-down."; +} + +void LoggingEventListener::OnEnvironmentsTearDownEnd( + const testing::UnitTest& /* unit_test */) {} + +void LoggingEventListener::OnTestIterationEnd( + const testing::UnitTest& unit_test, int iteration) { + ICING_LOG(INFO) << "[==========] " << unit_test.test_to_run_count() + << " test(s) from " << unit_test.test_case_to_run_count() + << " test case(s) ran. (" << unit_test.elapsed_time() + << " ms total)"; + ICING_LOG(INFO) << "[ PASSED ] " << unit_test.successful_test_count() + << " test(s)"; + if (!unit_test.Passed()) { + ICING_LOG(ERROR) << "[ FAILED ] " << unit_test.failed_test_count() + << " test(s), listed below:"; + for (int i = 0; i < unit_test.total_test_case_count(); ++i) { + const testing::TestCase& test_case = *unit_test.GetTestCase(i); + if (!test_case.should_run() || (test_case.failed_test_count() == 0)) { + continue; + } + for (int j = 0; j < test_case.total_test_count(); ++j) { + const testing::TestInfo& test_info = *test_case.GetTestInfo(j); + if (!test_info.should_run() || test_info.result()->Passed()) { + continue; + } + ICING_LOG(ERROR) << "[ FAILED ] " << test_case.name() << "." 
+ << test_info.name(); + } + } + } +} + +void LoggingEventListener::OnTestProgramEnd( + const testing::UnitTest& /* unit_test */) {} + +} // namespace lib +} // namespace icing diff --git a/icing/testing/logging-event-listener.h b/icing/testing/logging-event-listener.h new file mode 100644 index 0000000..8024222 --- /dev/null +++ b/icing/testing/logging-event-listener.h @@ -0,0 +1,62 @@ +// Copyright (C) 2019 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef ICING_TESTING_LOGGING_EVENT_LISTENER_H_ +#define ICING_TESTING_LOGGING_EVENT_LISTENER_H_ + +#include "gtest/gtest.h" + +namespace icing { +namespace lib { + +// TestEventListener that writes test results to the log so that they will be +// visible in the logcat output in Sponge. +// The formatting of the output is patterned after the output produced by the +// standard PrettyUnitTestResultPrinter. 
+class LoggingEventListener : public ::testing::TestEventListener { + public: + void OnTestProgramStart(const testing::UnitTest& unit_test) override; + + void OnTestIterationStart(const testing::UnitTest& unit_test, + int iteration) override; + + void OnEnvironmentsSetUpStart(const testing::UnitTest& unit_test) override; + + void OnEnvironmentsSetUpEnd(const testing::UnitTest& unit_test) override; + + void OnTestCaseStart(const testing::TestCase& test_case) override; + + void OnTestStart(const testing::TestInfo& test_info) override; + + void OnTestPartResult( + const testing::TestPartResult& test_part_result) override; + + void OnTestEnd(const testing::TestInfo& test_info) override; + + void OnTestCaseEnd(const testing::TestCase& test_case) override; + + void OnEnvironmentsTearDownStart(const testing::UnitTest& unit_test) override; + + void OnEnvironmentsTearDownEnd(const testing::UnitTest& unit_test) override; + + void OnTestIterationEnd(const testing::UnitTest& unit_test, + int iteration) override; + + void OnTestProgramEnd(const testing::UnitTest& unit_test) override; +}; + +} // namespace lib +} // namespace icing + +#endif // ICING_TESTING_LOGGING_EVENT_LISTENER_H_ diff --git a/icing/text_classifier/lib3/utils/java/jni-base.cc b/icing/text_classifier/lib3/utils/java/jni-base.cc new file mode 100644 index 0000000..3b6d09e --- /dev/null +++ b/icing/text_classifier/lib3/utils/java/jni-base.cc @@ -0,0 +1,44 @@ +// Copyright (C) 2019 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +#include "icing/text_classifier/lib3/utils/java/jni-base.h" + +#include "icing/text_classifier/lib3/utils/base/status.h" +#include "icing/text_classifier/lib3/utils/java/string_utils.h" + +namespace libtextclassifier3 { + +bool EnsureLocalCapacity(JNIEnv* env, int capacity) { + return env->EnsureLocalCapacity(capacity) == JNI_OK; +} + +bool JniExceptionCheckAndClear(JNIEnv* env) { + TC3_CHECK(env != nullptr); + const bool result = env->ExceptionCheck(); + if (result) { + env->ExceptionDescribe(); + env->ExceptionClear(); + } + return result; +} + +StatusOr<std::string> ToStlString(JNIEnv* env, const jstring& str) { + std::string result; + if (!JStringToUtf8String(env, str, &result)) { + return {Status::UNKNOWN}; + } + return result; +} + +} // namespace libtextclassifier3 diff --git a/icing/text_classifier/lib3/utils/java/jni-base.h b/icing/text_classifier/lib3/utils/java/jni-base.h new file mode 100644 index 0000000..7fd612a --- /dev/null +++ b/icing/text_classifier/lib3/utils/java/jni-base.h @@ -0,0 +1,217 @@ +// Copyright (C) 2019 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#ifndef ICING_TEXT_CLASSIFIER_LIB3_UTILS_JAVA_JNI_BASE_H_ +#define ICING_TEXT_CLASSIFIER_LIB3_UTILS_JAVA_JNI_BASE_H_ + +#include <jni.h> + +#include <string> + +#include "icing/text_classifier/lib3/utils/base/statusor.h" + +// When we use a macro as an argument for a macro, an additional level of +// indirection is needed, if the macro argument is used with # or ##. +#define TC3_ADD_QUOTES_HELPER(TOKEN) #TOKEN +#define TC3_ADD_QUOTES(TOKEN) TC3_ADD_QUOTES_HELPER(TOKEN) + +#ifndef TC3_PACKAGE_NAME +#define TC3_PACKAGE_NAME com_google_knowledge_cerebra_sense_textclassifier_lib3 +#endif + +#ifndef TC3_PACKAGE_PATH +#define TC3_PACKAGE_PATH \ + "com/google/knowledge/cerebra/sense/textclassifier/lib3/" +#endif + +#define TC3_JNI_METHOD_NAME_INTERNAL(package_name, class_name, method_name) \ + Java_##package_name##_##class_name##_##method_name + +#define TC3_JNI_METHOD_PRIMITIVE(return_type, package_name, class_name, \ + method_name) \ + JNIEXPORT return_type JNICALL TC3_JNI_METHOD_NAME_INTERNAL( \ + package_name, class_name, method_name) + +// The indirection is needed to correctly expand the TC3_PACKAGE_NAME macro. +// See the explanation near TC3_ADD_QUOTES macro. +#define TC3_JNI_METHOD2(return_type, package_name, class_name, method_name) \ + TC3_JNI_METHOD_PRIMITIVE(return_type, package_name, class_name, method_name) + +#define TC3_JNI_METHOD(return_type, class_name, method_name) \ + TC3_JNI_METHOD2(return_type, TC3_PACKAGE_NAME, class_name, method_name) + +#define TC3_JNI_METHOD_NAME2(package_name, class_name, method_name) \ + TC3_JNI_METHOD_NAME_INTERNAL(package_name, class_name, method_name) + +#define TC3_JNI_METHOD_NAME(class_name, method_name) \ + TC3_JNI_METHOD_NAME2(TC3_PACKAGE_NAME, class_name, method_name) + +namespace libtextclassifier3 { + +// Returns true if the requested capacity is available. +bool EnsureLocalCapacity(JNIEnv* env, int capacity); + +// Returns true if there was an exception. Also it clears the exception. 
+bool JniExceptionCheckAndClear(JNIEnv* env); + +StatusOr<std::string> ToStlString(JNIEnv* env, const jstring& str); + +// A deleter to be used with std::unique_ptr to delete JNI global references. +class GlobalRefDeleter { + public: + explicit GlobalRefDeleter(JavaVM* jvm) : jvm_(jvm) {} + + GlobalRefDeleter(const GlobalRefDeleter& orig) = default; + + // Copy assignment to allow move semantics in ScopedGlobalRef. + GlobalRefDeleter& operator=(const GlobalRefDeleter& rhs) { + TC3_CHECK_EQ(jvm_, rhs.jvm_); + return *this; + } + + // The delete operator. + void operator()(jobject object) const { + JNIEnv* env; + if (object != nullptr && jvm_ != nullptr && + JNI_OK == + jvm_->GetEnv(reinterpret_cast<void**>(&env), JNI_VERSION_1_4)) { + env->DeleteGlobalRef(object); + } + } + + private: + // The jvm_ stashed to use for deletion. + JavaVM* const jvm_; +}; + +// A deleter to be used with std::unique_ptr to delete JNI local references. +class LocalRefDeleter { + public: + explicit LocalRefDeleter(JNIEnv* env) + : env_(env) {} // NOLINT(runtime/explicit) + + LocalRefDeleter(const LocalRefDeleter& orig) = default; + + // Copy assignment to allow move semantics in ScopedLocalRef. + LocalRefDeleter& operator=(const LocalRefDeleter& rhs) { + env_ = rhs.env_; + return *this; + } + + // The delete operator. + void operator()(jobject object) const { + if (env_) { + env_->DeleteLocalRef(object); + } + } + + private: + // The env_ stashed to use for deletion. Thread-local, don't share! + JNIEnv* env_; +}; + +// A smart pointer that deletes a reference when it goes out of scope. +// +// Note that this class is not thread-safe since it caches JNIEnv in +// the deleter. Do not use the same jobject across different threads. 
+template <typename T, typename Env, typename Deleter> +class ScopedRef { + public: + ScopedRef() : ptr_(nullptr, Deleter(nullptr)) {} + ScopedRef(T value, Env* env) : ptr_(value, Deleter(env)) {} + + T get() const { return ptr_.get(); } + + T release() { return ptr_.release(); } + + bool operator!() const { return !ptr_; } + + bool operator==(void* value) const { return ptr_.get() == value; } + + explicit operator bool() const { return ptr_ != nullptr; } + + void reset(T value, Env* env) { + ptr_.reset(value); + ptr_.get_deleter() = Deleter(env); + } + + private: + std::unique_ptr<typename std::remove_pointer<T>::type, Deleter> ptr_; +}; + +template <typename T, typename U, typename Env, typename Deleter> +inline bool operator==(const ScopedRef<T, Env, Deleter>& x, + const ScopedRef<U, Env, Deleter>& y) { + return x.get() == y.get(); +} + +template <typename T, typename Env, typename Deleter> +inline bool operator==(const ScopedRef<T, Env, Deleter>& x, std::nullptr_t) { + return x.get() == nullptr; +} + +template <typename T, typename Env, typename Deleter> +inline bool operator==(std::nullptr_t, const ScopedRef<T, Env, Deleter>& x) { + return nullptr == x.get(); +} + +template <typename T, typename U, typename Env, typename Deleter> +inline bool operator!=(const ScopedRef<T, Env, Deleter>& x, + const ScopedRef<U, Env, Deleter>& y) { + return x.get() != y.get(); +} + +template <typename T, typename Env, typename Deleter> +inline bool operator!=(const ScopedRef<T, Env, Deleter>& x, std::nullptr_t) { + return x.get() != nullptr; +} + +template <typename T, typename Env, typename Deleter> +inline bool operator!=(std::nullptr_t, const ScopedRef<T, Env, Deleter>& x) { + return nullptr != x.get(); +} + +template <typename T, typename U, typename Env, typename Deleter> +inline bool operator<(const ScopedRef<T, Env, Deleter>& x, + const ScopedRef<U, Env, Deleter>& y) { + return x.get() < y.get(); +} + +template <typename T, typename U, typename Env, typename Deleter> 
+inline bool operator>(const ScopedRef<T, Env, Deleter>& x, + const ScopedRef<U, Env, Deleter>& y) { + return x.get() > y.get(); +} + +// A smart pointer that deletes a JNI global reference when it goes out +// of scope. Usage is: +// ScopedGlobalRef<jobject> scoped_global(env->JniFunction(), jvm); +template <typename T> +using ScopedGlobalRef = ScopedRef<T, JavaVM, GlobalRefDeleter>; + +// Ditto, but usage is: +// ScopedLocalRef<jobject> scoped_local(env->JniFunction(), env); +template <typename T> +using ScopedLocalRef = ScopedRef<T, JNIEnv, LocalRefDeleter>; + +// A helper to create global references. +template <typename T> +ScopedGlobalRef<T> MakeGlobalRef(T object, JNIEnv* env, JavaVM* jvm) { + const jobject global_object = env->NewGlobalRef(object); + return ScopedGlobalRef<T>(reinterpret_cast<T>(global_object), jvm); +} + +} // namespace libtextclassifier3 + +#endif // ICING_TEXT_CLASSIFIER_LIB3_UTILS_JAVA_JNI_BASE_H_ diff --git a/icing/text_classifier/lib3/utils/java/jni-helper.cc b/icing/text_classifier/lib3/utils/java/jni-helper.cc new file mode 100644 index 0000000..0d9b0a0 --- /dev/null +++ b/icing/text_classifier/lib3/utils/java/jni-helper.cc @@ -0,0 +1,175 @@ +// Copyright (C) 2019 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "icing/text_classifier/lib3/utils/java/jni-helper.h" + +namespace libtextclassifier3 { + +StatusOr<ScopedLocalRef<jclass>> JniHelper::FindClass(JNIEnv* env, + const char* class_name) { + TC3_ENSURE_LOCAL_CAPACITY_OR_RETURN; + ScopedLocalRef<jclass> result(env->FindClass(class_name), env); + TC3_NO_EXCEPTION_OR_RETURN; + TC3_NOT_NULL_OR_RETURN; + return result; +} + +StatusOr<jmethodID> JniHelper::GetMethodID(JNIEnv* env, jclass clazz, + const char* method_name, + const char* return_type) { + jmethodID result = env->GetMethodID(clazz, method_name, return_type); + TC3_NO_EXCEPTION_OR_RETURN; + TC3_NOT_NULL_OR_RETURN; + return result; +} + +StatusOr<ScopedLocalRef<jobject>> JniHelper::GetStaticObjectField( + JNIEnv* env, jclass class_name, jfieldID field_id) { + TC3_ENSURE_LOCAL_CAPACITY_OR_RETURN; + ScopedLocalRef<jobject> result( + env->GetStaticObjectField(class_name, field_id), env); + TC3_NO_EXCEPTION_OR_RETURN; + TC3_NOT_NULL_OR_RETURN; + return result; +} + +StatusOr<ScopedLocalRef<jbyteArray>> JniHelper::NewByteArray(JNIEnv* env, + jsize length) { + TC3_ENSURE_LOCAL_CAPACITY_OR_RETURN; + ScopedLocalRef<jbyteArray> result(env->NewByteArray(length), env); + TC3_NO_EXCEPTION_OR_RETURN; + TC3_NOT_NULL_OR_RETURN; + return result; +} + +Status JniHelper::CallVoidMethod(JNIEnv* env, jobject object, + jmethodID method_id, ...) { + va_list args; + va_start(args, method_id); + env->CallVoidMethodV(object, method_id, args); + va_end(args); + + TC3_NO_EXCEPTION_OR_RETURN; + return Status::OK; +} + +StatusOr<bool> JniHelper::CallBooleanMethod(JNIEnv* env, jobject object, + jmethodID method_id, ...) { + va_list args; + va_start(args, method_id); + bool result = env->CallBooleanMethodV(object, method_id, args); + va_end(args); + + TC3_NO_EXCEPTION_OR_RETURN; + return result; +} + +StatusOr<int32> JniHelper::CallIntMethod(JNIEnv* env, jobject object, + jmethodID method_id, ...) 
{ + va_list args; + va_start(args, method_id); + jint result = env->CallIntMethodV(object, method_id, args); + va_end(args); + + TC3_NO_EXCEPTION_OR_RETURN; + return result; +} + +StatusOr<int64> JniHelper::CallLongMethod(JNIEnv* env, jobject object, + jmethodID method_id, ...) { + va_list args; + va_start(args, method_id); + jlong result = env->CallLongMethodV(object, method_id, args); + va_end(args); + + TC3_NO_EXCEPTION_OR_RETURN; + return result; +} + +StatusOr<float> JniHelper::CallFloatMethod(JNIEnv* env, jobject object, + jmethodID method_id, ...) { + va_list args; + va_start(args, method_id); + jfloat result = env->CallFloatMethodV(object, method_id, args); + va_end(args); + + TC3_NO_EXCEPTION_OR_RETURN; + return result; +} + +StatusOr<double> JniHelper::CallDoubleMethod(JNIEnv* env, jobject object, + jmethodID method_id, ...) { + va_list args; + va_start(args, method_id); + jdouble result = env->CallDoubleMethodV(object, method_id, args); + va_end(args); + + TC3_NO_EXCEPTION_OR_RETURN; + return result; +} + +StatusOr<ScopedLocalRef<jintArray>> JniHelper::NewIntArray(JNIEnv* env, + jsize length) { + TC3_ENSURE_LOCAL_CAPACITY_OR_RETURN; + ScopedLocalRef<jintArray> result(env->NewIntArray(length), env); + TC3_NO_EXCEPTION_OR_RETURN; + TC3_NOT_NULL_OR_RETURN; + return result; +} + +StatusOr<ScopedLocalRef<jfloatArray>> JniHelper::NewFloatArray(JNIEnv* env, + jsize length) { + TC3_ENSURE_LOCAL_CAPACITY_OR_RETURN; + ScopedLocalRef<jfloatArray> result(env->NewFloatArray(length), env); + TC3_NO_EXCEPTION_OR_RETURN; + TC3_NOT_NULL_OR_RETURN; + return result; +} + +Status JniHelper::SetObjectArrayElement(JNIEnv* env, jobjectArray array, + jsize index, jobject val) { + TC3_ENSURE_LOCAL_CAPACITY_OR_RETURN; + env->SetObjectArrayElement(array, index, val); + TC3_NO_EXCEPTION_OR_RETURN; + return Status::OK; +} + +StatusOr<ScopedLocalRef<jobjectArray>> JniHelper::NewObjectArray( + JNIEnv* env, jsize length, jclass element_class, jobject initial_element) { + 
TC3_ENSURE_LOCAL_CAPACITY_OR_RETURN; + ScopedLocalRef<jobjectArray> result( + env->NewObjectArray(length, element_class, initial_element), env); + TC3_NO_EXCEPTION_OR_RETURN; + TC3_NOT_NULL_OR_RETURN; + return result; +} + +StatusOr<jsize> JniHelper::GetArrayLength(JNIEnv* env, + jarray jinput_fragments) { + TC3_ENSURE_LOCAL_CAPACITY_OR_RETURN; + jsize result = env->GetArrayLength(jinput_fragments); + TC3_NO_EXCEPTION_OR_RETURN; + return result; +} + +StatusOr<ScopedLocalRef<jstring>> JniHelper::NewStringUTF(JNIEnv* env, + const char* bytes) { + TC3_ENSURE_LOCAL_CAPACITY_OR_RETURN; + ScopedLocalRef<jstring> result(env->NewStringUTF(bytes), env); + TC3_NO_EXCEPTION_OR_RETURN; + TC3_NOT_NULL_OR_RETURN; + return result; +} + +} // namespace libtextclassifier3 diff --git a/icing/text_classifier/lib3/utils/java/jni-helper.h b/icing/text_classifier/lib3/utils/java/jni-helper.h new file mode 100644 index 0000000..ea4ba3b --- /dev/null +++ b/icing/text_classifier/lib3/utils/java/jni-helper.h @@ -0,0 +1,156 @@ +// Copyright (C) 2019 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Utility class that provides similar calls like JNIEnv, but performs +// additional checks on them, so that it's harder to use them incorrectly. 
+ +#ifndef ICING_TEXT_CLASSIFIER_LIB3_UTILS_JAVA_JNI_HELPER_H_ +#define ICING_TEXT_CLASSIFIER_LIB3_UTILS_JAVA_JNI_HELPER_H_ + +#include <jni.h> + +#include <string> + +#include "icing/text_classifier/lib3/utils/base/status.h" +#include "icing/text_classifier/lib3/utils/base/statusor.h" +#include "icing/text_classifier/lib3/utils/java/jni-base.h" + +#define TC3_ENSURE_LOCAL_CAPACITY_OR_RETURN \ + if (!EnsureLocalCapacity(env, 1)) { \ + TC3_LOG(ERROR) << "EnsureLocalCapacity(1) failed."; \ + return {Status::UNKNOWN}; \ + } + +#define TC3_NO_EXCEPTION_OR_RETURN \ + if (JniExceptionCheckAndClear(env)) { \ + return {Status::UNKNOWN}; \ + } + +#define TC3_NOT_NULL_OR_RETURN \ + if (result == nullptr) { \ + return {Status::UNKNOWN}; \ + } + +#define TC3_DEFINE_VARIADIC_SCOPED_LOCAL_REF_ENV_METHOD( \ + METHOD_NAME, RETURN_TYPE, INPUT_TYPE, POST_CHECK) \ + template <typename T = RETURN_TYPE> \ + static StatusOr<ScopedLocalRef<T>> METHOD_NAME( \ + JNIEnv* env, INPUT_TYPE object, jmethodID method_id, ...) { \ + TC3_ENSURE_LOCAL_CAPACITY_OR_RETURN; \ + \ + va_list args; \ + va_start(args, method_id); \ + ScopedLocalRef<T> result( \ + reinterpret_cast<T>(env->METHOD_NAME##V(object, method_id, args)), \ + env); \ + POST_CHECK \ + va_end(args); \ + \ + TC3_NO_EXCEPTION_OR_RETURN; \ + return result; \ + } + +#define TC3_JNI_NO_CHECK \ + {} + +namespace libtextclassifier3 { + +class JniHelper { + public: + // Misc methods. + static StatusOr<ScopedLocalRef<jclass>> FindClass(JNIEnv* env, + const char* class_name); + + template <typename T = jobject> + static StatusOr<ScopedLocalRef<T>> GetObjectArrayElement(JNIEnv* env, + jobjectArray array, + jsize index); + static StatusOr<jmethodID> GetMethodID(JNIEnv* env, jclass clazz, + const char* method_name, + const char* return_type); + + static StatusOr<ScopedLocalRef<jobject>> GetStaticObjectField( + JNIEnv* env, jclass class_name, jfieldID field_id); + + // New* methods. 
+ TC3_DEFINE_VARIADIC_SCOPED_LOCAL_REF_ENV_METHOD(NewObject, jobject, jclass, + TC3_NOT_NULL_OR_RETURN); + static StatusOr<ScopedLocalRef<jobjectArray>> NewObjectArray( + JNIEnv* env, jsize length, jclass element_class, + jobject initial_element = nullptr); + static StatusOr<ScopedLocalRef<jbyteArray>> NewByteArray(JNIEnv* env, + jsize length); + static StatusOr<ScopedLocalRef<jintArray>> NewIntArray(JNIEnv* env, + jsize length); + static StatusOr<ScopedLocalRef<jstring>> NewStringUTF(JNIEnv* env, + const char* bytes); + static StatusOr<ScopedLocalRef<jfloatArray>> NewFloatArray(JNIEnv* env, + jsize length); + + static StatusOr<jsize> GetArrayLength(JNIEnv* env, jarray jinput_fragments); + + static Status SetObjectArrayElement(JNIEnv* env, jobjectArray array, + jsize index, jobject val); + + // Call* methods. + TC3_DEFINE_VARIADIC_SCOPED_LOCAL_REF_ENV_METHOD(CallObjectMethod, jobject, + jobject, TC3_JNI_NO_CHECK); + TC3_DEFINE_VARIADIC_SCOPED_LOCAL_REF_ENV_METHOD(CallStaticObjectMethod, + jobject, jclass, + TC3_JNI_NO_CHECK); + static Status CallVoidMethod(JNIEnv* env, jobject object, jmethodID method_id, + ...); + static StatusOr<bool> CallBooleanMethod(JNIEnv* env, jobject object, + jmethodID method_id, ...); + static StatusOr<int32> CallIntMethod(JNIEnv* env, jobject object, + jmethodID method_id, ...); + static StatusOr<int64> CallLongMethod(JNIEnv* env, jobject object, + jmethodID method_id, ...); + static StatusOr<float> CallFloatMethod(JNIEnv* env, jobject object, + jmethodID method_id, ...); + static StatusOr<double> CallDoubleMethod(JNIEnv* env, jobject object, + jmethodID method_id, ...); + + template <class T> + static StatusOr<T> CallStaticIntMethod(JNIEnv* env, jclass clazz, + jmethodID method_id, ...); +}; + +template <typename T> +StatusOr<ScopedLocalRef<T>> JniHelper::GetObjectArrayElement(JNIEnv* env, + jobjectArray array, + jsize index) { + TC3_ENSURE_LOCAL_CAPACITY_OR_RETURN; + ScopedLocalRef<T> result( + 
reinterpret_cast<T>(env->GetObjectArrayElement(array, index)), env); + + TC3_NO_EXCEPTION_OR_RETURN; + return result; +} + +template <class T> +StatusOr<T> JniHelper::CallStaticIntMethod(JNIEnv* env, jclass clazz, + jmethodID method_id, ...) { + va_list args; + va_start(args, method_id); + jint result = env->CallStaticIntMethodV(clazz, method_id, args); + va_end(args); + + TC3_NO_EXCEPTION_OR_RETURN; + return result; +} + +} // namespace libtextclassifier3 + +#endif // ICING_TEXT_CLASSIFIER_LIB3_UTILS_JAVA_JNI_HELPER_H_ diff --git a/icing/text_classifier/lib3/utils/java/string_utils.cc b/icing/text_classifier/lib3/utils/java/string_utils.cc new file mode 100644 index 0000000..2384ba4 --- /dev/null +++ b/icing/text_classifier/lib3/utils/java/string_utils.cc @@ -0,0 +1,73 @@ +// Copyright (C) 2019 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "icing/text_classifier/lib3/utils/java/string_utils.h" + +#include "icing/text_classifier/lib3/utils/base/logging.h" + +namespace libtextclassifier3 { + +bool JByteArrayToString(JNIEnv* env, const jbyteArray& array, + std::string* result) { + jbyte* const array_bytes = env->GetByteArrayElements(array, JNI_FALSE); + if (array_bytes == nullptr) { + return false; + } + + const int array_length = env->GetArrayLength(array); + *result = std::string(reinterpret_cast<char*>(array_bytes), array_length); + + env->ReleaseByteArrayElements(array, array_bytes, JNI_ABORT); + + return true; +} + +bool JStringToUtf8String(JNIEnv* env, const jstring& jstr, + std::string* result) { + if (jstr == nullptr) { + *result = std::string(); + return true; + } + + jclass string_class = env->FindClass("java/lang/String"); + if (!string_class) { + TC3_LOG(ERROR) << "Can't find String class"; + return false; + } + + jmethodID get_bytes_id = + env->GetMethodID(string_class, "getBytes", "(Ljava/lang/String;)[B"); + + jstring encoding = env->NewStringUTF("UTF-8"); + + jbyteArray array = reinterpret_cast<jbyteArray>( + env->CallObjectMethod(jstr, get_bytes_id, encoding)); + + JByteArrayToString(env, array, result); + + // Release the array. 
+ env->DeleteLocalRef(array); + env->DeleteLocalRef(string_class); + env->DeleteLocalRef(encoding); + + return true; +} + +ScopedStringChars GetScopedStringChars(JNIEnv* env, jstring string, + jboolean* is_copy) { + return ScopedStringChars(env->GetStringUTFChars(string, is_copy), + StringCharsReleaser(env, string)); +} + +} // namespace libtextclassifier3 diff --git a/icing/text_classifier/lib3/utils/java/string_utils.h b/icing/text_classifier/lib3/utils/java/string_utils.h new file mode 100644 index 0000000..dddef57 --- /dev/null +++ b/icing/text_classifier/lib3/utils/java/string_utils.h @@ -0,0 +1,74 @@ +// Copyright (C) 2019 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef ICING_TEXT_CLASSIFIER_LIB3_UTILS_JAVA_STRING_UTILS_H_ +#define ICING_TEXT_CLASSIFIER_LIB3_UTILS_JAVA_STRING_UTILS_H_ + +#include <jni.h> +#include <memory> +#include <string> + +#include "icing/text_classifier/lib3/utils/base/logging.h" + +namespace libtextclassifier3 { + +bool JByteArrayToString(JNIEnv* env, const jbyteArray& array, + std::string* result); +bool JStringToUtf8String(JNIEnv* env, const jstring& jstr, std::string* result); + +// A deleter to be used with std::unique_ptr to release Java string chars. 
+class StringCharsReleaser { + public: + StringCharsReleaser() : env_(nullptr) {} + + StringCharsReleaser(JNIEnv* env, jstring jstr) : env_(env), jstr_(jstr) {} + + StringCharsReleaser(const StringCharsReleaser& orig) = default; + + // Copy assignment to allow move semantics in StringCharsReleaser. + StringCharsReleaser& operator=(const StringCharsReleaser& rhs) { + // As the releaser and its state are thread-local, it's enough to only + // ensure the envs are consistent but do nothing. + TC3_CHECK_EQ(env_, rhs.env_); + return *this; + } + + // The delete operator. + void operator()(const char* chars) const { + if (env_ != nullptr) { + env_->ReleaseStringUTFChars(jstr_, chars); + } + } + + private: + // The env_ stashed to use for deletion. Thread-local, don't share! + JNIEnv* const env_; + + // The referenced jstring. + jstring jstr_; +}; + +// A smart pointer that releases string chars when it goes out of scope. +// of scope. +// Note that this class is not thread-safe since it caches JNIEnv in +// the deleter. Do not use the same jobject across different threads. +using ScopedStringChars = std::unique_ptr<const char, StringCharsReleaser>; + +// Returns a scoped pointer to the array of Unicode characters of a string. +ScopedStringChars GetScopedStringChars(JNIEnv* env, jstring string, + jboolean* is_copy = nullptr); + +} // namespace libtextclassifier3 + +#endif // ICING_TEXT_CLASSIFIER_LIB3_UTILS_JAVA_STRING_UTILS_H_ diff --git a/icing/tokenization/icu-language-segmenter_test.cc b/icing/tokenization/icu-language-segmenter_test.cc deleted file mode 100644 index fd4755a..0000000 --- a/icing/tokenization/icu-language-segmenter_test.cc +++ /dev/null @@ -1,374 +0,0 @@ -// Copyright (C) 2019 Google LLC -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "gmock/gmock.h" -#include "gtest/gtest.h" -#include "icing/absl_ports/str_cat.h" -#include "icing/icu-data-file-helper.h" -#include "icing/testing/common-matchers.h" -#include "icing/testing/icu-i18n-test-utils.h" -#include "icing/testing/test-data.h" -#include "icing/tokenization/language-segmenter-factory.h" -#include "icing/tokenization/language-segmenter.h" -#include "unicode/uloc.h" - -namespace icing { -namespace lib { -namespace { -using ::testing::ElementsAre; -using ::testing::Eq; -using ::testing::IsEmpty; - -class IcuLanguageSegmenterAllLocalesTest - : public testing::TestWithParam<const char*> { - protected: - void SetUp() override { - ICING_ASSERT_OK( - // File generated via icu_data_file rule in //icing/BUILD. 
- icu_data_file_helper::SetUpICUDataFile( - GetTestFilePath("icing/icu.dat"))); - } - - static std::string GetLocale() { return GetParam(); } -}; - -TEST_P(IcuLanguageSegmenterAllLocalesTest, EmptyText) { - ICING_ASSERT_OK_AND_ASSIGN( - auto language_segmenter, - language_segmenter_factory::Create(language_segmenter_factory::ICU4C, - GetLocale())); - EXPECT_THAT(language_segmenter->GetAllTerms(""), IsOkAndHolds(IsEmpty())); -} - -TEST_P(IcuLanguageSegmenterAllLocalesTest, SimpleText) { - ICING_ASSERT_OK_AND_ASSIGN( - auto language_segmenter, - language_segmenter_factory::Create(language_segmenter_factory::ICU4C, - GetLocale())); - EXPECT_THAT(language_segmenter->GetAllTerms("Hello World"), - IsOkAndHolds(ElementsAre("Hello", " ", "World"))); -} - -TEST_P(IcuLanguageSegmenterAllLocalesTest, ASCII_Punctuation) { - ICING_ASSERT_OK_AND_ASSIGN( - auto language_segmenter, - language_segmenter_factory::Create(language_segmenter_factory::ICU4C, - GetLocale())); - // ASCII punctuation marks are kept - EXPECT_THAT( - language_segmenter->GetAllTerms("Hello, World!!!"), - IsOkAndHolds(ElementsAre("Hello", ",", " ", "World", "!", "!", "!"))); - EXPECT_THAT(language_segmenter->GetAllTerms("Open-source project"), - IsOkAndHolds(ElementsAre("Open", "-", "source", " ", "project"))); - EXPECT_THAT(language_segmenter->GetAllTerms("100%"), - IsOkAndHolds(ElementsAre("100", "%"))); - EXPECT_THAT(language_segmenter->GetAllTerms("A&B"), - IsOkAndHolds(ElementsAre("A", "&", "B"))); -} - -TEST_P(IcuLanguageSegmenterAllLocalesTest, ASCII_SpecialCharacter) { - ICING_ASSERT_OK_AND_ASSIGN( - auto language_segmenter, - language_segmenter_factory::Create(language_segmenter_factory::ICU4C, - GetLocale())); - // ASCII special characters are kept - EXPECT_THAT(language_segmenter->GetAllTerms("Pay $1000"), - IsOkAndHolds(ElementsAre("Pay", " ", "$", "1000"))); - EXPECT_THAT(language_segmenter->GetAllTerms("A+B"), - IsOkAndHolds(ElementsAre("A", "+", "B"))); - // 0x0009 is the unicode for tab (within 
ASCII range). - std::string text_with_tab = absl_ports::StrCat( - "Hello", UCharToString(0x0009), UCharToString(0x0009), "World"); - EXPECT_THAT(language_segmenter->GetAllTerms(text_with_tab), - IsOkAndHolds(ElementsAre("Hello", UCharToString(0x0009), - UCharToString(0x0009), "World"))); -} - -TEST_P(IcuLanguageSegmenterAllLocalesTest, Non_ASCII_Non_Alphabetic) { - ICING_ASSERT_OK_AND_ASSIGN( - auto language_segmenter, - language_segmenter_factory::Create(language_segmenter_factory::ICU4C, - GetLocale())); - // Full-width (non-ASCII) punctuation marks and special characters are left - // out. - EXPECT_THAT(language_segmenter->GetAllTerms("。?·Hello!×"), - IsOkAndHolds(ElementsAre("Hello"))); -} - -TEST_P(IcuLanguageSegmenterAllLocalesTest, Acronym) { - ICING_ASSERT_OK_AND_ASSIGN( - auto language_segmenter, - language_segmenter_factory::Create(language_segmenter_factory::ICU4C, - GetLocale())); - EXPECT_THAT(language_segmenter->GetAllTerms("U.S. Bank"), - IsOkAndHolds(ElementsAre("U.S", ".", " ", "Bank"))); - EXPECT_THAT(language_segmenter->GetAllTerms("I.B.M."), - IsOkAndHolds(ElementsAre("I.B.M", "."))); - EXPECT_THAT(language_segmenter->GetAllTerms("I,B,M"), - IsOkAndHolds(ElementsAre("I", ",", "B", ",", "M"))); - EXPECT_THAT(language_segmenter->GetAllTerms("I B M"), - IsOkAndHolds(ElementsAre("I", " ", "B", " ", "M"))); -} - -TEST_P(IcuLanguageSegmenterAllLocalesTest, WordConnector) { - ICING_ASSERT_OK_AND_ASSIGN( - auto language_segmenter, - language_segmenter_factory::Create(language_segmenter_factory::ICU4C, - GetLocale())); - // According to unicode word break rules - // WB6(https://unicode.org/reports/tr29/#WB6), - // WB7(https://unicode.org/reports/tr29/#WB7), and a few others, some - // punctuation characters are used as word connecters. That is, words don't - // break before and after them. Here we just test some that we care about. 
- - // Word connecters - EXPECT_THAT(language_segmenter->GetAllTerms("com.google.android"), - IsOkAndHolds(ElementsAre("com.google.android"))); - EXPECT_THAT(language_segmenter->GetAllTerms("com:google:android"), - IsOkAndHolds(ElementsAre("com:google:android"))); - EXPECT_THAT(language_segmenter->GetAllTerms("com'google'android"), - IsOkAndHolds(ElementsAre("com'google'android"))); - EXPECT_THAT(language_segmenter->GetAllTerms("com_google_android"), - IsOkAndHolds(ElementsAre("com_google_android"))); - - // Word connecters can be mixed - EXPECT_THAT(language_segmenter->GetAllTerms("com.google.android:icing"), - IsOkAndHolds(ElementsAre("com.google.android:icing"))); - - // Any heading and trailing characters are not connecters - EXPECT_THAT(language_segmenter->GetAllTerms(".com.google.android."), - IsOkAndHolds(ElementsAre(".", "com.google.android", "."))); - - // Not word connecters - EXPECT_THAT(language_segmenter->GetAllTerms("com,google,android"), - IsOkAndHolds(ElementsAre("com", ",", "google", ",", "android"))); - EXPECT_THAT(language_segmenter->GetAllTerms("com-google-android"), - IsOkAndHolds(ElementsAre("com", "-", "google", "-", "android"))); - EXPECT_THAT(language_segmenter->GetAllTerms("com+google+android"), - IsOkAndHolds(ElementsAre("com", "+", "google", "+", "android"))); - EXPECT_THAT(language_segmenter->GetAllTerms("com*google*android"), - IsOkAndHolds(ElementsAre("com", "*", "google", "*", "android"))); - EXPECT_THAT(language_segmenter->GetAllTerms("com@google@android"), - IsOkAndHolds(ElementsAre("com", "@", "google", "@", "android"))); - EXPECT_THAT(language_segmenter->GetAllTerms("com^google^android"), - IsOkAndHolds(ElementsAre("com", "^", "google", "^", "android"))); - EXPECT_THAT(language_segmenter->GetAllTerms("com&google&android"), - IsOkAndHolds(ElementsAre("com", "&", "google", "&", "android"))); - EXPECT_THAT(language_segmenter->GetAllTerms("com|google|android"), - IsOkAndHolds(ElementsAre("com", "|", "google", "|", "android"))); - 
EXPECT_THAT(language_segmenter->GetAllTerms("com/google/android"), - IsOkAndHolds(ElementsAre("com", "/", "google", "/", "android"))); - EXPECT_THAT(language_segmenter->GetAllTerms("com;google;android"), - IsOkAndHolds(ElementsAre("com", ";", "google", ";", "android"))); - EXPECT_THAT( - language_segmenter->GetAllTerms("com\"google\"android"), - IsOkAndHolds(ElementsAre("com", "\"", "google", "\"", "android"))); -} - -TEST_P(IcuLanguageSegmenterAllLocalesTest, Apostrophes) { - ICING_ASSERT_OK_AND_ASSIGN( - auto language_segmenter, - language_segmenter_factory::Create(language_segmenter_factory::ICU4C, - GetLocale())); - EXPECT_THAT(language_segmenter->GetAllTerms("It's ok."), - IsOkAndHolds(ElementsAre("It's", " ", "ok", "."))); - EXPECT_THAT(language_segmenter->GetAllTerms("He'll be back."), - IsOkAndHolds(ElementsAre("He'll", " ", "be", " ", "back", "."))); - EXPECT_THAT(language_segmenter->GetAllTerms("'Hello 'World."), - IsOkAndHolds(ElementsAre("'", "Hello", " ", "'", "World", "."))); - EXPECT_THAT(language_segmenter->GetAllTerms("The dogs' bone"), - IsOkAndHolds(ElementsAre("The", " ", "dogs", "'", " ", "bone"))); - // 0x2019 is the single right quote, should be treated the same as "'" - std::string token_with_quote = - absl_ports::StrCat("He", UCharToString(0x2019), "ll"); - std::string text_with_quote = - absl_ports::StrCat(token_with_quote, " be back."); - EXPECT_THAT( - language_segmenter->GetAllTerms(text_with_quote), - IsOkAndHolds(ElementsAre(token_with_quote, " ", "be", " ", "back", "."))); -} - -TEST_P(IcuLanguageSegmenterAllLocalesTest, Parentheses) { - ICING_ASSERT_OK_AND_ASSIGN( - auto language_segmenter, - language_segmenter_factory::Create(language_segmenter_factory::ICU4C, - GetLocale())); - - EXPECT_THAT(language_segmenter->GetAllTerms("(Hello)"), - IsOkAndHolds(ElementsAre("(", "Hello", ")"))); - - EXPECT_THAT(language_segmenter->GetAllTerms(")Hello("), - IsOkAndHolds(ElementsAre(")", "Hello", "("))); -} - 
-TEST_P(IcuLanguageSegmenterAllLocalesTest, Quotes) { - ICING_ASSERT_OK_AND_ASSIGN( - auto language_segmenter, - language_segmenter_factory::Create(language_segmenter_factory::ICU4C, - GetLocale())); - - EXPECT_THAT(language_segmenter->GetAllTerms("\"Hello\""), - IsOkAndHolds(ElementsAre("\"", "Hello", "\""))); - - EXPECT_THAT(language_segmenter->GetAllTerms("'Hello'"), - IsOkAndHolds(ElementsAre("'", "Hello", "'"))); -} - -TEST_P(IcuLanguageSegmenterAllLocalesTest, Alphanumeric) { - ICING_ASSERT_OK_AND_ASSIGN( - auto language_segmenter, - language_segmenter_factory::Create(language_segmenter_factory::ICU4C, - GetLocale())); - - // Alphanumeric terms are allowed - EXPECT_THAT(language_segmenter->GetAllTerms("Se7en A4 3a"), - IsOkAndHolds(ElementsAre("Se7en", " ", "A4", " ", "3a"))); -} - -TEST_P(IcuLanguageSegmenterAllLocalesTest, Number) { - ICING_ASSERT_OK_AND_ASSIGN( - auto language_segmenter, - language_segmenter_factory::Create(language_segmenter_factory::ICU4C, - GetLocale())); - - // Alphanumeric terms are allowed - EXPECT_THAT( - language_segmenter->GetAllTerms("3.141592653589793238462643383279"), - IsOkAndHolds(ElementsAre("3.141592653589793238462643383279"))); - - EXPECT_THAT(language_segmenter->GetAllTerms("3,456.789"), - IsOkAndHolds(ElementsAre("3,456.789"))); - - EXPECT_THAT(language_segmenter->GetAllTerms("-123"), - IsOkAndHolds(ElementsAre("-", "123"))); -} - -TEST_P(IcuLanguageSegmenterAllLocalesTest, ContinuousWhitespaces) { - ICING_ASSERT_OK_AND_ASSIGN( - auto language_segmenter, - language_segmenter_factory::Create(language_segmenter_factory::ICU4C, - GetLocale())); - // Multiple continuous whitespaces are treated as one. 
- const int kNumSeparators = 256; - const std::string text_with_spaces = - absl_ports::StrCat("Hello", std::string(kNumSeparators, ' '), "World"); - EXPECT_THAT(language_segmenter->GetAllTerms(text_with_spaces), - IsOkAndHolds(ElementsAre("Hello", " ", "World"))); -} - -TEST_P(IcuLanguageSegmenterAllLocalesTest, CJKT) { - ICING_ASSERT_OK_AND_ASSIGN( - auto language_segmenter, - language_segmenter_factory::Create(language_segmenter_factory::ICU4C, - GetLocale())); - // CJKT (Chinese, Japanese, Khmer, Thai) are the 4 main languages that don't - // have whitespaces as word delimiter. - - // Chinese - EXPECT_THAT(language_segmenter->GetAllTerms("我每天走路去上班。"), - IsOkAndHolds(ElementsAre("我", "每天", "走路", "去", "上班"))); - // Japanese - EXPECT_THAT(language_segmenter->GetAllTerms("私は毎日仕事に歩いています。"), - IsOkAndHolds(ElementsAre("私", "は", "毎日", "仕事", "に", "歩", - "い", "てい", "ます"))); - // Khmer - EXPECT_THAT(language_segmenter->GetAllTerms("ញុំដើរទៅធ្វើការរាល់ថ្ងៃ។"), - IsOkAndHolds(ElementsAre("ញុំ", "ដើរទៅ", "ធ្វើការ", "រាល់ថ្ងៃ"))); - // Thai - EXPECT_THAT( - language_segmenter->GetAllTerms("ฉันเดินไปทำงานทุกวัน"), - IsOkAndHolds(ElementsAre("ฉัน", "เดิน", "ไป", "ทำงาน", "ทุก", "วัน"))); -} - -TEST_P(IcuLanguageSegmenterAllLocalesTest, LatinLettersWithAccents) { - ICING_ASSERT_OK_AND_ASSIGN( - auto language_segmenter, - language_segmenter_factory::Create(language_segmenter_factory::ICU4C, - GetLocale())); - EXPECT_THAT(language_segmenter->GetAllTerms("āăąḃḅḇčćç"), - IsOkAndHolds(ElementsAre("āăąḃḅḇčćç"))); -} - -// TODO(samzheng): test cases for more languages (e.g. 
top 20 in the world) -TEST_P(IcuLanguageSegmenterAllLocalesTest, WhitespaceSplitLanguages) { - ICING_ASSERT_OK_AND_ASSIGN( - auto language_segmenter, - language_segmenter_factory::Create(language_segmenter_factory::ICU4C, - GetLocale())); - // Turkish - EXPECT_THAT(language_segmenter->GetAllTerms("merhaba dünya"), - IsOkAndHolds(ElementsAre("merhaba", " ", "dünya"))); - // Korean - EXPECT_THAT( - language_segmenter->GetAllTerms("나는 매일 출근합니다."), - IsOkAndHolds(ElementsAre("나는", " ", "매일", " ", "출근합니다", "."))); -} - -// TODO(samzheng): more mixed languages test cases -TEST_P(IcuLanguageSegmenterAllLocalesTest, MixedLanguages) { - ICING_ASSERT_OK_AND_ASSIGN( - auto language_segmenter, - language_segmenter_factory::Create(language_segmenter_factory::ICU4C, - GetLocale())); - EXPECT_THAT(language_segmenter->GetAllTerms("How are you你好吗お元気ですか"), - IsOkAndHolds(ElementsAre("How", " ", "are", " ", "you", "你好", - "吗", "お", "元気", "です", "か"))); - - EXPECT_THAT( - language_segmenter->GetAllTerms("나는 California에 산다"), - IsOkAndHolds(ElementsAre("나는", " ", "California", "에", " ", "산다"))); -} - -TEST_P(IcuLanguageSegmenterAllLocalesTest, NotCopyStrings) { - ICING_ASSERT_OK_AND_ASSIGN( - auto language_segmenter, - language_segmenter_factory::Create(language_segmenter_factory::ICU4C, - GetLocale())); - // Validates that the input strings are not copied - const std::string text = "Hello World"; - const char* word1_address = text.c_str(); - const char* word2_address = text.c_str() + 6; - ICING_ASSERT_OK_AND_ASSIGN(std::vector<std::string_view> terms, - language_segmenter->GetAllTerms(text)); - ASSERT_THAT(terms, ElementsAre("Hello", " ", "World")); - const char* word1_result_address = terms.at(0).data(); - const char* word2_result_address = terms.at(2).data(); - - // The underlying char* should be the same - EXPECT_THAT(word1_address, Eq(word1_result_address)); - EXPECT_THAT(word2_address, Eq(word2_result_address)); -} - -INSTANTIATE_TEST_SUITE_P( - LocaleName, 
IcuLanguageSegmenterAllLocalesTest, - testing::Values(ULOC_US, ULOC_UK, ULOC_CANADA, ULOC_CANADA_FRENCH, - ULOC_FRANCE, ULOC_GERMANY, ULOC_ITALY, ULOC_JAPAN, - ULOC_KOREA, ULOC_SIMPLIFIED_CHINESE, - ULOC_TRADITIONAL_CHINESE, - "es_ES", // Spanish - "hi_IN", // Hindi - "th_TH", // Thai - "lo_LA", // Lao - "km_KH", // Khmer - "ar_DZ", // Arabic - "ru_RU", // Russian - "pt_PT", // Portuguese - "en_US_POSIX" // American English (Computer) - "wrong_locale" // Will fall back to ICU default locale - "" // Will fall back to ICU default locale - )); - -} // namespace -} // namespace lib -} // namespace icing diff --git a/icing/tokenization/language-segmenter-factory.cc b/icing/tokenization/icu/icu-language-segmenter-factory.cc index 92d06fe..0ef1824 100644 --- a/icing/tokenization/language-segmenter-factory.cc +++ b/icing/tokenization/icu/icu-language-segmenter-factory.cc @@ -12,10 +12,8 @@ // See the License for the specific language governing permissions and // limitations under the License. +#include "icing/tokenization/icu/icu-language-segmenter.h" #include "icing/tokenization/language-segmenter-factory.h" - -#include "icing/tokenization/icu-language-segmenter.h" -#include "icing/tokenization/space-language-segmenter.h" #include "icing/util/logging.h" namespace icing { @@ -37,23 +35,18 @@ constexpr std::string_view kLocaleAmericanEnglishComputer = "en_US_POSIX"; // users. Right now illegal locale strings will be ignored by ICU. ICU // components will be created with its default locale. libtextclassifier3::StatusOr<std::unique_ptr<LanguageSegmenter>> Create( - SegmenterType type, std::string locale) { + SegmenterOptions options) { // Word connector rules for "en_US_POSIX" (American English (Computer)) are // different from other locales. E.g. "email.subject" will be split into 3 // terms in "en_US_POSIX": "email", ".", and "subject", while it's just one // term in other locales. 
Our current LanguageSegmenter doesn't handle this // special rule, so we replace it with "en_US". - if (locale == kLocaleAmericanEnglishComputer) { + if (options.locale == kLocaleAmericanEnglishComputer) { ICING_LOG(WARNING) << "Locale " << kLocaleAmericanEnglishComputer << " not supported. Converting to locale " << ULOC_US; - locale = ULOC_US; - } - switch (type) { - case ICU4C: - return std::make_unique<IcuLanguageSegmenter>(std::move(locale)); - case SPACE: - return std::make_unique<SpaceLanguageSegmenter>(); + options.locale = ULOC_US; } + return std::make_unique<IcuLanguageSegmenter>(std::move(options.locale)); } } // namespace language_segmenter_factory diff --git a/icing/tokenization/icu-language-segmenter.cc b/icing/tokenization/icu/icu-language-segmenter.cc index 8d6aa76..d43a78d 100644 --- a/icing/tokenization/icu-language-segmenter.cc +++ b/icing/tokenization/icu/icu-language-segmenter.cc @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "icing/tokenization/icu-language-segmenter.h" +#include "icing/tokenization/icu/icu-language-segmenter.h" #include <cstdint> #include <memory> @@ -24,7 +24,8 @@ #include "icing/text_classifier/lib3/utils/base/status.h" #include "icing/text_classifier/lib3/utils/base/statusor.h" #include "icing/absl_ports/canonical_errors.h" -#include "icing/util/icu-i18n-utils.h" +#include "icing/legacy/core/icing-string-util.h" +#include "icing/util/i18n-utils.h" #include "icing/util/status-macros.h" #include "unicode/ubrk.h" #include "unicode/uchar.h" @@ -61,7 +62,7 @@ class IcuLanguageSegmenterIterator : public LanguageSegmenter::Iterator { } // Advances to the next term. Returns false if it has reached the end. 
- bool Advance() { + bool Advance() override { // Prerequisite check if (term_end_index_exclusive_ == UBRK_DONE) { return false; @@ -77,52 +78,66 @@ class IcuLanguageSegmenterIterator : public LanguageSegmenter::Iterator { // Reached the end if (term_end_index_exclusive_ == UBRK_DONE) { + MarkAsDone(); return false; } - // Rule 1: all ASCII terms will be returned. - // We know it's a ASCII term by checking the first char. - if (icu_i18n_utils::IsAscii(text_[term_start_index_])) { - return true; - } - - UChar32 uchar32 = icu_i18n_utils::GetUChar32At(text_.data(), text_.length(), - term_start_index_); - // Rule 2: for non-ASCII terms, only the alphabetic terms are returned. - // We know it's an alphabetic term by checking the first unicode character. - if (u_isUAlphabetic(uchar32)) { - return true; - } else { + if (!IsValidSegment()) { return Advance(); } + return true; } // Returns the current term. It can be called only when Advance() returns // true. - std::string_view GetTerm() const { - if (text_[term_start_index_] == kASCIISpace) { + std::string_view GetTerm() const override { + int term_length = term_end_index_exclusive_ - term_start_index_; + if (term_end_index_exclusive_ == UBRK_DONE) { + term_length = 0; + } else if (text_[term_start_index_] == kASCIISpace) { // Rule 3: multiple continuous whitespaces are treated as one. - return std::string_view(&text_[term_start_index_], 1); + term_length = 1; } - return text_.substr(term_start_index_, - term_end_index_exclusive_ - term_start_index_); + return text_.substr(term_start_index_, term_length); } libtextclassifier3::StatusOr<int32_t> ResetToTermStartingAfter( - int32_t offset) { + int32_t offset) override { + if (offset < 0 || offset >= text_.length()) { + return absl_ports::InvalidArgumentError(IcingStringUtil::StringPrintf( + "Illegal offset provided! 
Offset %d is not within bounds of string " + "of length %zu", + offset, text_.length())); + } term_start_index_ = ubrk_following(break_iterator_, offset); if (term_start_index_ == UBRK_DONE) { - return absl_ports::NotFoundError(""); + MarkAsDone(); + return absl_ports::NotFoundError(IcingStringUtil::StringPrintf( + "No segments begin after provided offset %d.", offset)); } term_end_index_exclusive_ = ubrk_next(break_iterator_); if (term_end_index_exclusive_ == UBRK_DONE) { - return absl_ports::NotFoundError(""); + MarkAsDone(); + return absl_ports::NotFoundError(IcingStringUtil::StringPrintf( + "No segments begin after provided offset %d.", offset)); + } + if (!IsValidSegment()) { + if (!Advance()) { + return absl_ports::NotFoundError(IcingStringUtil::StringPrintf( + "No segments begin after provided offset %d.", offset)); + } } return term_start_index_; } libtextclassifier3::StatusOr<int32_t> ResetToTermEndingBefore( - int32_t offset) { + int32_t offset) override { + if (offset < 0 || offset >= text_.length()) { + return absl_ports::InvalidArgumentError(IcingStringUtil::StringPrintf( + "Illegal offset provided! Offset %d is not within bounds of string " + "of length %zu", + offset, text_.length())); + } ICING_RETURN_IF_ERROR(ResetToTermStartingBefore(offset)); if (term_end_index_exclusive_ > offset) { // This term ends after offset. 
So we need to get the term just before @@ -132,6 +147,15 @@ class IcuLanguageSegmenterIterator : public LanguageSegmenter::Iterator { return term_start_index_; } + libtextclassifier3::StatusOr<int32_t> ResetToStart() override { + term_start_index_ = 0; + term_end_index_exclusive_ = 0; + if (!Advance()) { + return absl_ports::NotFoundError(""); + } + return term_start_index_; + } + private: explicit IcuLanguageSegmenterIterator(std::string_view text, std::string_view locale) @@ -155,15 +179,43 @@ class IcuLanguageSegmenterIterator : public LanguageSegmenter::Iterator { libtextclassifier3::Status ResetToTermStartingBefore(int32_t offset) { term_start_index_ = ubrk_preceding(break_iterator_, offset); if (term_start_index_ == UBRK_DONE) { + MarkAsDone(); return absl_ports::NotFoundError(""); } term_end_index_exclusive_ = ubrk_next(break_iterator_); if (term_end_index_exclusive_ == UBRK_DONE) { + MarkAsDone(); return absl_ports::NotFoundError(""); } return libtextclassifier3::Status::OK; } + // Ensures that all members are consistent with the 'Done' state. + // In the 'Done' state, term_start_index_ will point to the first character + // and term_end_index_exclusive_ will be marked with the kDone value. + // break_iterator_ may be in any state. + void MarkAsDone() { + term_end_index_exclusive_ = UBRK_DONE; + term_start_index_ = 0; + } + + bool IsValidSegment() const { + // Rule 1: all ASCII terms will be returned. + // We know it's a ASCII term by checking the first char. + if (i18n_utils::IsAscii(text_[term_start_index_])) { + return true; + } + + UChar32 uchar32 = i18n_utils::GetUChar32At(text_.data(), text_.length(), + term_start_index_); + // Rule 2: for non-ASCII terms, only the alphabetic terms are returned. + // We know it's an alphabetic term by checking the first unicode character. + if (u_isUAlphabetic(uchar32)) { + return true; + } + return false; + } + // The underlying class that does the segmentation, ubrk_close() must be // called after using. 
UBreakIterator* break_iterator_; diff --git a/icing/tokenization/icu-language-segmenter.h b/icing/tokenization/icu/icu-language-segmenter.h index b3d1acc..4115461 100644 --- a/icing/tokenization/icu-language-segmenter.h +++ b/icing/tokenization/icu/icu-language-segmenter.h @@ -12,8 +12,8 @@ // See the License for the specific language governing permissions and // limitations under the License. -#ifndef ICING_TOKENIZATION_ICU_LANGUAGE_SEGMENTER_H_ -#define ICING_TOKENIZATION_ICU_LANGUAGE_SEGMENTER_H_ +#ifndef ICING_TOKENIZATION_ICU_ICU_LANGUAGE_SEGMENTER_H_ +#define ICING_TOKENIZATION_ICU_ICU_LANGUAGE_SEGMENTER_H_ #include <cstdint> #include <memory> @@ -76,4 +76,4 @@ class IcuLanguageSegmenter : public LanguageSegmenter { } // namespace lib } // namespace icing -#endif // ICING_TOKENIZATION_ICU_LANGUAGE_SEGMENTER_H_ +#endif // ICING_TOKENIZATION_ICU_ICU_LANGUAGE_SEGMENTER_H_ diff --git a/icing/tokenization/icu/icu-language-segmenter_test.cc b/icing/tokenization/icu/icu-language-segmenter_test.cc new file mode 100644 index 0000000..31c2726 --- /dev/null +++ b/icing/tokenization/icu/icu-language-segmenter_test.cc @@ -0,0 +1,1016 @@ +// Copyright (C) 2019 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "gmock/gmock.h" +#include "gtest/gtest.h" +#include "icing/absl_ports/str_cat.h" +#include "icing/helpers/icu/icu-data-file-helper.h" +#include "icing/testing/common-matchers.h" +#include "icing/testing/icu-i18n-test-utils.h" +#include "icing/testing/test-data.h" +#include "icing/tokenization/language-segmenter-factory.h" +#include "icing/tokenization/language-segmenter.h" +#include "unicode/uloc.h" + +namespace icing { +namespace lib { +namespace { +using ::testing::ElementsAre; +using ::testing::Eq; +using ::testing::IsEmpty; + +// Returns a vector containing all terms retrieved by Advancing on the iterator. +std::vector<std::string_view> GetAllTermsAdvance( + LanguageSegmenter::Iterator* itr) { + std::vector<std::string_view> terms; + while (itr->Advance()) { + terms.push_back(itr->GetTerm()); + } + return terms; +} + +// Returns a vector containing all terms retrieved by calling +// ResetToStart/ResetAfter with the current position to simulate Advancing on +// the iterator. +std::vector<std::string_view> GetAllTermsResetAfter( + LanguageSegmenter::Iterator* itr) { + std::vector<std::string_view> terms; + if (!itr->ResetToStart().ok()) { + return terms; + } + terms.push_back(itr->GetTerm()); + const char* text_begin = itr->GetTerm().data(); + // Calling ResetToTermStartingAfter with the current position should get the + // very next term in the sequence. + for (int current_pos = 0; itr->ResetToTermStartingAfter(current_pos).ok(); + current_pos = itr->GetTerm().data() - text_begin) { + terms.push_back(itr->GetTerm()); + } + return terms; +} + +// Returns a vector containing all terms retrieved by alternating calls to +// Advance and calls to ResetAfter with the current position to simulate +// Advancing. 
+std::vector<std::string_view> GetAllTermsAdvanceAndResetAfter( + LanguageSegmenter::Iterator* itr) { + const char* text_begin = itr->GetTerm().data(); + std::vector<std::string_view> terms; + + bool is_ok = true; + int current_pos = 0; + while (is_ok) { + // Alternate between using Advance and ResetToTermAfter. + if (terms.size() % 2 == 0) { + is_ok = itr->Advance(); + } else { + // Calling ResetToTermStartingAfter with the current position should get + // the very next term in the sequence. + current_pos = itr->GetTerm().data() - text_begin; + is_ok = itr->ResetToTermStartingAfter(current_pos).ok(); + } + if (is_ok) { + terms.push_back(itr->GetTerm()); + } + } + return terms; +} + +// Returns a vector containing all terms retrieved by calling ResetBefore with +// the current position, starting at the end of the text. This vector should be +// in reverse order of GetAllTerms and missing the last term. +std::vector<std::string_view> GetAllTermsResetBefore( + LanguageSegmenter::Iterator* itr) { + const char* text_begin = itr->GetTerm().data(); + int last_pos = 0; + while (itr->Advance()) { + last_pos = itr->GetTerm().data() - text_begin; + } + std::vector<std::string_view> terms; + // Calling ResetToTermEndingBefore with the current position should get the + // previous term in the sequence. + for (int current_pos = last_pos; + itr->ResetToTermEndingBefore(current_pos).ok(); + current_pos = itr->GetTerm().data() - text_begin) { + terms.push_back(itr->GetTerm()); + } + return terms; +} + +class IcuLanguageSegmenterAllLocalesTest + : public testing::TestWithParam<const char*> { + protected: + void SetUp() override { + ICING_ASSERT_OK( + // File generated via icu_data_file rule in //icing/BUILD. 
+ icu_data_file_helper::SetUpICUDataFile( + GetTestFilePath("icing/icu.dat"))); + } + + static std::string GetLocale() { return GetParam(); } + static language_segmenter_factory::SegmenterOptions GetOptions() { + return language_segmenter_factory::SegmenterOptions(GetLocale()); + } +}; + +TEST_P(IcuLanguageSegmenterAllLocalesTest, EmptyText) { + ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter, + language_segmenter_factory::Create(GetOptions())); + EXPECT_THAT(language_segmenter->GetAllTerms(""), IsOkAndHolds(IsEmpty())); +} + +TEST_P(IcuLanguageSegmenterAllLocalesTest, SimpleText) { + ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter, + language_segmenter_factory::Create(GetOptions())); + EXPECT_THAT(language_segmenter->GetAllTerms("Hello World"), + IsOkAndHolds(ElementsAre("Hello", " ", "World"))); +} + +TEST_P(IcuLanguageSegmenterAllLocalesTest, ASCII_Punctuation) { + ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter, + language_segmenter_factory::Create(GetOptions())); + // ASCII punctuation marks are kept + EXPECT_THAT( + language_segmenter->GetAllTerms("Hello, World!!!"), + IsOkAndHolds(ElementsAre("Hello", ",", " ", "World", "!", "!", "!"))); + EXPECT_THAT(language_segmenter->GetAllTerms("Open-source project"), + IsOkAndHolds(ElementsAre("Open", "-", "source", " ", "project"))); + EXPECT_THAT(language_segmenter->GetAllTerms("100%"), + IsOkAndHolds(ElementsAre("100", "%"))); + EXPECT_THAT(language_segmenter->GetAllTerms("A&B"), + IsOkAndHolds(ElementsAre("A", "&", "B"))); +} + +TEST_P(IcuLanguageSegmenterAllLocalesTest, ASCII_SpecialCharacter) { + ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter, + language_segmenter_factory::Create(GetOptions())); + // ASCII special characters are kept + EXPECT_THAT(language_segmenter->GetAllTerms("Pay $1000"), + IsOkAndHolds(ElementsAre("Pay", " ", "$", "1000"))); + EXPECT_THAT(language_segmenter->GetAllTerms("A+B"), + IsOkAndHolds(ElementsAre("A", "+", "B"))); + // 0x0009 is the unicode for tab (within ASCII range). 
+ std::string text_with_tab = absl_ports::StrCat( + "Hello", UCharToString(0x0009), UCharToString(0x0009), "World"); + EXPECT_THAT(language_segmenter->GetAllTerms(text_with_tab), + IsOkAndHolds(ElementsAre("Hello", UCharToString(0x0009), + UCharToString(0x0009), "World"))); +} + +TEST_P(IcuLanguageSegmenterAllLocalesTest, Non_ASCII_Non_Alphabetic) { + ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter, + language_segmenter_factory::Create(GetOptions())); + // Full-width (non-ASCII) punctuation marks and special characters are left + // out. + EXPECT_THAT(language_segmenter->GetAllTerms("。?·Hello!×"), + IsOkAndHolds(ElementsAre("Hello"))); +} + +TEST_P(IcuLanguageSegmenterAllLocalesTest, Acronym) { + ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter, + language_segmenter_factory::Create(GetOptions())); + EXPECT_THAT(language_segmenter->GetAllTerms("U.S. Bank"), + IsOkAndHolds(ElementsAre("U.S", ".", " ", "Bank"))); + EXPECT_THAT(language_segmenter->GetAllTerms("I.B.M."), + IsOkAndHolds(ElementsAre("I.B.M", "."))); + EXPECT_THAT(language_segmenter->GetAllTerms("I,B,M"), + IsOkAndHolds(ElementsAre("I", ",", "B", ",", "M"))); + EXPECT_THAT(language_segmenter->GetAllTerms("I B M"), + IsOkAndHolds(ElementsAre("I", " ", "B", " ", "M"))); +} + +TEST_P(IcuLanguageSegmenterAllLocalesTest, WordConnector) { + ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter, + language_segmenter_factory::Create(GetOptions())); + // According to unicode word break rules + // WB6(https://unicode.org/reports/tr29/#WB6), + // WB7(https://unicode.org/reports/tr29/#WB7), and a few others, some + // punctuation characters are used as word connecters. That is, words don't + // break before and after them. Here we just test some that we care about. 
+ + // Word connecters + EXPECT_THAT(language_segmenter->GetAllTerms("com.google.android"), + IsOkAndHolds(ElementsAre("com.google.android"))); + EXPECT_THAT(language_segmenter->GetAllTerms("com:google:android"), + IsOkAndHolds(ElementsAre("com:google:android"))); + EXPECT_THAT(language_segmenter->GetAllTerms("com'google'android"), + IsOkAndHolds(ElementsAre("com'google'android"))); + EXPECT_THAT(language_segmenter->GetAllTerms("com_google_android"), + IsOkAndHolds(ElementsAre("com_google_android"))); + + // Word connecters can be mixed + EXPECT_THAT(language_segmenter->GetAllTerms("com.google.android:icing"), + IsOkAndHolds(ElementsAre("com.google.android:icing"))); + + // Any heading and trailing characters are not connecters + EXPECT_THAT(language_segmenter->GetAllTerms(".com.google.android."), + IsOkAndHolds(ElementsAre(".", "com.google.android", "."))); + + // Not word connecters + EXPECT_THAT(language_segmenter->GetAllTerms("com,google,android"), + IsOkAndHolds(ElementsAre("com", ",", "google", ",", "android"))); + EXPECT_THAT(language_segmenter->GetAllTerms("com-google-android"), + IsOkAndHolds(ElementsAre("com", "-", "google", "-", "android"))); + EXPECT_THAT(language_segmenter->GetAllTerms("com+google+android"), + IsOkAndHolds(ElementsAre("com", "+", "google", "+", "android"))); + EXPECT_THAT(language_segmenter->GetAllTerms("com*google*android"), + IsOkAndHolds(ElementsAre("com", "*", "google", "*", "android"))); + EXPECT_THAT(language_segmenter->GetAllTerms("com@google@android"), + IsOkAndHolds(ElementsAre("com", "@", "google", "@", "android"))); + EXPECT_THAT(language_segmenter->GetAllTerms("com^google^android"), + IsOkAndHolds(ElementsAre("com", "^", "google", "^", "android"))); + EXPECT_THAT(language_segmenter->GetAllTerms("com&google&android"), + IsOkAndHolds(ElementsAre("com", "&", "google", "&", "android"))); + EXPECT_THAT(language_segmenter->GetAllTerms("com|google|android"), + IsOkAndHolds(ElementsAre("com", "|", "google", "|", "android"))); + 
EXPECT_THAT(language_segmenter->GetAllTerms("com/google/android"), + IsOkAndHolds(ElementsAre("com", "/", "google", "/", "android"))); + EXPECT_THAT(language_segmenter->GetAllTerms("com;google;android"), + IsOkAndHolds(ElementsAre("com", ";", "google", ";", "android"))); + EXPECT_THAT( + language_segmenter->GetAllTerms("com\"google\"android"), + IsOkAndHolds(ElementsAre("com", "\"", "google", "\"", "android"))); +} + +TEST_P(IcuLanguageSegmenterAllLocalesTest, Apostrophes) { + ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter, + language_segmenter_factory::Create(GetOptions())); + EXPECT_THAT(language_segmenter->GetAllTerms("It's ok."), + IsOkAndHolds(ElementsAre("It's", " ", "ok", "."))); + EXPECT_THAT(language_segmenter->GetAllTerms("He'll be back."), + IsOkAndHolds(ElementsAre("He'll", " ", "be", " ", "back", "."))); + EXPECT_THAT(language_segmenter->GetAllTerms("'Hello 'World."), + IsOkAndHolds(ElementsAre("'", "Hello", " ", "'", "World", "."))); + EXPECT_THAT(language_segmenter->GetAllTerms("The dogs' bone"), + IsOkAndHolds(ElementsAre("The", " ", "dogs", "'", " ", "bone"))); + // 0x2019 is the single right quote, should be treated the same as "'" + std::string token_with_quote = + absl_ports::StrCat("He", UCharToString(0x2019), "ll"); + std::string text_with_quote = + absl_ports::StrCat(token_with_quote, " be back."); + EXPECT_THAT( + language_segmenter->GetAllTerms(text_with_quote), + IsOkAndHolds(ElementsAre(token_with_quote, " ", "be", " ", "back", "."))); +} + +TEST_P(IcuLanguageSegmenterAllLocalesTest, Parentheses) { + ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter, + language_segmenter_factory::Create(GetOptions())); + + EXPECT_THAT(language_segmenter->GetAllTerms("(Hello)"), + IsOkAndHolds(ElementsAre("(", "Hello", ")"))); + + EXPECT_THAT(language_segmenter->GetAllTerms(")Hello("), + IsOkAndHolds(ElementsAre(")", "Hello", "("))); +} + +TEST_P(IcuLanguageSegmenterAllLocalesTest, Quotes) { + ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter, + 
language_segmenter_factory::Create(GetOptions())); + + EXPECT_THAT(language_segmenter->GetAllTerms("\"Hello\""), + IsOkAndHolds(ElementsAre("\"", "Hello", "\""))); + + EXPECT_THAT(language_segmenter->GetAllTerms("'Hello'"), + IsOkAndHolds(ElementsAre("'", "Hello", "'"))); +} + +TEST_P(IcuLanguageSegmenterAllLocalesTest, Alphanumeric) { + ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter, + language_segmenter_factory::Create(GetOptions())); + + // Alphanumeric terms are allowed + EXPECT_THAT(language_segmenter->GetAllTerms("Se7en A4 3a"), + IsOkAndHolds(ElementsAre("Se7en", " ", "A4", " ", "3a"))); +} + +TEST_P(IcuLanguageSegmenterAllLocalesTest, Number) { + ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter, + language_segmenter_factory::Create(GetOptions())); + + // Alphanumeric terms are allowed + EXPECT_THAT( + language_segmenter->GetAllTerms("3.141592653589793238462643383279"), + IsOkAndHolds(ElementsAre("3.141592653589793238462643383279"))); + + EXPECT_THAT(language_segmenter->GetAllTerms("3,456.789"), + IsOkAndHolds(ElementsAre("3,456.789"))); + + EXPECT_THAT(language_segmenter->GetAllTerms("-123"), + IsOkAndHolds(ElementsAre("-", "123"))); +} + +TEST_P(IcuLanguageSegmenterAllLocalesTest, ContinuousWhitespaces) { + ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter, + language_segmenter_factory::Create(GetOptions())); + // Multiple continuous whitespaces are treated as one. + const int kNumSeparators = 256; + std::string text_with_spaces = + absl_ports::StrCat("Hello", std::string(kNumSeparators, ' '), "World"); + EXPECT_THAT(language_segmenter->GetAllTerms(text_with_spaces), + IsOkAndHolds(ElementsAre("Hello", " ", "World"))); + + // Multiple continuous whitespaces are treated as one. Whitespace at the + // beginning of the text doesn't affect the results of GetTerm() after the + // iterator is done. 
+ text_with_spaces = absl_ports::StrCat(std::string(kNumSeparators, ' '), + "Hello", " ", "World"); + ICING_ASSERT_OK_AND_ASSIGN(auto itr, + language_segmenter->Segment(text_with_spaces)); + std::vector<std::string_view> terms; + while (itr->Advance()) { + terms.push_back(itr->GetTerm()); + } + EXPECT_THAT(terms, ElementsAre(" ", "Hello", " ", "World")); + EXPECT_THAT(itr->GetTerm(), IsEmpty()); +} + +TEST_P(IcuLanguageSegmenterAllLocalesTest, CJKT) { + ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter, + language_segmenter_factory::Create(GetOptions())); + // CJKT (Chinese, Japanese, Khmer, Thai) are the 4 main languages that don't + // have whitespaces as word delimiter. + + // Chinese + EXPECT_THAT(language_segmenter->GetAllTerms("我每天走路去上班。"), + IsOkAndHolds(ElementsAre("我", "每天", "走路", "去", "上班"))); + // Japanese + EXPECT_THAT(language_segmenter->GetAllTerms("私は毎日仕事に歩いています。"), + IsOkAndHolds(ElementsAre("私", "は", "毎日", "仕事", "に", "歩", + "い", "てい", "ます"))); + // Khmer + EXPECT_THAT(language_segmenter->GetAllTerms("ញុំដើរទៅធ្វើការរាល់ថ្ងៃ។"), + IsOkAndHolds(ElementsAre("ញុំ", "ដើរទៅ", "ធ្វើការ", "រាល់ថ្ងៃ"))); + // Thai + EXPECT_THAT( + language_segmenter->GetAllTerms("ฉันเดินไปทำงานทุกวัน"), + IsOkAndHolds(ElementsAre("ฉัน", "เดิน", "ไป", "ทำงาน", "ทุก", "วัน"))); +} + +TEST_P(IcuLanguageSegmenterAllLocalesTest, LatinLettersWithAccents) { + ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter, + language_segmenter_factory::Create(GetOptions())); + EXPECT_THAT(language_segmenter->GetAllTerms("āăąḃḅḇčćç"), + IsOkAndHolds(ElementsAre("āăąḃḅḇčćç"))); +} + +// TODO(samzheng): test cases for more languages (e.g. 
top 20 in the world) +TEST_P(IcuLanguageSegmenterAllLocalesTest, WhitespaceSplitLanguages) { + ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter, + language_segmenter_factory::Create(GetOptions())); + // Turkish + EXPECT_THAT(language_segmenter->GetAllTerms("merhaba dünya"), + IsOkAndHolds(ElementsAre("merhaba", " ", "dünya"))); + // Korean + EXPECT_THAT( + language_segmenter->GetAllTerms("나는 매일 출근합니다."), + IsOkAndHolds(ElementsAre("나는", " ", "매일", " ", "출근합니다", "."))); +} + +// TODO(samzheng): more mixed languages test cases +TEST_P(IcuLanguageSegmenterAllLocalesTest, MixedLanguages) { + ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter, + language_segmenter_factory::Create(GetOptions())); + EXPECT_THAT(language_segmenter->GetAllTerms("How are you你好吗お元気ですか"), + IsOkAndHolds(ElementsAre("How", " ", "are", " ", "you", "你好", + "吗", "お", "元気", "です", "か"))); + + EXPECT_THAT( + language_segmenter->GetAllTerms("나는 California에 산다"), + IsOkAndHolds(ElementsAre("나는", " ", "California", "에", " ", "산다"))); +} + +TEST_P(IcuLanguageSegmenterAllLocalesTest, NotCopyStrings) { + ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter, + language_segmenter_factory::Create(GetOptions())); + // Validates that the input strings are not copied + const std::string text = "Hello World"; + const char* word1_address = text.c_str(); + const char* word2_address = text.c_str() + 6; + ICING_ASSERT_OK_AND_ASSIGN(std::vector<std::string_view> terms, + language_segmenter->GetAllTerms(text)); + ASSERT_THAT(terms, ElementsAre("Hello", " ", "World")); + const char* word1_result_address = terms.at(0).data(); + const char* word2_result_address = terms.at(2).data(); + + // The underlying char* should be the same + EXPECT_THAT(word1_address, Eq(word1_result_address)); + EXPECT_THAT(word2_address, Eq(word2_result_address)); +} + +TEST_P(IcuLanguageSegmenterAllLocalesTest, ResetToTermAfterOutOfBounds) { + ICING_ASSERT_OK_AND_ASSIGN(auto segmenter, + language_segmenter_factory::Create(GetOptions())); + 
constexpr std::string_view kText = "How are you你好吗お元気ですか"; + ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<LanguageSegmenter::Iterator> itr, + segmenter->Segment(kText)); + + // String: "How are you你好吗お元気ですか" + // ^ ^^ ^^ ^ ^ ^ ^ ^ ^ + // Bytes: 0 3 4 7 8 11 172023 29 35 + ASSERT_THAT(itr->ResetToTermStartingAfter(7), IsOkAndHolds(Eq(8))); + ASSERT_THAT(itr->GetTerm(), Eq("you")); + + EXPECT_THAT(itr->ResetToTermStartingAfter(-1), + StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT)); + EXPECT_THAT(itr->GetTerm(), Eq("you")); + + EXPECT_THAT(itr->ResetToTermStartingAfter(kText.length()), + StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT)); + EXPECT_THAT(itr->GetTerm(), Eq("you")); +} + +// Tests that ResetToTermAfter and Advance produce the same output. With the +// exception of the first term which is inacessible via ResetToTermAfter, +// the stream of terms produced by Advance calls should exacly match the +// terms produced by ResetToTermAfter calls with the current position +// provided as the argument. 
+TEST_P(IcuLanguageSegmenterAllLocalesTest, + MixedLanguagesResetToTermAfterEquivalentToAdvance) { + ICING_ASSERT_OK_AND_ASSIGN(auto segmenter, + language_segmenter_factory::Create(GetOptions())); + constexpr std::string_view kText = "How are𡔖 you你好吗お元気ですか"; + ICING_ASSERT_OK_AND_ASSIGN( + std::unique_ptr<LanguageSegmenter::Iterator> advance_itr, + segmenter->Segment(kText)); + std::vector<std::string_view> advance_terms = + GetAllTermsAdvance(advance_itr.get()); + + ICING_ASSERT_OK_AND_ASSIGN( + std::unique_ptr<LanguageSegmenter::Iterator> reset_to_term_itr, + segmenter->Segment(kText)); + std::vector<std::string_view> reset_terms = + GetAllTermsResetAfter(reset_to_term_itr.get()); + + EXPECT_THAT(reset_terms, testing::ElementsAreArray(advance_terms)); + EXPECT_THAT(reset_to_term_itr->GetTerm(), Eq(advance_itr->GetTerm())); +} + +TEST_P(IcuLanguageSegmenterAllLocalesTest, + ThaiResetToTermAfterEquivalentToAdvance) { + ICING_ASSERT_OK_AND_ASSIGN(auto segmenter, + language_segmenter_factory::Create(GetOptions())); + constexpr std::string_view kThai = "ฉันเดินไปทำงานทุกวัน"; + ICING_ASSERT_OK_AND_ASSIGN( + std::unique_ptr<LanguageSegmenter::Iterator> advance_itr, + segmenter->Segment(kThai)); + std::vector<std::string_view> advance_terms = + GetAllTermsAdvance(advance_itr.get()); + + ICING_ASSERT_OK_AND_ASSIGN( + std::unique_ptr<LanguageSegmenter::Iterator> reset_to_term_itr, + segmenter->Segment(kThai)); + std::vector<std::string_view> reset_terms = + GetAllTermsResetAfter(reset_to_term_itr.get()); + + EXPECT_THAT(reset_terms, testing::ElementsAreArray(advance_terms)); + EXPECT_THAT(reset_to_term_itr->GetTerm(), Eq(advance_itr->GetTerm())); +} + +TEST_P(IcuLanguageSegmenterAllLocalesTest, + KoreanResetToTermAfterEquivalentToAdvance) { + ICING_ASSERT_OK_AND_ASSIGN(auto segmenter, + language_segmenter_factory::Create(GetOptions())); + constexpr std::string_view kKorean = "나는 매일 출근합니다."; + ICING_ASSERT_OK_AND_ASSIGN( + std::unique_ptr<LanguageSegmenter::Iterator> 
advance_itr, + segmenter->Segment(kKorean)); + std::vector<std::string_view> advance_terms = + GetAllTermsAdvance(advance_itr.get()); + + ICING_ASSERT_OK_AND_ASSIGN( + std::unique_ptr<LanguageSegmenter::Iterator> reset_to_term_itr, + segmenter->Segment(kKorean)); + std::vector<std::string_view> reset_terms = + GetAllTermsResetAfter(reset_to_term_itr.get()); + + EXPECT_THAT(reset_terms, testing::ElementsAreArray(advance_terms)); + EXPECT_THAT(reset_to_term_itr->GetTerm(), Eq(advance_itr->GetTerm())); +} + +// Tests that ResetToTermAfter and Advance can be used in conjunction. Just as +// ResetToTermAfter(current_position) can be used to simulate Advance, users +// should be able to mix ResetToTermAfter(current_position) calls and Advance +// calls to mimic calling Advance. +TEST_P(IcuLanguageSegmenterAllLocalesTest, + MixedLanguagesResetToTermAfterInteroperableWithAdvance) { + ICING_ASSERT_OK_AND_ASSIGN(auto segmenter, + language_segmenter_factory::Create(GetOptions())); + constexpr std::string_view kText = "How are𡔖 you你好吗お元気ですか"; + ICING_ASSERT_OK_AND_ASSIGN( + std::unique_ptr<LanguageSegmenter::Iterator> advance_itr, + segmenter->Segment(kText)); + std::vector<std::string_view> advance_terms = + GetAllTermsAdvance(advance_itr.get()); + + ICING_ASSERT_OK_AND_ASSIGN( + std::unique_ptr<LanguageSegmenter::Iterator> advance_and_reset_itr, + segmenter->Segment(kText)); + std::vector<std::string_view> advance_and_reset_terms = + GetAllTermsAdvanceAndResetAfter(advance_and_reset_itr.get()); + + EXPECT_THAT(advance_and_reset_terms, + testing::ElementsAreArray(advance_terms)); + EXPECT_THAT(advance_and_reset_itr->GetTerm(), Eq(advance_itr->GetTerm())); +} + +TEST_P(IcuLanguageSegmenterAllLocalesTest, + ThaiResetToTermAfterInteroperableWithAdvance) { + ICING_ASSERT_OK_AND_ASSIGN(auto segmenter, + language_segmenter_factory::Create(GetOptions())); + constexpr std::string_view kThai = "ฉันเดินไปทำงานทุกวัน"; + ICING_ASSERT_OK_AND_ASSIGN( + 
std::unique_ptr<LanguageSegmenter::Iterator> advance_itr, + segmenter->Segment(kThai)); + std::vector<std::string_view> advance_terms = + GetAllTermsAdvance(advance_itr.get()); + + ICING_ASSERT_OK_AND_ASSIGN( + std::unique_ptr<LanguageSegmenter::Iterator> advance_and_reset_itr, + segmenter->Segment(kThai)); + std::vector<std::string_view> advance_and_reset_terms = + GetAllTermsAdvanceAndResetAfter(advance_and_reset_itr.get()); + + EXPECT_THAT(advance_and_reset_terms, + testing::ElementsAreArray(advance_terms)); + EXPECT_THAT(advance_and_reset_itr->GetTerm(), Eq(advance_itr->GetTerm())); +} + +TEST_P(IcuLanguageSegmenterAllLocalesTest, + KoreanResetToTermAfterInteroperableWithAdvance) { + ICING_ASSERT_OK_AND_ASSIGN(auto segmenter, + language_segmenter_factory::Create(GetOptions())); + constexpr std::string_view kKorean = "나는 매일 출근합니다."; + ICING_ASSERT_OK_AND_ASSIGN( + std::unique_ptr<LanguageSegmenter::Iterator> advance_itr, + segmenter->Segment(kKorean)); + std::vector<std::string_view> advance_terms = + GetAllTermsAdvance(advance_itr.get()); + + ICING_ASSERT_OK_AND_ASSIGN( + std::unique_ptr<LanguageSegmenter::Iterator> advance_and_reset_itr, + segmenter->Segment(kKorean)); + std::vector<std::string_view> advance_and_reset_terms = + GetAllTermsAdvanceAndResetAfter(advance_and_reset_itr.get()); + + EXPECT_THAT(advance_and_reset_terms, + testing::ElementsAreArray(advance_terms)); + EXPECT_THAT(advance_and_reset_itr->GetTerm(), Eq(advance_itr->GetTerm())); +} + +TEST_P(IcuLanguageSegmenterAllLocalesTest, MixedLanguagesResetToTermAfter) { + ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter, + language_segmenter_factory::Create(GetOptions())); + ICING_ASSERT_OK_AND_ASSIGN( + std::unique_ptr<LanguageSegmenter::Iterator> itr, + language_segmenter->Segment("How are you你好吗お元気ですか")); + + // String: "How are you你好吗お元気ですか" + // ^ ^^ ^^ ^ ^ ^ ^ ^ ^ + // Bytes: 0 3 4 7 8 11 172023 29 35 + EXPECT_THAT(itr->ResetToTermStartingAfter(2), IsOkAndHolds(Eq(3))); + 
EXPECT_THAT(itr->GetTerm(), Eq(" ")); + + EXPECT_THAT(itr->ResetToTermStartingAfter(10), IsOkAndHolds(Eq(11))); + EXPECT_THAT(itr->GetTerm(), Eq("你好")); + + EXPECT_THAT(itr->ResetToTermStartingAfter(7), IsOkAndHolds(Eq(8))); + EXPECT_THAT(itr->GetTerm(), Eq("you")); + + EXPECT_THAT(itr->ResetToTermStartingAfter(32), IsOkAndHolds(Eq(35))); + EXPECT_THAT(itr->GetTerm(), Eq("か")); + + EXPECT_THAT(itr->ResetToTermStartingAfter(14), IsOkAndHolds(Eq(17))); + EXPECT_THAT(itr->GetTerm(), Eq("吗")); + + EXPECT_THAT(itr->ResetToTermStartingAfter(0), IsOkAndHolds(Eq(3))); + EXPECT_THAT(itr->GetTerm(), Eq(" ")); + + EXPECT_THAT(itr->ResetToTermStartingAfter(35), + StatusIs(libtextclassifier3::StatusCode::NOT_FOUND)); + EXPECT_THAT(itr->GetTerm(), IsEmpty()); +} + +TEST_P(IcuLanguageSegmenterAllLocalesTest, + ContinuousWhitespacesResetToTermAfter) { + ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter, + language_segmenter_factory::Create(GetOptions())); + // Multiple continuous whitespaces are treated as one. 
+ constexpr std::string_view kTextWithSpace = "Hello World"; + ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<LanguageSegmenter::Iterator> itr, + language_segmenter->Segment(kTextWithSpace)); + + // String: "Hello World" + // ^ ^ ^ + // Bytes: 0 5 15 + EXPECT_THAT(itr->ResetToTermStartingAfter(0), IsOkAndHolds(Eq(5))); + EXPECT_THAT(itr->GetTerm(), Eq(" ")); + + EXPECT_THAT(itr->ResetToTermStartingAfter(2), IsOkAndHolds(Eq(5))); + EXPECT_THAT(itr->GetTerm(), Eq(" ")); + + EXPECT_THAT(itr->ResetToTermStartingAfter(10), IsOkAndHolds(Eq(15))); + EXPECT_THAT(itr->GetTerm(), Eq("World")); + + EXPECT_THAT(itr->ResetToTermStartingAfter(5), IsOkAndHolds(Eq(15))); + EXPECT_THAT(itr->GetTerm(), Eq("World")); + + EXPECT_THAT(itr->ResetToTermStartingAfter(15), + StatusIs(libtextclassifier3::StatusCode::NOT_FOUND)); + EXPECT_THAT(itr->GetTerm(), IsEmpty()); + + EXPECT_THAT(itr->ResetToTermStartingAfter(17), + StatusIs(libtextclassifier3::StatusCode::NOT_FOUND)); + EXPECT_THAT(itr->GetTerm(), IsEmpty()); + + EXPECT_THAT(itr->ResetToTermStartingAfter(19), + StatusIs(libtextclassifier3::StatusCode::NOT_FOUND)); + EXPECT_THAT(itr->GetTerm(), IsEmpty()); +} + +TEST_P(IcuLanguageSegmenterAllLocalesTest, ChineseResetToTermAfter) { + ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter, + language_segmenter_factory::Create(GetOptions())); + // CJKT (Chinese, Japanese, Khmer, Thai) are the 4 main languages that + // don't have whitespaces as word delimiter. 
Chinese + constexpr std::string_view kChinese = "我每天走路去上班。"; + ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<LanguageSegmenter::Iterator> itr, + language_segmenter->Segment(kChinese)); + // String: "我每天走路去上班。" + // ^ ^ ^ ^^ + // Bytes: 0 3 9 15 18 + EXPECT_THAT(itr->ResetToTermStartingAfter(0), IsOkAndHolds(Eq(3))); + EXPECT_THAT(itr->GetTerm(), Eq("每天")); + + EXPECT_THAT(itr->ResetToTermStartingAfter(7), IsOkAndHolds(Eq(9))); + EXPECT_THAT(itr->GetTerm(), Eq("走路")); + + EXPECT_THAT(itr->ResetToTermStartingAfter(19), + StatusIs(libtextclassifier3::StatusCode::NOT_FOUND)); + EXPECT_THAT(itr->GetTerm(), IsEmpty()); +} + +TEST_P(IcuLanguageSegmenterAllLocalesTest, JapaneseResetToTermAfter) { + ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter, + language_segmenter_factory::Create(GetOptions())); + // Japanese + constexpr std::string_view kJapanese = "私は毎日仕事に歩いています。"; + ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<LanguageSegmenter::Iterator> itr, + language_segmenter->Segment(kJapanese)); + // String: "私は毎日仕事に歩いています。" + // ^ ^ ^ ^ ^ ^ ^ ^ ^ + // Bytes: 0 3 6 12 18212427 33 + EXPECT_THAT(itr->ResetToTermStartingAfter(0), IsOkAndHolds(Eq(3))); + EXPECT_THAT(itr->GetTerm(), Eq("は")); + + EXPECT_THAT(itr->ResetToTermStartingAfter(33), + StatusIs(libtextclassifier3::StatusCode::NOT_FOUND)); + EXPECT_THAT(itr->GetTerm(), IsEmpty()); + + EXPECT_THAT(itr->ResetToTermStartingAfter(7), IsOkAndHolds(Eq(12))); + EXPECT_THAT(itr->GetTerm(), Eq("仕事")); +} + +TEST_P(IcuLanguageSegmenterAllLocalesTest, KhmerResetToTermAfter) { + ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter, + language_segmenter_factory::Create(GetOptions())); + constexpr std::string_view kKhmer = "ញុំដើរទៅធ្វើការរាល់ថ្ងៃ។"; + ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<LanguageSegmenter::Iterator> itr, + language_segmenter->Segment(kKhmer)); + // String: "ញុំដើរទៅធ្វើការរាល់ថ្ងៃ។" + // ^ ^ ^ ^ + // Bytes: 0 9 24 45 + EXPECT_THAT(itr->ResetToTermStartingAfter(0), IsOkAndHolds(Eq(9))); + EXPECT_THAT(itr->GetTerm(), 
Eq("ដើរទៅ")); + + EXPECT_THAT(itr->ResetToTermStartingAfter(47), + StatusIs(libtextclassifier3::StatusCode::NOT_FOUND)); + EXPECT_THAT(itr->GetTerm(), IsEmpty()); + + EXPECT_THAT(itr->ResetToTermStartingAfter(14), IsOkAndHolds(Eq(24))); + EXPECT_THAT(itr->GetTerm(), Eq("ធ្វើការ")); +} + +TEST_P(IcuLanguageSegmenterAllLocalesTest, ThaiResetToTermAfter) { + ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter, + language_segmenter_factory::Create(GetOptions())); + // Thai + constexpr std::string_view kThai = "ฉันเดินไปทำงานทุกวัน"; + ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<LanguageSegmenter::Iterator> itr, + language_segmenter->Segment(kThai)); + // String: "ฉันเดินไปทำงานทุกวัน" + // ^ ^ ^ ^ ^ ^ + // Bytes: 0 9 21 27 42 51 + EXPECT_THAT(itr->ResetToTermStartingAfter(0), IsOkAndHolds(Eq(9))); + EXPECT_THAT(itr->GetTerm(), Eq("เดิน")); + + EXPECT_THAT(itr->ResetToTermStartingAfter(51), + StatusIs(libtextclassifier3::StatusCode::NOT_FOUND)); + EXPECT_THAT(itr->GetTerm(), IsEmpty()); + + EXPECT_THAT(itr->ResetToTermStartingAfter(13), IsOkAndHolds(Eq(21))); + EXPECT_THAT(itr->GetTerm(), Eq("ไป")); + + EXPECT_THAT(itr->ResetToTermStartingAfter(34), IsOkAndHolds(Eq(42))); + EXPECT_THAT(itr->GetTerm(), Eq("ทุก")); +} + +TEST_P(IcuLanguageSegmenterAllLocalesTest, ResetToTermBeforeOutOfBounds) { + ICING_ASSERT_OK_AND_ASSIGN(auto segmenter, + language_segmenter_factory::Create(GetOptions())); + constexpr std::string_view kText = "How are you你好吗お元気ですか"; + ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<LanguageSegmenter::Iterator> itr, + segmenter->Segment(kText)); + + // String: "How are you你好吗お元気ですか" + // ^ ^^ ^^ ^ ^ ^ ^ ^ ^ + // Bytes: 0 3 4 7 8 11 172023 29 35 + ASSERT_THAT(itr->ResetToTermEndingBefore(7), IsOkAndHolds(Eq(4))); + ASSERT_THAT(itr->GetTerm(), Eq("are")); + + EXPECT_THAT(itr->ResetToTermEndingBefore(-1), + StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT)); + EXPECT_THAT(itr->GetTerm(), Eq("are")); + + 
EXPECT_THAT(itr->ResetToTermEndingBefore(kText.length()), + StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT)); + EXPECT_THAT(itr->GetTerm(), Eq("are")); +} + +// Tests that ResetToTermBefore and Advance produce the same output. With the +// exception of the last term which is inacessible via ResetToTermBefore, +// the stream of terms produced by Advance calls should exacly match the +// terms produced by ResetToTermBefore calls with the current position +// provided as the argument (after their order has been reversed). +TEST_P(IcuLanguageSegmenterAllLocalesTest, + MixedLanguagesResetToTermBeforeEquivalentToAdvance) { + ICING_ASSERT_OK_AND_ASSIGN(auto segmenter, + language_segmenter_factory::Create(GetOptions())); + constexpr std::string_view kText = "How are𡔖 you你好吗お元気ですか"; + ICING_ASSERT_OK_AND_ASSIGN( + std::unique_ptr<LanguageSegmenter::Iterator> advance_itr, + segmenter->Segment(kText)); + std::vector<std::string_view> advance_terms = + GetAllTermsAdvance(advance_itr.get()); + // Can't produce the last term via calls to ResetToTermBefore. So skip + // past that one. 
+ auto itr = advance_terms.begin(); + std::advance(itr, advance_terms.size() - 1); + advance_terms.erase(itr); + + ICING_ASSERT_OK_AND_ASSIGN( + std::unique_ptr<LanguageSegmenter::Iterator> reset_to_term_itr, + segmenter->Segment(kText)); + std::vector<std::string_view> reset_terms = + GetAllTermsResetBefore(reset_to_term_itr.get()); + std::reverse(reset_terms.begin(), reset_terms.end()); + + EXPECT_THAT(reset_terms, testing::ElementsAreArray(advance_terms)); + EXPECT_THAT(reset_to_term_itr->GetTerm(), IsEmpty()); + EXPECT_THAT(advance_itr->GetTerm(), IsEmpty()); +} + +TEST_P(IcuLanguageSegmenterAllLocalesTest, + ThaiResetToTermBeforeEquivalentToAdvance) { + ICING_ASSERT_OK_AND_ASSIGN(auto segmenter, + language_segmenter_factory::Create(GetOptions())); + constexpr std::string_view kThai = "ฉันเดินไปทำงานทุกวัน"; + ICING_ASSERT_OK_AND_ASSIGN( + std::unique_ptr<LanguageSegmenter::Iterator> advance_itr, + segmenter->Segment(kThai)); + std::vector<std::string_view> advance_terms = + GetAllTermsAdvance(advance_itr.get()); + // Can't produce the last term via calls to ResetToTermBefore. So skip + // past that one. 
+ auto itr = advance_terms.begin(); + std::advance(itr, advance_terms.size() - 1); + advance_terms.erase(itr); + + ICING_ASSERT_OK_AND_ASSIGN( + std::unique_ptr<LanguageSegmenter::Iterator> reset_to_term_itr, + segmenter->Segment(kThai)); + std::vector<std::string_view> reset_terms = + GetAllTermsResetBefore(reset_to_term_itr.get()); + std::reverse(reset_terms.begin(), reset_terms.end()); + + EXPECT_THAT(reset_terms, testing::ElementsAreArray(advance_terms)); + EXPECT_THAT(reset_to_term_itr->GetTerm(), Eq(advance_itr->GetTerm())); +} + +TEST_P(IcuLanguageSegmenterAllLocalesTest, + KoreanResetToTermBeforeEquivalentToAdvance) { + ICING_ASSERT_OK_AND_ASSIGN(auto segmenter, + language_segmenter_factory::Create(GetOptions())); + constexpr std::string_view kKorean = "나는 매일 출근합니다."; + ICING_ASSERT_OK_AND_ASSIGN( + std::unique_ptr<LanguageSegmenter::Iterator> advance_itr, + segmenter->Segment(kKorean)); + std::vector<std::string_view> advance_terms = + GetAllTermsAdvance(advance_itr.get()); + // Can't produce the last term via calls to ResetToTermBefore. So skip + // past that one. 
+ auto itr = advance_terms.begin(); + std::advance(itr, advance_terms.size() - 1); + advance_terms.erase(itr); + + ICING_ASSERT_OK_AND_ASSIGN( + std::unique_ptr<LanguageSegmenter::Iterator> reset_to_term_itr, + segmenter->Segment(kKorean)); + std::vector<std::string_view> reset_terms = + GetAllTermsResetBefore(reset_to_term_itr.get()); + std::reverse(reset_terms.begin(), reset_terms.end()); + + EXPECT_THAT(reset_terms, testing::ElementsAreArray(advance_terms)); + EXPECT_THAT(reset_to_term_itr->GetTerm(), Eq(advance_itr->GetTerm())); +} + +TEST_P(IcuLanguageSegmenterAllLocalesTest, MixedLanguagesResetToTermBefore) { + ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter, + language_segmenter_factory::Create(GetOptions())); + ICING_ASSERT_OK_AND_ASSIGN( + std::unique_ptr<LanguageSegmenter::Iterator> itr, + language_segmenter->Segment("How are you你好吗お元気ですか")); + + // String: "How are you你好吗お元気ですか" + // ^ ^^ ^^ ^ ^ ^ ^ ^ ^ + // Bytes: 0 3 4 7 8 11 172023 29 35 + EXPECT_THAT(itr->ResetToTermEndingBefore(2), + StatusIs(libtextclassifier3::StatusCode::NOT_FOUND)); + EXPECT_THAT(itr->GetTerm(), IsEmpty()); + + EXPECT_THAT(itr->ResetToTermEndingBefore(10), IsOkAndHolds(Eq(7))); + EXPECT_THAT(itr->GetTerm(), Eq(" ")); + + EXPECT_THAT(itr->ResetToTermEndingBefore(7), IsOkAndHolds(Eq(4))); + EXPECT_THAT(itr->GetTerm(), Eq("are")); + + EXPECT_THAT(itr->ResetToTermEndingBefore(32), IsOkAndHolds(Eq(23))); + EXPECT_THAT(itr->GetTerm(), Eq("元気")); + + EXPECT_THAT(itr->ResetToTermEndingBefore(14), IsOkAndHolds(Eq(8))); + EXPECT_THAT(itr->GetTerm(), Eq("you")); + + EXPECT_THAT(itr->ResetToTermEndingBefore(0), + StatusIs(libtextclassifier3::StatusCode::NOT_FOUND)); + EXPECT_THAT(itr->GetTerm(), IsEmpty()); + + EXPECT_THAT(itr->ResetToTermEndingBefore(35), IsOkAndHolds(Eq(29))); + EXPECT_THAT(itr->GetTerm(), Eq("です")); +} + +TEST_P(IcuLanguageSegmenterAllLocalesTest, + ContinuousWhitespacesResetToTermBefore) { + ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter, + 
language_segmenter_factory::Create(GetOptions())); + // Multiple continuous whitespaces are treated as one. + constexpr std::string_view kTextWithSpace = "Hello World"; + ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<LanguageSegmenter::Iterator> itr, + language_segmenter->Segment(kTextWithSpace)); + + // String: "Hello World" + // ^ ^ ^ + // Bytes: 0 5 15 + EXPECT_THAT(itr->ResetToTermEndingBefore(0), + StatusIs(libtextclassifier3::StatusCode::NOT_FOUND)); + EXPECT_THAT(itr->GetTerm(), IsEmpty()); + + EXPECT_THAT(itr->ResetToTermEndingBefore(2), + StatusIs(libtextclassifier3::StatusCode::NOT_FOUND)); + EXPECT_THAT(itr->GetTerm(), IsEmpty()); + + EXPECT_THAT(itr->ResetToTermEndingBefore(10), IsOkAndHolds(Eq(0))); + EXPECT_THAT(itr->GetTerm(), Eq("Hello")); + + EXPECT_THAT(itr->ResetToTermEndingBefore(5), IsOkAndHolds(Eq(0))); + EXPECT_THAT(itr->GetTerm(), Eq("Hello")); + + EXPECT_THAT(itr->ResetToTermEndingBefore(15), IsOkAndHolds(Eq(5))); + EXPECT_THAT(itr->GetTerm(), Eq(" ")); + + EXPECT_THAT(itr->ResetToTermEndingBefore(17), IsOkAndHolds(Eq(5))); + EXPECT_THAT(itr->GetTerm(), Eq(" ")); + + EXPECT_THAT(itr->ResetToTermEndingBefore(19), IsOkAndHolds(Eq(5))); + EXPECT_THAT(itr->GetTerm(), Eq(" ")); +} + +TEST_P(IcuLanguageSegmenterAllLocalesTest, ChineseResetToTermBefore) { + ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter, + language_segmenter_factory::Create(GetOptions())); + // CJKT (Chinese, Japanese, Khmer, Thai) are the 4 main languages that + // don't have whitespaces as word delimiter. 
Chinese + constexpr std::string_view kChinese = "我每天走路去上班。"; + ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<LanguageSegmenter::Iterator> itr, + language_segmenter->Segment(kChinese)); + // String: "我每天走路去上班。" + // ^ ^ ^ ^^ + // Bytes: 0 3 9 15 18 + EXPECT_THAT(itr->ResetToTermEndingBefore(0), + StatusIs(libtextclassifier3::StatusCode::NOT_FOUND)); + EXPECT_THAT(itr->GetTerm(), IsEmpty()); + + EXPECT_THAT(itr->ResetToTermEndingBefore(7), IsOkAndHolds(Eq(0))); + EXPECT_THAT(itr->GetTerm(), Eq("我")); + + EXPECT_THAT(itr->ResetToTermEndingBefore(19), IsOkAndHolds(Eq(15))); + EXPECT_THAT(itr->GetTerm(), Eq("去")); +} + +TEST_P(IcuLanguageSegmenterAllLocalesTest, JapaneseResetToTermBefore) { + ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter, + language_segmenter_factory::Create(GetOptions())); + // Japanese + constexpr std::string_view kJapanese = "私は毎日仕事に歩いています。"; + ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<LanguageSegmenter::Iterator> itr, + language_segmenter->Segment(kJapanese)); + // String: "私は毎日仕事に歩いています。" + // ^ ^ ^ ^ ^ ^ ^ ^ ^ + // Bytes: 0 3 6 12 18212427 33 + EXPECT_THAT(itr->ResetToTermEndingBefore(0), + StatusIs(libtextclassifier3::StatusCode::NOT_FOUND)); + EXPECT_THAT(itr->GetTerm(), IsEmpty()); + + EXPECT_THAT(itr->ResetToTermEndingBefore(33), IsOkAndHolds(Eq(27))); + EXPECT_THAT(itr->GetTerm(), Eq("てい")); + + EXPECT_THAT(itr->ResetToTermEndingBefore(7), IsOkAndHolds(Eq(3))); + EXPECT_THAT(itr->GetTerm(), Eq("は")); +} + +TEST_P(IcuLanguageSegmenterAllLocalesTest, KhmerResetToTermBefore) { + ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter, + language_segmenter_factory::Create(GetOptions())); + constexpr std::string_view kKhmer = "ញុំដើរទៅធ្វើការរាល់ថ្ងៃ។"; + ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<LanguageSegmenter::Iterator> itr, + language_segmenter->Segment(kKhmer)); + // String: "ញុំដើរទៅធ្វើការរាល់ថ្ងៃ។" + // ^ ^ ^ ^ + // Bytes: 0 9 24 45 + EXPECT_THAT(itr->ResetToTermEndingBefore(0), + StatusIs(libtextclassifier3::StatusCode::NOT_FOUND)); + 
EXPECT_THAT(itr->GetTerm(), IsEmpty()); + + EXPECT_THAT(itr->ResetToTermEndingBefore(47), IsOkAndHolds(Eq(24))); + EXPECT_THAT(itr->GetTerm(), Eq("ធ្វើការ")); + + EXPECT_THAT(itr->ResetToTermEndingBefore(14), IsOkAndHolds(Eq(0))); + EXPECT_THAT(itr->GetTerm(), Eq("ញុំ")); +} + +TEST_P(IcuLanguageSegmenterAllLocalesTest, ThaiResetToTermBefore) { + ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter, + language_segmenter_factory::Create(GetOptions())); + // Thai + constexpr std::string_view kThai = "ฉันเดินไปทำงานทุกวัน"; + ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<LanguageSegmenter::Iterator> itr, + language_segmenter->Segment(kThai)); + // String: "ฉันเดินไปทำงานทุกวัน" + // ^ ^ ^ ^ ^ ^ + // Bytes: 0 9 21 27 42 51 + EXPECT_THAT(itr->ResetToTermEndingBefore(0), + StatusIs(libtextclassifier3::StatusCode::NOT_FOUND)); + EXPECT_THAT(itr->GetTerm(), IsEmpty()); + + EXPECT_THAT(itr->ResetToTermEndingBefore(51), IsOkAndHolds(Eq(42))); + EXPECT_THAT(itr->GetTerm(), Eq("ทุก")); + + EXPECT_THAT(itr->ResetToTermEndingBefore(13), IsOkAndHolds(Eq(0))); + EXPECT_THAT(itr->GetTerm(), Eq("ฉัน")); + + EXPECT_THAT(itr->ResetToTermEndingBefore(34), IsOkAndHolds(Eq(21))); + EXPECT_THAT(itr->GetTerm(), Eq("ไป")); +} + +INSTANTIATE_TEST_SUITE_P( + LocaleName, IcuLanguageSegmenterAllLocalesTest, + testing::Values(ULOC_US, ULOC_UK, ULOC_CANADA, ULOC_CANADA_FRENCH, + ULOC_FRANCE, ULOC_GERMANY, ULOC_ITALY, ULOC_JAPAN, + ULOC_KOREA, ULOC_SIMPLIFIED_CHINESE, + ULOC_TRADITIONAL_CHINESE, + "es_ES", // Spanish + "hi_IN", // Hindi + "th_TH", // Thai + "lo_LA", // Lao + "km_KH", // Khmer + "ar_DZ", // Arabic + "ru_RU", // Russian + "pt_PT", // Portuguese + "en_US_POSIX" // American English (Computer) + "wrong_locale" // Will fall back to ICU default locale + "" // Will fall back to ICU default locale + )); + +} // namespace +} // namespace lib +} // namespace icing diff --git a/icing/tokenization/language-segmenter-factory.h b/icing/tokenization/language-segmenter-factory.h index 244bcd8..5a4047b 
100644 --- a/icing/tokenization/language-segmenter-factory.h +++ b/icing/tokenization/language-segmenter-factory.h @@ -18,19 +18,24 @@ #include <memory> #include <string_view> +#include "icing/jni/jni-cache.h" #include "icing/text_classifier/lib3/utils/base/statusor.h" #include "icing/tokenization/language-segmenter.h" -#include "icing/util/icu-i18n-utils.h" +#include "icing/util/i18n-utils.h" +#include "unicode/uloc.h" namespace icing { namespace lib { namespace language_segmenter_factory { -enum SegmenterType { - ICU4C, // Uses the ICU library to segment text. - SPACE, // Segments only on whitespace. Currently not used in production; used - // to compile in Jetpack +struct SegmenterOptions { + explicit SegmenterOptions(std::string locale = ULOC_US, + const JniCache* jni_cache = nullptr) + : locale(std::move(locale)), jni_cache(jni_cache) {} + + std::string locale; + const JniCache* jni_cache; }; // Creates a language segmenter with the given locale. @@ -39,7 +44,7 @@ enum SegmenterType { // A LanguageSegmenter on success // INVALID_ARGUMENT if locale string is invalid libtextclassifier3::StatusOr<std::unique_ptr<LanguageSegmenter>> Create( - SegmenterType type, std::string locale = ULOC_US); + SegmenterOptions options = SegmenterOptions()); } // namespace language_segmenter_factory diff --git a/icing/tokenization/language-segmenter-iterator_test.cc b/icing/tokenization/language-segmenter-iterator_test.cc index 6af44e1..c7b068d 100644 --- a/icing/tokenization/language-segmenter-iterator_test.cc +++ b/icing/tokenization/language-segmenter-iterator_test.cc @@ -15,7 +15,7 @@ #include "gmock/gmock.h" #include "gtest/gtest.h" #include "icing/absl_ports/str_cat.h" -#include "icing/icu-data-file-helper.h" +#include "icing/helpers/icu/icu-data-file-helper.h" #include "icing/testing/common-matchers.h" #include "icing/testing/test-data.h" #include "icing/tokenization/language-segmenter-factory.h" @@ -32,8 +32,7 @@ using ::testing::Eq; // don't need to stress test the 
implementation's definition of a term. These // test that it advances and traverses through simple terms consistently between // all the implementations. -class LanguageSegmenterIteratorTest - : public testing::TestWithParam<language_segmenter_factory::SegmenterType> { +class LanguageSegmenterIteratorTest : public testing::Test { protected: void SetUp() override { ICING_ASSERT_OK( @@ -41,15 +40,11 @@ class LanguageSegmenterIteratorTest icu_data_file_helper::SetUpICUDataFile( GetTestFilePath("icing/icu.dat"))); } - - static language_segmenter_factory::SegmenterType GetType() { - return GetParam(); - } }; -TEST_P(LanguageSegmenterIteratorTest, AdvanceAndGetTerm) { +TEST_F(LanguageSegmenterIteratorTest, AdvanceAndGetTerm) { ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter, - language_segmenter_factory::Create(GetType())); + language_segmenter_factory::Create()); ICING_ASSERT_OK_AND_ASSIGN(auto iterator, language_segmenter->Segment("foo bar")); @@ -65,10 +60,10 @@ TEST_P(LanguageSegmenterIteratorTest, AdvanceAndGetTerm) { EXPECT_FALSE(iterator->Advance()); } -TEST_P(LanguageSegmenterIteratorTest, +TEST_F(LanguageSegmenterIteratorTest, ResetToTermStartingAfterWithOffsetInText) { ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter, - language_segmenter_factory::Create(GetType())); + language_segmenter_factory::Create()); ICING_ASSERT_OK_AND_ASSIGN(auto iterator, language_segmenter->Segment("foo bar")); @@ -80,45 +75,48 @@ TEST_P(LanguageSegmenterIteratorTest, StatusIs(libtextclassifier3::StatusCode::NOT_FOUND)); } -TEST_P(LanguageSegmenterIteratorTest, - ResetToTermStartingAfterWithNegativeOffsetOk) { +TEST_F(LanguageSegmenterIteratorTest, + ResetToTermStartingAfterWithNegativeOffsetNotOk) { ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter, - language_segmenter_factory::Create(GetType())); + language_segmenter_factory::Create()); ICING_ASSERT_OK_AND_ASSIGN(auto iterator, language_segmenter->Segment("foo bar")); 
EXPECT_THAT(iterator->ResetToTermStartingAfter(/*offset=*/-1), - IsOkAndHolds(0)); // The term "foo" + StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT)); EXPECT_THAT(iterator->ResetToTermStartingAfter(/*offset=*/-100), - IsOkAndHolds(0)); // The term "foo" + StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT)); + + EXPECT_THAT(iterator->ResetToStart(), IsOkAndHolds(0)); + EXPECT_THAT(iterator->GetTerm(), Eq("foo")); } -TEST_P(LanguageSegmenterIteratorTest, - ResetToTermStartingAfterWithTextLengthOffsetNotFound) { +TEST_F(LanguageSegmenterIteratorTest, + ResetToTermStartingAfterWithTextLengthOffsetInvalidArgument) { std::string text = "foo bar"; ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter, - language_segmenter_factory::Create(GetType())); + language_segmenter_factory::Create()); ICING_ASSERT_OK_AND_ASSIGN(auto iterator, language_segmenter->Segment(text)); EXPECT_THAT(iterator->ResetToTermStartingAfter(/*offset=*/text.size()), - StatusIs(libtextclassifier3::StatusCode::NOT_FOUND)); + StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT)); } -TEST_P(LanguageSegmenterIteratorTest, - ResetToTermStartingAfterWithOffsetPastTextLengthNotFound) { +TEST_F(LanguageSegmenterIteratorTest, + ResetToTermStartingAfterWithOffsetPastTextLengthInvalidArgument) { std::string text = "foo bar"; ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter, - language_segmenter_factory::Create(GetType())); + language_segmenter_factory::Create()); ICING_ASSERT_OK_AND_ASSIGN(auto iterator, language_segmenter->Segment(text)); EXPECT_THAT(iterator->ResetToTermStartingAfter(/*offset=*/100), - StatusIs(libtextclassifier3::StatusCode::NOT_FOUND)); + StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT)); } -TEST_P(LanguageSegmenterIteratorTest, ResetToTermEndingBeforeWithOffsetInText) { +TEST_F(LanguageSegmenterIteratorTest, ResetToTermEndingBeforeWithOffsetInText) { ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter, - language_segmenter_factory::Create(GetType())); + 
language_segmenter_factory::Create()); ICING_ASSERT_OK_AND_ASSIGN(auto iterator, language_segmenter->Segment("foo bar")); @@ -130,50 +128,46 @@ TEST_P(LanguageSegmenterIteratorTest, ResetToTermEndingBeforeWithOffsetInText) { StatusIs(libtextclassifier3::StatusCode::NOT_FOUND)); } -TEST_P(LanguageSegmenterIteratorTest, - ResetToTermEndingBeforeWithZeroOrNegativeOffsetNotFound) { +TEST_F(LanguageSegmenterIteratorTest, + ResetToTermEndingBeforeWithZeroNotFound) { ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter, - language_segmenter_factory::Create(GetType())); + language_segmenter_factory::Create()); ICING_ASSERT_OK_AND_ASSIGN(auto iterator, language_segmenter->Segment("foo bar")); + // Zero is a valid argument, but there aren't any terms that end before it. EXPECT_THAT(iterator->ResetToTermEndingBefore(/*offset=*/0), StatusIs(libtextclassifier3::StatusCode::NOT_FOUND)); +} + +TEST_F(LanguageSegmenterIteratorTest, + ResetToTermEndingBeforeWithNegativeOffsetInvalidArgument) { + ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter, + language_segmenter_factory::Create()); + ICING_ASSERT_OK_AND_ASSIGN(auto iterator, + language_segmenter->Segment("foo bar")); EXPECT_THAT(iterator->ResetToTermEndingBefore(/*offset=*/-1), - StatusIs(libtextclassifier3::StatusCode::NOT_FOUND)); + StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT)); EXPECT_THAT(iterator->ResetToTermEndingBefore(/*offset=*/-100), - StatusIs(libtextclassifier3::StatusCode::NOT_FOUND)); + StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT)); } -TEST_P(LanguageSegmenterIteratorTest, - ResetToTermEndingBeforeWithTextLengthOffsetOk) { +TEST_F(LanguageSegmenterIteratorTest, + ResetToTermEndingBeforeWithOffsetPastTextEndInvalidArgument) { std::string text = "foo bar"; ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter, - language_segmenter_factory::Create(GetType())); + language_segmenter_factory::Create()); ICING_ASSERT_OK_AND_ASSIGN(auto iterator, language_segmenter->Segment(text)); 
EXPECT_THAT(iterator->ResetToTermEndingBefore(/*offset=*/text.length()), - IsOkAndHolds(4)); // The term "bar" -} - -TEST_P(LanguageSegmenterIteratorTest, - ResetToTermEndingBeforeWithOffsetPastTextLengthNotFound) { - std::string text = "foo bar"; - ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter, - language_segmenter_factory::Create(GetType())); - ICING_ASSERT_OK_AND_ASSIGN(auto iterator, language_segmenter->Segment(text)); + StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT)); EXPECT_THAT(iterator->ResetToTermEndingBefore(/*offset=*/text.length() + 1), - StatusIs(libtextclassifier3::StatusCode::NOT_FOUND)); + StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT)); } -INSTANTIATE_TEST_SUITE_P( - SegmenterType, LanguageSegmenterIteratorTest, - testing::Values(language_segmenter_factory::SegmenterType::ICU4C, - language_segmenter_factory::SegmenterType::SPACE)); - } // namespace } // namespace lib } // namespace icing diff --git a/icing/tokenization/language-segmenter.h b/icing/tokenization/language-segmenter.h index fde9ae2..fdb1846 100644 --- a/icing/tokenization/language-segmenter.h +++ b/icing/tokenization/language-segmenter.h @@ -64,16 +64,18 @@ class LanguageSegmenter { // iterator.ResetToTermStartingAfter(4); // iterator.GetTerm() // returns "baz"; // - // Passing in a negative offset will return the offset of the first term. - // - // Passing in an offset that is equal to or exceeds the underlying text - // length will return NOT_FOUND. + // Return types of OK and NOT_FOUND indicate that the function call was + // valid and the state of the iterator has changed. Return type of + // INVALID_ARGUMENT will leave the iterator unchanged. // // Returns: // On success, the starting position of the first term that starts after // offset. // NOT_FOUND if an error occurred or there are no terms that start after // offset. + // INVALID_ARGUMENT if offset is out of bounds for the provided text. 
+ // ABORTED if an invalid unicode character is encountered while + // traversing the text. virtual libtextclassifier3::StatusOr<int32_t> ResetToTermStartingAfter( int32_t offset) = 0; @@ -85,21 +87,22 @@ class LanguageSegmenter { // iterator.ResetToTermEndingBefore(7); // iterator.GetTerm() // returns "bar"; // - // Passing in an offset equal to or less than 0 will return NOT_FOUND. - // - // Passing in an offset equal to the underlying text length will return the - // offset of the last term. - // - // Passing in an offset that is greater than the underlying text length will - // return NOT_FOUND. + // Return types of OK and NOT_FOUND indicate that the function call was + // valid and the state of the iterator has changed. Return type of + // INVALID_ARGUMENT will leave the iterator unchanged. // // Returns: // On success, the starting position of the first term that ends before // offset. // NOT_FOUND if an error occurred or there are no terms that ends before // offset. + // INVALID_ARGUMENT if offset is out of bounds for the provided text. + // ABORTED if an invalid unicode character is encountered while + // traversing the text. virtual libtextclassifier3::StatusOr<int32_t> ResetToTermEndingBefore( int32_t offset) = 0; + + virtual libtextclassifier3::StatusOr<int32_t> ResetToStart() = 0; }; // Segments the input text into terms. 
diff --git a/icing/tokenization/language-segmenter_benchmark.cc b/icing/tokenization/language-segmenter_benchmark.cc index 01cc938..49ddfca 100644 --- a/icing/tokenization/language-segmenter_benchmark.cc +++ b/icing/tokenization/language-segmenter_benchmark.cc @@ -14,7 +14,7 @@ #include "testing/base/public/benchmark.h" #include "gmock/gmock.h" -#include "icing/icu-data-file-helper.h" +#include "icing/helpers/icu/icu-data-file-helper.h" #include "icing/testing/common-matchers.h" #include "icing/testing/test-data.h" #include "icing/tokenization/language-segmenter-factory.h" @@ -60,8 +60,7 @@ void BM_SegmentNoSpace(benchmark::State& state) { } std::unique_ptr<LanguageSegmenter> language_segmenter = - language_segmenter_factory::Create(language_segmenter_factory::ICU4C) - .ValueOrDie(); + language_segmenter_factory::Create().ValueOrDie(); std::string input_string(state.range(0), 'A'); @@ -97,8 +96,7 @@ void BM_SegmentWithSpaces(benchmark::State& state) { } std::unique_ptr<LanguageSegmenter> language_segmenter = - language_segmenter_factory::Create(language_segmenter_factory::ICU4C) - .ValueOrDie(); + language_segmenter_factory::Create().ValueOrDie(); std::string input_string(state.range(0), 'A'); for (int i = 1; i < input_string.length(); i += 2) { @@ -137,8 +135,7 @@ void BM_SegmentCJK(benchmark::State& state) { } std::unique_ptr<LanguageSegmenter> language_segmenter = - language_segmenter_factory::Create(language_segmenter_factory::ICU4C) - .ValueOrDie(); + language_segmenter_factory::Create().ValueOrDie(); std::string input_string; while (input_string.length() < state.range(0)) { diff --git a/icing/tokenization/plain-tokenizer.cc b/icing/tokenization/plain-tokenizer.cc index 556a095..6e54af9 100644 --- a/icing/tokenization/plain-tokenizer.cc +++ b/icing/tokenization/plain-tokenizer.cc @@ -18,7 +18,7 @@ #include "icing/text_classifier/lib3/utils/base/statusor.h" #include "icing/tokenization/language-segmenter.h" -#include "icing/util/icu-i18n-utils.h" +#include 
"icing/util/i18n-utils.h" #include "icing/util/status-macros.h" namespace icing { @@ -39,8 +39,8 @@ bool IsValidTerm(std::string_view term) { } // Gets the first unicode character. We can know what the whole term is by // checking only the first character. - return !icu_i18n_utils::IsWhitespaceAt(term, /*position=*/0) && - !icu_i18n_utils::IsPunctuationAt(term, /*position=*/0); + return !i18n_utils::IsWhitespaceAt(term, /*position=*/0) && + !i18n_utils::IsPunctuationAt(term, /*position=*/0); } } // namespace @@ -96,6 +96,18 @@ class PlainTokenIterator : public Tokenizer::Iterator { return true; } + bool ResetToStart() override { + if (!base_iterator_->ResetToStart().ok()) { + return false; + } + current_term_ = base_iterator_->GetTerm(); + if (!IsValidTerm(current_term_)) { + // If the current value isn't valid, advance to the next valid value. + return Advance(); + } + return true; + } + private: std::unique_ptr<LanguageSegmenter::Iterator> base_iterator_; std::string_view current_term_; diff --git a/icing/tokenization/plain-tokenizer_test.cc b/icing/tokenization/plain-tokenizer_test.cc index e7d6e29..f2fc678 100644 --- a/icing/tokenization/plain-tokenizer_test.cc +++ b/icing/tokenization/plain-tokenizer_test.cc @@ -18,7 +18,7 @@ #include "gmock/gmock.h" #include "icing/absl_ports/str_cat.h" -#include "icing/icu-data-file-helper.h" +#include "icing/helpers/icu/icu-data-file-helper.h" #include "icing/testing/common-matchers.h" #include "icing/testing/icu-i18n-test-utils.h" #include "icing/testing/test-data.h" @@ -49,9 +49,8 @@ TEST_F(PlainTokenizerTest, CreationWithNullPointerShouldFail) { } TEST_F(PlainTokenizerTest, Simple) { - ICING_ASSERT_OK_AND_ASSIGN( - auto language_segmenter, - language_segmenter_factory::Create(language_segmenter_factory::ICU4C)); + ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter, + language_segmenter_factory::Create()); ICING_ASSERT_OK_AND_ASSIGN( std::unique_ptr<Tokenizer> plain_tokenizer, tokenizer_factory::CreateIndexingTokenizer( 
@@ -82,9 +81,8 @@ TEST_F(PlainTokenizerTest, Simple) { } TEST_F(PlainTokenizerTest, Whitespace) { - ICING_ASSERT_OK_AND_ASSIGN( - auto language_segmenter, - language_segmenter_factory::Create(language_segmenter_factory::ICU4C)); + ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter, + language_segmenter_factory::Create()); ICING_ASSERT_OK_AND_ASSIGN( std::unique_ptr<Tokenizer> plain_tokenizer, tokenizer_factory::CreateIndexingTokenizer( @@ -109,9 +107,8 @@ TEST_F(PlainTokenizerTest, Whitespace) { } TEST_F(PlainTokenizerTest, Punctuation) { - ICING_ASSERT_OK_AND_ASSIGN( - auto language_segmenter, - language_segmenter_factory::Create(language_segmenter_factory::ICU4C)); + ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter, + language_segmenter_factory::Create()); ICING_ASSERT_OK_AND_ASSIGN( std::unique_ptr<Tokenizer> plain_tokenizer, tokenizer_factory::CreateIndexingTokenizer( @@ -139,9 +136,8 @@ TEST_F(PlainTokenizerTest, Punctuation) { } TEST_F(PlainTokenizerTest, SpecialCharacters) { - ICING_ASSERT_OK_AND_ASSIGN( - auto language_segmenter, - language_segmenter_factory::Create(language_segmenter_factory::ICU4C)); + ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter, + language_segmenter_factory::Create()); ICING_ASSERT_OK_AND_ASSIGN( std::unique_ptr<Tokenizer> plain_tokenizer, tokenizer_factory::CreateIndexingTokenizer( @@ -161,9 +157,8 @@ TEST_F(PlainTokenizerTest, SpecialCharacters) { } TEST_F(PlainTokenizerTest, CJKT) { - ICING_ASSERT_OK_AND_ASSIGN( - auto language_segmenter, - language_segmenter_factory::Create(language_segmenter_factory::ICU4C)); + ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter, + language_segmenter_factory::Create()); ICING_ASSERT_OK_AND_ASSIGN( std::unique_ptr<Tokenizer> plain_tokenizer, tokenizer_factory::CreateIndexingTokenizer( @@ -214,9 +209,8 @@ TEST_F(PlainTokenizerTest, CJKT) { } TEST_F(PlainTokenizerTest, ResetToTokenAfterSimple) { - ICING_ASSERT_OK_AND_ASSIGN( - auto language_segmenter, - 
language_segmenter_factory::Create(language_segmenter_factory::ICU4C)); + ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter, + language_segmenter_factory::Create()); ICING_ASSERT_OK_AND_ASSIGN( std::unique_ptr<Tokenizer> plain_tokenizer, tokenizer_factory::CreateIndexingTokenizer( @@ -232,9 +226,8 @@ TEST_F(PlainTokenizerTest, ResetToTokenAfterSimple) { } TEST_F(PlainTokenizerTest, ResetToTokenBeforeSimple) { - ICING_ASSERT_OK_AND_ASSIGN( - auto language_segmenter, - language_segmenter_factory::Create(language_segmenter_factory::ICU4C)); + ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter, + language_segmenter_factory::Create()); ICING_ASSERT_OK_AND_ASSIGN( std::unique_ptr<Tokenizer> plain_tokenizer, tokenizer_factory::CreateIndexingTokenizer( @@ -250,9 +243,8 @@ TEST_F(PlainTokenizerTest, ResetToTokenBeforeSimple) { } TEST_F(PlainTokenizerTest, ResetToTokenAfter) { - ICING_ASSERT_OK_AND_ASSIGN( - auto language_segmenter, - language_segmenter_factory::Create(language_segmenter_factory::ICU4C)); + ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter, + language_segmenter_factory::Create()); ICING_ASSERT_OK_AND_ASSIGN( std::unique_ptr<Tokenizer> plain_tokenizer, tokenizer_factory::CreateIndexingTokenizer( @@ -299,9 +291,8 @@ TEST_F(PlainTokenizerTest, ResetToTokenAfter) { } TEST_F(PlainTokenizerTest, ResetToTokenBefore) { - ICING_ASSERT_OK_AND_ASSIGN( - auto language_segmenter, - language_segmenter_factory::Create(language_segmenter_factory::ICU4C)); + ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter, + language_segmenter_factory::Create()); ICING_ASSERT_OK_AND_ASSIGN( std::unique_ptr<Tokenizer> plain_tokenizer, tokenizer_factory::CreateIndexingTokenizer( diff --git a/icing/tokenization/raw-query-tokenizer.cc b/icing/tokenization/raw-query-tokenizer.cc index 6819f8d..8b2edc9 100644 --- a/icing/tokenization/raw-query-tokenizer.cc +++ b/icing/tokenization/raw-query-tokenizer.cc @@ -29,7 +29,7 @@ #include "icing/tokenization/language-segmenter.h" #include 
"icing/tokenization/token.h" #include "icing/tokenization/tokenizer.h" -#include "icing/util/icu-i18n-utils.h" +#include "icing/util/i18n-utils.h" #include "icing/util/status-macros.h" // This file provides rules that tell the tokenizer what to do when it sees a @@ -316,7 +316,7 @@ TermType GetTermType(std::string_view term) { return OR_OPERATOR; } // Checks the first char to see if it's an ASCII term - if (icu_i18n_utils::IsAscii(term[0])) { + if (i18n_utils::IsAscii(term[0])) { if (std::isalnum(term[0])) { return ALPHANUMERIC_TERM; } @@ -381,7 +381,7 @@ libtextclassifier3::Status OutputToken(State new_state, case ALPHANUMERIC_TERM: if (new_state == PROCESSING_PROPERTY_TERM) { // Asserts extra rule 1: property name must be in ASCII - if (!icu_i18n_utils::IsAscii(current_term[0])) { + if (!i18n_utils::IsAscii(current_term[0])) { return absl_ports::InvalidArgumentError( "Characters in property name must all be ASCII."); } diff --git a/icing/tokenization/raw-query-tokenizer_test.cc b/icing/tokenization/raw-query-tokenizer_test.cc index dfcc09a..351f7c1 100644 --- a/icing/tokenization/raw-query-tokenizer_test.cc +++ b/icing/tokenization/raw-query-tokenizer_test.cc @@ -16,7 +16,7 @@ #include "gmock/gmock.h" #include "gtest/gtest.h" -#include "icing/icu-data-file-helper.h" +#include "icing/helpers/icu/icu-data-file-helper.h" #include "icing/testing/common-matchers.h" #include "icing/testing/test-data.h" #include "icing/tokenization/language-segmenter-factory.h" @@ -46,9 +46,8 @@ TEST_F(RawQueryTokenizerTest, CreationWithNullPointerShouldFail) { } TEST_F(RawQueryTokenizerTest, Simple) { - ICING_ASSERT_OK_AND_ASSIGN( - auto language_segmenter, - language_segmenter_factory::Create(language_segmenter_factory::ICU4C)); + ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter, + language_segmenter_factory::Create()); ICING_ASSERT_OK_AND_ASSIGN( std::unique_ptr<Tokenizer> raw_query_tokenizer, tokenizer_factory::CreateQueryTokenizer(tokenizer_factory::RAW_QUERY, @@ -60,9 +59,8 @@ 
TEST_F(RawQueryTokenizerTest, Simple) { } TEST_F(RawQueryTokenizerTest, Parentheses) { - ICING_ASSERT_OK_AND_ASSIGN( - auto language_segmenter, - language_segmenter_factory::Create(language_segmenter_factory::ICU4C)); + ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter, + language_segmenter_factory::Create()); ICING_ASSERT_OK_AND_ASSIGN( std::unique_ptr<Tokenizer> raw_query_tokenizer, tokenizer_factory::CreateQueryTokenizer(tokenizer_factory::RAW_QUERY, @@ -161,9 +159,8 @@ TEST_F(RawQueryTokenizerTest, Parentheses) { } TEST_F(RawQueryTokenizerTest, Exclustion) { - ICING_ASSERT_OK_AND_ASSIGN( - auto language_segmenter, - language_segmenter_factory::Create(language_segmenter_factory::ICU4C)); + ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter, + language_segmenter_factory::Create()); ICING_ASSERT_OK_AND_ASSIGN( std::unique_ptr<Tokenizer> raw_query_tokenizer, tokenizer_factory::CreateQueryTokenizer(tokenizer_factory::RAW_QUERY, @@ -229,9 +226,8 @@ TEST_F(RawQueryTokenizerTest, Exclustion) { } TEST_F(RawQueryTokenizerTest, PropertyRestriction) { - ICING_ASSERT_OK_AND_ASSIGN( - auto language_segmenter, - language_segmenter_factory::Create(language_segmenter_factory::ICU4C)); + ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter, + language_segmenter_factory::Create()); ICING_ASSERT_OK_AND_ASSIGN( std::unique_ptr<Tokenizer> raw_query_tokenizer, tokenizer_factory::CreateQueryTokenizer(tokenizer_factory::RAW_QUERY, @@ -318,9 +314,8 @@ TEST_F(RawQueryTokenizerTest, PropertyRestriction) { } TEST_F(RawQueryTokenizerTest, OR) { - ICING_ASSERT_OK_AND_ASSIGN( - auto language_segmenter, - language_segmenter_factory::Create(language_segmenter_factory::ICU4C)); + ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter, + language_segmenter_factory::Create()); ICING_ASSERT_OK_AND_ASSIGN( std::unique_ptr<Tokenizer> raw_query_tokenizer, tokenizer_factory::CreateQueryTokenizer(tokenizer_factory::RAW_QUERY, @@ -440,9 +435,8 @@ TEST_F(RawQueryTokenizerTest, OR) { // CJKT are treated the 
same way by language segmenter and raw tokenizer, so // here we test Chinese and Japanese to represent CJKT. TEST_F(RawQueryTokenizerTest, CJKT) { - ICING_ASSERT_OK_AND_ASSIGN( - auto language_segmenter, - language_segmenter_factory::Create(language_segmenter_factory::ICU4C)); + ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter, + language_segmenter_factory::Create()); ICING_ASSERT_OK_AND_ASSIGN( std::unique_ptr<Tokenizer> raw_query_tokenizer, tokenizer_factory::CreateQueryTokenizer(tokenizer_factory::RAW_QUERY, @@ -494,9 +488,8 @@ TEST_F(RawQueryTokenizerTest, CJKT) { // Raw tokenizer identifies all characters that it doesn't know as OTHER type, // so we can choose comma "," to represent all OTHER characters. TEST_F(RawQueryTokenizerTest, OtherChars) { - ICING_ASSERT_OK_AND_ASSIGN( - auto language_segmenter, - language_segmenter_factory::Create(language_segmenter_factory::ICU4C)); + ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter, + language_segmenter_factory::Create()); ICING_ASSERT_OK_AND_ASSIGN( std::unique_ptr<Tokenizer> raw_query_tokenizer, tokenizer_factory::CreateQueryTokenizer(tokenizer_factory::RAW_QUERY, @@ -540,9 +533,8 @@ TEST_F(RawQueryTokenizerTest, OtherChars) { } TEST_F(RawQueryTokenizerTest, Mix) { - ICING_ASSERT_OK_AND_ASSIGN( - auto language_segmenter, - language_segmenter_factory::Create(language_segmenter_factory::ICU4C)); + ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter, + language_segmenter_factory::Create()); ICING_ASSERT_OK_AND_ASSIGN( std::unique_ptr<Tokenizer> raw_query_tokenizer, tokenizer_factory::CreateQueryTokenizer(tokenizer_factory::RAW_QUERY, diff --git a/icing/tokenization/reverse_jni/reverse-jni-language-segmenter-factory.cc b/icing/tokenization/reverse_jni/reverse-jni-language-segmenter-factory.cc new file mode 100644 index 0000000..f79bc68 --- /dev/null +++ b/icing/tokenization/reverse_jni/reverse-jni-language-segmenter-factory.cc @@ -0,0 +1,62 @@ +// Copyright (C) 2019 Google LLC +// +// Licensed under the Apache 
License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "icing/absl_ports/canonical_errors.h" +#include "icing/tokenization/language-segmenter-factory.h" +#include "icing/tokenization/reverse_jni/reverse-jni-language-segmenter.h" +#include "icing/util/logging.h" + +namespace icing { +namespace lib { + +namespace language_segmenter_factory { + +namespace { +constexpr std::string_view kLocaleAmericanEnglishComputer = "en_US_POSIX"; +} // namespace + +// Creates a language segmenter with the given locale. +// +// Returns: +// A LanguageSegmenter on success +// INVALID_ARGUMENT if locale string is invalid +// +// TODO(samzheng): Figure out if we want to verify locale strings and notify +// users. Right now illegal locale strings will be ignored by ICU. ICU +// components will be created with its default locale. +libtextclassifier3::StatusOr<std::unique_ptr<LanguageSegmenter>> Create( + SegmenterOptions options) { + if (options.jni_cache == nullptr) { + return absl_ports::InvalidArgumentError( + "Cannot create Reverse Jni Language Segmenter without a valid JniCache " + "pointer"); + } + // Word connector rules for "en_US_POSIX" (American English (Computer)) are + // different from other locales. E.g. "email.subject" will be split into 3 + // terms in "en_US_POSIX": "email", ".", and "subject", while it's just one + // term in other locales. Our current LanguageSegmenter doesn't handle this + // special rule, so we replace it with "en_US". 
+ if (options.locale == kLocaleAmericanEnglishComputer) { + ICING_LOG(WARNING) << "Locale " << kLocaleAmericanEnglishComputer + << " not supported. Converting to locale " << ULOC_US; + options.locale = ULOC_US; + } + return std::make_unique<ReverseJniLanguageSegmenter>( + std::move(options.locale), options.jni_cache); +} + +} // namespace language_segmenter_factory + +} // namespace lib +} // namespace icing diff --git a/icing/tokenization/reverse_jni/reverse-jni-language-segmenter-test-jni.cc b/icing/tokenization/reverse_jni/reverse-jni-language-segmenter-test-jni.cc new file mode 100644 index 0000000..8392363 --- /dev/null +++ b/icing/tokenization/reverse_jni/reverse-jni-language-segmenter-test-jni.cc @@ -0,0 +1,37 @@ +// Copyright (C) 2019 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include <jni.h> + +#include "gtest/gtest.h" +#include "icing/testing/logging-event-listener.h" + +// Global variable used so that the test implementation can access the JNIEnv. 
+JNIEnv* g_jenv = nullptr; + +extern "C" JNIEXPORT jboolean JNICALL +Java_icing_tokenization_reverse_1jni_ReverseJniLanguageSegmenterTest_testsMain( + JNIEnv* env, jclass ignored) { + g_jenv = env; + + std::vector<char*> my_argv; + char arg[] = "reverse-jni-language-segmenter-test-lib"; + my_argv.push_back(arg); + int argc = 1; + char** argv = &(my_argv[0]); + testing::InitGoogleTest(&argc, argv); + testing::UnitTest::GetInstance()->listeners().Append( + new icing::lib::LoggingEventListener()); + return RUN_ALL_TESTS() == 0; +} diff --git a/icing/tokenization/reverse_jni/reverse-jni-language-segmenter-test.cc b/icing/tokenization/reverse_jni/reverse-jni-language-segmenter-test.cc new file mode 100644 index 0000000..a01d944 --- /dev/null +++ b/icing/tokenization/reverse_jni/reverse-jni-language-segmenter-test.cc @@ -0,0 +1,1085 @@ +// Copyright (C) 2019 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "icing/tokenization/reverse_jni/reverse-jni-language-segmenter-test.h" + +#include <memory> +#include <string_view> + +#include "icing/text_classifier/lib3/utils/base/status.h" +#include "icing/text_classifier/lib3/utils/base/statusor.h" +#include "gmock/gmock.h" +#include "icing/absl_ports/str_cat.h" +#include "icing/testing/common-matchers.h" +#include "icing/testing/icu-i18n-test-utils.h" +#include "icing/tokenization/language-segmenter-factory.h" +#include "icing/tokenization/language-segmenter.h" +#include "unicode/uloc.h" + +namespace icing { +namespace lib { + +namespace test_internal { + +using ::testing::ElementsAre; +using ::testing::Eq; +using ::testing::IsEmpty; + +namespace { + +language_segmenter_factory::SegmenterOptions GetSegmenterOptions( + const std::string& locale, const JniCache* jni_cache) { + return language_segmenter_factory::SegmenterOptions(locale, jni_cache); +} + +// Returns a vector containing all terms retrieved by Advancing on the iterator. +std::vector<std::string_view> GetAllTermsAdvance( + LanguageSegmenter::Iterator* itr) { + std::vector<std::string_view> terms; + while (itr->Advance()) { + terms.push_back(itr->GetTerm()); + } + return terms; +} + +// Returns a vector containing all terms retrieved by calling ResetAfter with +// the current position to simulate Advancing on the iterator. +std::vector<std::string_view> GetAllTermsResetAfter( + LanguageSegmenter::Iterator* itr) { + std::vector<std::string_view> terms; + if (!itr->ResetToStart().ok()) { + return terms; + } + terms.push_back(itr->GetTerm()); + const char* text_begin = itr->GetTerm().data(); + // Calling ResetToTermStartingAfter with the current position should get the + // very next term in the sequence. 
+ for (int current_pos = 0; itr->ResetToTermStartingAfter(current_pos).ok(); + current_pos = itr->GetTerm().data() - text_begin) { + terms.push_back(itr->GetTerm()); + } + return terms; +} + +// Returns a vector containing all terms retrieved by alternating calls to +// Advance and calls to ResetAfter with the current position to simulate +// Advancing. +std::vector<std::string_view> GetAllTermsAdvanceAndResetAfter( + LanguageSegmenter::Iterator* itr) { + const char* text_begin = itr->GetTerm().data(); + std::vector<std::string_view> terms; + + bool is_ok = true; + int current_pos = 0; + while (is_ok) { + // Alternate between using Advance and ResetToTermAfter. + if (terms.size() % 2 == 0) { + is_ok = itr->Advance(); + } else { + // Calling ResetToTermStartingAfter with the current position should get + // the very next term in the sequence. + current_pos = itr->GetTerm().data() - text_begin; + is_ok = itr->ResetToTermStartingAfter(current_pos).ok(); + } + if (is_ok) { + terms.push_back(itr->GetTerm()); + } + } + return terms; +} + +// Returns a vector containing all terms retrieved by calling ResetBefore with +// the current position, starting at the end of the text. This vector should be +// in reverse order of GetAllTerms and missing the last term. +std::vector<std::string_view> GetAllTermsResetBefore( + LanguageSegmenter::Iterator* itr) { + const char* text_begin = itr->GetTerm().data(); + int last_pos = 0; + while (itr->Advance()) { + last_pos = itr->GetTerm().data() - text_begin; + } + std::vector<std::string_view> terms; + // Calling ResetToTermEndingBefore with the current position should get the + // previous term in the sequence. 
+ for (int current_pos = last_pos; + itr->ResetToTermEndingBefore(current_pos).ok(); + current_pos = itr->GetTerm().data() - text_begin) { + terms.push_back(itr->GetTerm()); + } + return terms; +} + +} // namespace + +TEST_P(ReverseJniLanguageSegmenterTest, EmptyText) { + ICING_ASSERT_OK_AND_ASSIGN( + auto language_segmenter, + language_segmenter_factory::Create( + GetSegmenterOptions(GetLocale(), jni_cache_.get()))); + EXPECT_THAT(language_segmenter->GetAllTerms(""), IsOkAndHolds(IsEmpty())); +} + +TEST_P(ReverseJniLanguageSegmenterTest, SimpleText) { + ICING_ASSERT_OK_AND_ASSIGN( + auto language_segmenter, + language_segmenter_factory::Create( + GetSegmenterOptions(GetLocale(), jni_cache_.get()))); + EXPECT_THAT(language_segmenter->GetAllTerms("Hello World"), + IsOkAndHolds(ElementsAre("Hello", " ", "World"))); +} + +TEST_P(ReverseJniLanguageSegmenterTest, ASCII_Punctuation) { + ICING_ASSERT_OK_AND_ASSIGN( + auto language_segmenter, + language_segmenter_factory::Create( + GetSegmenterOptions(GetLocale(), jni_cache_.get()))); + // ASCII punctuation marks are kept + EXPECT_THAT( + language_segmenter->GetAllTerms("Hello, World!!!"), + IsOkAndHolds(ElementsAre("Hello", ",", " ", "World", "!", "!", "!"))); + EXPECT_THAT(language_segmenter->GetAllTerms("Open-source project"), + IsOkAndHolds(ElementsAre("Open", "-", "source", " ", "project"))); + EXPECT_THAT(language_segmenter->GetAllTerms("100%"), + IsOkAndHolds(ElementsAre("100", "%"))); + EXPECT_THAT(language_segmenter->GetAllTerms("A&B"), + IsOkAndHolds(ElementsAre("A", "&", "B"))); +} + +TEST_P(ReverseJniLanguageSegmenterTest, ASCII_SpecialCharacter) { + ICING_ASSERT_OK_AND_ASSIGN( + auto language_segmenter, + language_segmenter_factory::Create( + GetSegmenterOptions(GetLocale(), jni_cache_.get()))); + // ASCII special characters are kept + EXPECT_THAT(language_segmenter->GetAllTerms("Pay $1000"), + IsOkAndHolds(ElementsAre("Pay", " ", "$", "1000"))); + EXPECT_THAT(language_segmenter->GetAllTerms("A+B"), + 
IsOkAndHolds(ElementsAre("A", "+", "B"))); + // 0x0009 is the unicode for tab (within ASCII range). + std::string text_with_tab = absl_ports::StrCat( + "Hello", UCharToString(0x0009), UCharToString(0x0009), "World"); + EXPECT_THAT(language_segmenter->GetAllTerms(text_with_tab), + IsOkAndHolds(ElementsAre("Hello", UCharToString(0x0009), + UCharToString(0x0009), "World"))); +} + +TEST_P(ReverseJniLanguageSegmenterTest, Non_ASCII_Non_Alphabetic) { + ICING_ASSERT_OK_AND_ASSIGN( + auto language_segmenter, + language_segmenter_factory::Create( + GetSegmenterOptions(GetLocale(), jni_cache_.get()))); + // Full-width (non-ASCII) punctuation marks and special characters are left + // out. + EXPECT_THAT(language_segmenter->GetAllTerms("。?·Hello!×"), + IsOkAndHolds(ElementsAre("Hello"))); +} + +TEST_P(ReverseJniLanguageSegmenterTest, Acronym) { + ICING_ASSERT_OK_AND_ASSIGN( + auto language_segmenter, + language_segmenter_factory::Create( + GetSegmenterOptions(GetLocale(), jni_cache_.get()))); + EXPECT_THAT(language_segmenter->GetAllTerms("U.S.𡔖 Bank"), + IsOkAndHolds(ElementsAre("U.S", ".", "𡔖", " ", "Bank"))); + EXPECT_THAT(language_segmenter->GetAllTerms("I.B.M."), + IsOkAndHolds(ElementsAre("I.B.M", "."))); + EXPECT_THAT(language_segmenter->GetAllTerms("I,B,M"), + IsOkAndHolds(ElementsAre("I", ",", "B", ",", "M"))); + EXPECT_THAT(language_segmenter->GetAllTerms("I B M"), + IsOkAndHolds(ElementsAre("I", " ", "B", " ", "M"))); +} + +TEST_P(ReverseJniLanguageSegmenterTest, WordConnector) { + ICING_ASSERT_OK_AND_ASSIGN( + auto language_segmenter, + language_segmenter_factory::Create( + GetSegmenterOptions(GetLocale(), jni_cache_.get()))); + // According to unicode word break rules + // WB6(https://unicode.org/reports/tr29/#WB6), + // WB7(https://unicode.org/reports/tr29/#WB7), and a few others, some + // punctuation characters are used as word connecters. That is, words don't + // break before and after them. Here we just test some that we care about. 
+ + // Word connecters + EXPECT_THAT(language_segmenter->GetAllTerms("com.google.android"), + IsOkAndHolds(ElementsAre("com.google.android"))); + EXPECT_THAT(language_segmenter->GetAllTerms("com:google:android"), + IsOkAndHolds(ElementsAre("com:google:android"))); + EXPECT_THAT(language_segmenter->GetAllTerms("com'google'android"), + IsOkAndHolds(ElementsAre("com'google'android"))); + EXPECT_THAT(language_segmenter->GetAllTerms("com_google_android"), + IsOkAndHolds(ElementsAre("com_google_android"))); + + // Word connecters can be mixed + EXPECT_THAT(language_segmenter->GetAllTerms("com.google.android:icing"), + IsOkAndHolds(ElementsAre("com.google.android:icing"))); + + // Any heading and trailing characters are not connecters + EXPECT_THAT(language_segmenter->GetAllTerms(".com.google.android."), + IsOkAndHolds(ElementsAre(".", "com.google.android", "."))); + + // Not word connecters + EXPECT_THAT(language_segmenter->GetAllTerms("com,google,android"), + IsOkAndHolds(ElementsAre("com", ",", "google", ",", "android"))); + EXPECT_THAT(language_segmenter->GetAllTerms("com-google-android"), + IsOkAndHolds(ElementsAre("com", "-", "google", "-", "android"))); + EXPECT_THAT(language_segmenter->GetAllTerms("com+google+android"), + IsOkAndHolds(ElementsAre("com", "+", "google", "+", "android"))); + EXPECT_THAT(language_segmenter->GetAllTerms("com*google*android"), + IsOkAndHolds(ElementsAre("com", "*", "google", "*", "android"))); + EXPECT_THAT(language_segmenter->GetAllTerms("com@google@android"), + IsOkAndHolds(ElementsAre("com", "@", "google", "@", "android"))); + EXPECT_THAT(language_segmenter->GetAllTerms("com^google^android"), + IsOkAndHolds(ElementsAre("com", "^", "google", "^", "android"))); + EXPECT_THAT(language_segmenter->GetAllTerms("com&google&android"), + IsOkAndHolds(ElementsAre("com", "&", "google", "&", "android"))); + EXPECT_THAT(language_segmenter->GetAllTerms("com|google|android"), + IsOkAndHolds(ElementsAre("com", "|", "google", "|", "android"))); + 
EXPECT_THAT(language_segmenter->GetAllTerms("com/google/android"), + IsOkAndHolds(ElementsAre("com", "/", "google", "/", "android"))); + EXPECT_THAT(language_segmenter->GetAllTerms("com;google;android"), + IsOkAndHolds(ElementsAre("com", ";", "google", ";", "android"))); + EXPECT_THAT( + language_segmenter->GetAllTerms("com\"google\"android"), + IsOkAndHolds(ElementsAre("com", "\"", "google", "\"", "android"))); +} + +TEST_P(ReverseJniLanguageSegmenterTest, Apostrophes) { + ICING_ASSERT_OK_AND_ASSIGN( + auto language_segmenter, + language_segmenter_factory::Create( + GetSegmenterOptions(GetLocale(), jni_cache_.get()))); + EXPECT_THAT(language_segmenter->GetAllTerms("It's ok."), + IsOkAndHolds(ElementsAre("It's", " ", "ok", "."))); + EXPECT_THAT(language_segmenter->GetAllTerms("He'll be back."), + IsOkAndHolds(ElementsAre("He'll", " ", "be", " ", "back", "."))); + EXPECT_THAT(language_segmenter->GetAllTerms("'Hello 'World."), + IsOkAndHolds(ElementsAre("'", "Hello", " ", "'", "World", "."))); + EXPECT_THAT(language_segmenter->GetAllTerms("The dogs' bone"), + IsOkAndHolds(ElementsAre("The", " ", "dogs", "'", " ", "bone"))); + // 0x2019 is the single right quote, should be treated the same as "'" + std::string token_with_quote = + absl_ports::StrCat("He", UCharToString(0x2019), "ll"); + std::string text_with_quote = + absl_ports::StrCat(token_with_quote, " be back."); + EXPECT_THAT( + language_segmenter->GetAllTerms(text_with_quote), + IsOkAndHolds(ElementsAre(token_with_quote, " ", "be", " ", "back", "."))); +} + +TEST_P(ReverseJniLanguageSegmenterTest, Parentheses) { + ICING_ASSERT_OK_AND_ASSIGN( + auto language_segmenter, + language_segmenter_factory::Create( + GetSegmenterOptions(GetLocale(), jni_cache_.get()))); + + EXPECT_THAT(language_segmenter->GetAllTerms("(Hello)"), + IsOkAndHolds(ElementsAre("(", "Hello", ")"))); + + EXPECT_THAT(language_segmenter->GetAllTerms(")Hello("), + IsOkAndHolds(ElementsAre(")", "Hello", "("))); +} + 
+TEST_P(ReverseJniLanguageSegmenterTest, Quotes) { + ICING_ASSERT_OK_AND_ASSIGN( + auto language_segmenter, + language_segmenter_factory::Create( + GetSegmenterOptions(GetLocale(), jni_cache_.get()))); + + EXPECT_THAT(language_segmenter->GetAllTerms("\"Hello\""), + IsOkAndHolds(ElementsAre("\"", "Hello", "\""))); + + EXPECT_THAT(language_segmenter->GetAllTerms("'Hello'"), + IsOkAndHolds(ElementsAre("'", "Hello", "'"))); +} + +TEST_P(ReverseJniLanguageSegmenterTest, Alphanumeric) { + ICING_ASSERT_OK_AND_ASSIGN( + auto language_segmenter, + language_segmenter_factory::Create( + GetSegmenterOptions(GetLocale(), jni_cache_.get()))); + + // Alphanumeric terms are allowed + EXPECT_THAT(language_segmenter->GetAllTerms("Se7en A4 3a"), + IsOkAndHolds(ElementsAre("Se7en", " ", "A4", " ", "3a"))); +} + +TEST_P(ReverseJniLanguageSegmenterTest, Number) { + ICING_ASSERT_OK_AND_ASSIGN( + auto language_segmenter, + language_segmenter_factory::Create( + GetSegmenterOptions(GetLocale(), jni_cache_.get()))); + + // Alphanumeric terms are allowed + EXPECT_THAT( + language_segmenter->GetAllTerms("3.141592653589793238462643383279"), + IsOkAndHolds(ElementsAre("3.141592653589793238462643383279"))); + + EXPECT_THAT(language_segmenter->GetAllTerms("3,456.789"), + IsOkAndHolds(ElementsAre("3,456.789"))); + + EXPECT_THAT(language_segmenter->GetAllTerms("-123"), + IsOkAndHolds(ElementsAre("-", "123"))); +} + +TEST_P(ReverseJniLanguageSegmenterTest, ContinuousWhitespaces) { + ICING_ASSERT_OK_AND_ASSIGN( + auto language_segmenter, + language_segmenter_factory::Create( + GetSegmenterOptions(GetLocale(), jni_cache_.get()))); + // Multiple continuous whitespaces are treated as one. + const int kNumSeparators = 256; + std::string text_with_spaces = + absl_ports::StrCat("Hello", std::string(kNumSeparators, ' '), "World"); + EXPECT_THAT(language_segmenter->GetAllTerms(text_with_spaces), + IsOkAndHolds(ElementsAre("Hello", " ", "World"))); + + // Multiple continuous whitespaces are treated as one. 
Whitespace at the + // beginning of the text doesn't affect the results of GetTerm() after the + // iterator is done. + text_with_spaces = absl_ports::StrCat(std::string(kNumSeparators, ' '), + "Hello", " ", "World"); + ICING_ASSERT_OK_AND_ASSIGN(auto itr, + language_segmenter->Segment(text_with_spaces)); + std::vector<std::string_view> terms; + while (itr->Advance()) { + terms.push_back(itr->GetTerm()); + } + EXPECT_THAT(terms, ElementsAre(" ", "Hello", " ", "World")); + EXPECT_THAT(itr->GetTerm(), IsEmpty()); +} + +TEST_P(ReverseJniLanguageSegmenterTest, CJKT) { + ICING_ASSERT_OK_AND_ASSIGN( + auto language_segmenter, + language_segmenter_factory::Create( + GetSegmenterOptions(GetLocale(), jni_cache_.get()))); + // CJKT (Chinese, Japanese, Khmer, Thai) are the 4 main languages that don't + // have whitespaces as word delimiter. + + // Chinese + EXPECT_THAT(language_segmenter->GetAllTerms("我每天走路去上班。"), + IsOkAndHolds(ElementsAre("我", "每天", "走路", "去", "上班"))); + // Japanese + EXPECT_THAT(language_segmenter->GetAllTerms("私は毎日仕事に歩いています。"), + IsOkAndHolds(ElementsAre("私", "は", "毎日", "仕事", "に", "歩", + "い", "てい", "ます"))); + // Khmer + EXPECT_THAT(language_segmenter->GetAllTerms("ញុំដើរទៅធ្វើការរាល់ថ្ងៃ។"), + IsOkAndHolds(ElementsAre("ញុំ", "ដើរទៅ", "ធ្វើការ", "រាល់ថ្ងៃ"))); + // Thai + EXPECT_THAT( + language_segmenter->GetAllTerms("ฉันเดินไปทำงานทุกวัน"), + IsOkAndHolds(ElementsAre("ฉัน", "เดิน", "ไป", "ทำงาน", "ทุก", "วัน"))); +} + +TEST_P(ReverseJniLanguageSegmenterTest, LatinLettersWithAccents) { + ICING_ASSERT_OK_AND_ASSIGN( + auto language_segmenter, + language_segmenter_factory::Create( + GetSegmenterOptions(GetLocale(), jni_cache_.get()))); + EXPECT_THAT(language_segmenter->GetAllTerms("āăąḃḅḇčćç"), + IsOkAndHolds(ElementsAre("āăąḃḅḇčćç"))); +} + +// TODO(samzheng): test cases for more languages (e.g. 
top 20 in the world) +TEST_P(ReverseJniLanguageSegmenterTest, WhitespaceSplitLanguages) { + ICING_ASSERT_OK_AND_ASSIGN( + auto language_segmenter, + language_segmenter_factory::Create( + GetSegmenterOptions(GetLocale(), jni_cache_.get()))); + // Turkish + EXPECT_THAT(language_segmenter->GetAllTerms("merhaba dünya"), + IsOkAndHolds(ElementsAre("merhaba", " ", "dünya"))); + // Korean + EXPECT_THAT( + language_segmenter->GetAllTerms("나는 매일 출근합니다."), + IsOkAndHolds(ElementsAre("나는", " ", "매일", " ", "출근합니다", "."))); +} + +// TODO(samzheng): more mixed languages test cases +TEST_P(ReverseJniLanguageSegmenterTest, MixedLanguages) { + ICING_ASSERT_OK_AND_ASSIGN( + auto language_segmenter, + language_segmenter_factory::Create( + GetSegmenterOptions(GetLocale(), jni_cache_.get()))); + EXPECT_THAT(language_segmenter->GetAllTerms("How are you你好吗お元気ですか"), + IsOkAndHolds(ElementsAre("How", " ", "are", " ", "you", "你好", + "吗", "お", "元気", "です", "か"))); + + EXPECT_THAT( + language_segmenter->GetAllTerms("나는 California에 산다"), + IsOkAndHolds(ElementsAre("나는", " ", "California", "에", " ", "산다"))); +} + +TEST_P(ReverseJniLanguageSegmenterTest, NotCopyStrings) { + ICING_ASSERT_OK_AND_ASSIGN( + auto language_segmenter, + language_segmenter_factory::Create( + GetSegmenterOptions(GetLocale(), jni_cache_.get()))); + // Validates that the input strings are not copied + const std::string text = "Hello World"; + const char* word1_address = text.c_str(); + const char* word2_address = text.c_str() + 6; + ICING_ASSERT_OK_AND_ASSIGN(std::vector<std::string_view> terms, + language_segmenter->GetAllTerms(text)); + ASSERT_THAT(terms, ElementsAre("Hello", " ", "World")); + const char* word1_result_address = terms.at(0).data(); + const char* word2_result_address = terms.at(2).data(); + + // The underlying char* should be the same + EXPECT_THAT(word1_address, Eq(word1_result_address)); + EXPECT_THAT(word2_address, Eq(word2_result_address)); +} + +TEST_P(ReverseJniLanguageSegmenterTest, 
ResetToTermAfterOutOfBounds) { + ICING_ASSERT_OK_AND_ASSIGN( + auto segmenter, language_segmenter_factory::Create( + GetSegmenterOptions(GetLocale(), jni_cache_.get()))); + constexpr std::string_view kText = "How are you你好吗お元気ですか"; + ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<LanguageSegmenter::Iterator> itr, + segmenter->Segment(kText)); + + // String: "How are you你好吗お元気ですか" + // ^ ^^ ^^ ^ ^ ^ ^ ^ ^ + // Bytes: 0 3 4 7 8 11 172023 29 35 + ASSERT_THAT(itr->ResetToTermStartingAfter(7), IsOkAndHolds(Eq(8))); + ASSERT_THAT(itr->GetTerm(), Eq("you")); + + EXPECT_THAT(itr->ResetToTermStartingAfter(-1), + StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT)); + EXPECT_THAT(itr->GetTerm(), Eq("you")); + + EXPECT_THAT(itr->ResetToTermStartingAfter(kText.length()), + StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT)); + EXPECT_THAT(itr->GetTerm(), Eq("you")); +} + +// Tests that ResetToTermAfter and Advance produce the same output. With the +// exception of the first term which is inacessible via ResetToTermAfter, +// the stream of terms produced by Advance calls should exacly match the +// terms produced by ResetToTermAfter calls with the current position +// provided as the argument. 
+TEST_P(ReverseJniLanguageSegmenterTest, + MixedLanguagesResetToTermAfterEquivalentToAdvance) { + ICING_ASSERT_OK_AND_ASSIGN( + auto segmenter, language_segmenter_factory::Create( + GetSegmenterOptions(GetLocale(), jni_cache_.get()))); + constexpr std::string_view kText = "How are𡔖 you你好吗お元気ですか"; + ICING_ASSERT_OK_AND_ASSIGN( + std::unique_ptr<LanguageSegmenter::Iterator> advance_itr, + segmenter->Segment(kText)); + std::vector<std::string_view> advance_terms = + GetAllTermsAdvance(advance_itr.get()); + + ICING_ASSERT_OK_AND_ASSIGN( + std::unique_ptr<LanguageSegmenter::Iterator> reset_to_term_itr, + segmenter->Segment(kText)); + std::vector<std::string_view> reset_terms = + GetAllTermsResetAfter(reset_to_term_itr.get()); + + EXPECT_THAT(reset_terms, testing::ElementsAreArray(advance_terms)); + EXPECT_THAT(reset_to_term_itr->GetTerm(), Eq(advance_itr->GetTerm())); +} + +TEST_P(ReverseJniLanguageSegmenterTest, + ThaiResetToTermAfterEquivalentToAdvance) { + ICING_ASSERT_OK_AND_ASSIGN( + auto segmenter, language_segmenter_factory::Create( + GetSegmenterOptions(GetLocale(), jni_cache_.get()))); + constexpr std::string_view kThai = "ฉันเดินไปทำงานทุกวัน"; + ICING_ASSERT_OK_AND_ASSIGN( + std::unique_ptr<LanguageSegmenter::Iterator> advance_itr, + segmenter->Segment(kThai)); + std::vector<std::string_view> advance_terms = + GetAllTermsAdvance(advance_itr.get()); + + ICING_ASSERT_OK_AND_ASSIGN( + std::unique_ptr<LanguageSegmenter::Iterator> reset_to_term_itr, + segmenter->Segment(kThai)); + std::vector<std::string_view> reset_terms = + GetAllTermsResetAfter(reset_to_term_itr.get()); + + EXPECT_THAT(reset_terms, testing::ElementsAreArray(advance_terms)); + EXPECT_THAT(reset_to_term_itr->GetTerm(), Eq(advance_itr->GetTerm())); +} + +TEST_P(ReverseJniLanguageSegmenterTest, + KoreanResetToTermAfterEquivalentToAdvance) { + ICING_ASSERT_OK_AND_ASSIGN( + auto segmenter, language_segmenter_factory::Create( + GetSegmenterOptions(GetLocale(), jni_cache_.get()))); + constexpr 
std::string_view kKorean = "나는 매일 출근합니다."; + ICING_ASSERT_OK_AND_ASSIGN( + std::unique_ptr<LanguageSegmenter::Iterator> advance_itr, + segmenter->Segment(kKorean)); + std::vector<std::string_view> advance_terms = + GetAllTermsAdvance(advance_itr.get()); + + ICING_ASSERT_OK_AND_ASSIGN( + std::unique_ptr<LanguageSegmenter::Iterator> reset_to_term_itr, + segmenter->Segment(kKorean)); + std::vector<std::string_view> reset_terms = + GetAllTermsResetAfter(reset_to_term_itr.get()); + + EXPECT_THAT(reset_terms, testing::ElementsAreArray(advance_terms)); + EXPECT_THAT(reset_to_term_itr->GetTerm(), Eq(advance_itr->GetTerm())); +} + +// Tests that ResetToTermAfter and Advance can be used in conjunction. Just as +// ResetToTermAfter(current_position) can be used to simulate Advance, users +// should be able to mix ResetToTermAfter(current_position) calls and Advance +// calls to mimic calling Advance. +TEST_P(ReverseJniLanguageSegmenterTest, + MixedLanguagesResetToTermAfterInteroperableWithAdvance) { + ICING_ASSERT_OK_AND_ASSIGN( + auto segmenter, language_segmenter_factory::Create( + GetSegmenterOptions(GetLocale(), jni_cache_.get()))); + constexpr std::string_view kText = "How are𡔖 you你好吗お元気ですか"; + ICING_ASSERT_OK_AND_ASSIGN( + std::unique_ptr<LanguageSegmenter::Iterator> advance_itr, + segmenter->Segment(kText)); + std::vector<std::string_view> advance_terms = + GetAllTermsAdvance(advance_itr.get()); + + ICING_ASSERT_OK_AND_ASSIGN( + std::unique_ptr<LanguageSegmenter::Iterator> advance_and_reset_itr, + segmenter->Segment(kText)); + std::vector<std::string_view> advance_and_reset_terms = + GetAllTermsAdvanceAndResetAfter(advance_and_reset_itr.get()); + + EXPECT_THAT(advance_and_reset_terms, + testing::ElementsAreArray(advance_terms)); + EXPECT_THAT(advance_and_reset_itr->GetTerm(), Eq(advance_itr->GetTerm())); +} + +TEST_P(ReverseJniLanguageSegmenterTest, + ThaiResetToTermAfterInteroperableWithAdvance) { + ICING_ASSERT_OK_AND_ASSIGN( + auto segmenter, 
language_segmenter_factory::Create( + GetSegmenterOptions(GetLocale(), jni_cache_.get()))); + constexpr std::string_view kThai = "ฉันเดินไปทำงานทุกวัน"; + ICING_ASSERT_OK_AND_ASSIGN( + std::unique_ptr<LanguageSegmenter::Iterator> advance_itr, + segmenter->Segment(kThai)); + std::vector<std::string_view> advance_terms = + GetAllTermsAdvance(advance_itr.get()); + + ICING_ASSERT_OK_AND_ASSIGN( + std::unique_ptr<LanguageSegmenter::Iterator> advance_and_reset_itr, + segmenter->Segment(kThai)); + std::vector<std::string_view> advance_and_reset_terms = + GetAllTermsAdvanceAndResetAfter(advance_and_reset_itr.get()); + + EXPECT_THAT(advance_and_reset_terms, + testing::ElementsAreArray(advance_terms)); + EXPECT_THAT(advance_and_reset_itr->GetTerm(), Eq(advance_itr->GetTerm())); +} + +TEST_P(ReverseJniLanguageSegmenterTest, + KoreanResetToTermAfterInteroperableWithAdvance) { + ICING_ASSERT_OK_AND_ASSIGN( + auto segmenter, language_segmenter_factory::Create( + GetSegmenterOptions(GetLocale(), jni_cache_.get()))); + constexpr std::string_view kKorean = "나는 매일 출근합니다."; + ICING_ASSERT_OK_AND_ASSIGN( + std::unique_ptr<LanguageSegmenter::Iterator> advance_itr, + segmenter->Segment(kKorean)); + std::vector<std::string_view> advance_terms = + GetAllTermsAdvance(advance_itr.get()); + + ICING_ASSERT_OK_AND_ASSIGN( + std::unique_ptr<LanguageSegmenter::Iterator> advance_and_reset_itr, + segmenter->Segment(kKorean)); + std::vector<std::string_view> advance_and_reset_terms = + GetAllTermsAdvanceAndResetAfter(advance_and_reset_itr.get()); + + EXPECT_THAT(advance_and_reset_terms, + testing::ElementsAreArray(advance_terms)); + EXPECT_THAT(advance_and_reset_itr->GetTerm(), Eq(advance_itr->GetTerm())); +} + +TEST_P(ReverseJniLanguageSegmenterTest, MixedLanguagesResetToTermAfter) { + ICING_ASSERT_OK_AND_ASSIGN( + auto language_segmenter, + language_segmenter_factory::Create( + GetSegmenterOptions(GetLocale(), jni_cache_.get()))); + ICING_ASSERT_OK_AND_ASSIGN( + 
std::unique_ptr<LanguageSegmenter::Iterator> itr, + language_segmenter->Segment("How are you你好吗お元気ですか")); + + // String: "How are you你好吗お元気ですか" + // ^ ^^ ^^ ^ ^ ^ ^ ^ ^ + // Bytes: 0 3 4 7 8 11 172023 29 35 + EXPECT_THAT(itr->ResetToTermStartingAfter(2), IsOkAndHolds(Eq(3))); + EXPECT_THAT(itr->GetTerm(), Eq(" ")); + + EXPECT_THAT(itr->ResetToTermStartingAfter(10), IsOkAndHolds(Eq(11))); + EXPECT_THAT(itr->GetTerm(), Eq("你好")); + + EXPECT_THAT(itr->ResetToTermStartingAfter(7), IsOkAndHolds(Eq(8))); + EXPECT_THAT(itr->GetTerm(), Eq("you")); + + EXPECT_THAT(itr->ResetToTermStartingAfter(32), IsOkAndHolds(Eq(35))); + EXPECT_THAT(itr->GetTerm(), Eq("か")); + + EXPECT_THAT(itr->ResetToTermStartingAfter(14), IsOkAndHolds(Eq(17))); + EXPECT_THAT(itr->GetTerm(), Eq("吗")); + + EXPECT_THAT(itr->ResetToTermStartingAfter(0), IsOkAndHolds(Eq(3))); + EXPECT_THAT(itr->GetTerm(), Eq(" ")); + + EXPECT_THAT(itr->ResetToTermStartingAfter(35), + StatusIs(libtextclassifier3::StatusCode::NOT_FOUND)); + EXPECT_THAT(itr->GetTerm(), IsEmpty()); +} + +TEST_P(ReverseJniLanguageSegmenterTest, ContinuousWhitespacesResetToTermAfter) { + ICING_ASSERT_OK_AND_ASSIGN( + auto language_segmenter, + language_segmenter_factory::Create( + GetSegmenterOptions(GetLocale(), jni_cache_.get()))); + // Multiple continuous whitespaces are treated as one. 
+ constexpr std::string_view kTextWithSpace = "Hello World"; + ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<LanguageSegmenter::Iterator> itr, + language_segmenter->Segment(kTextWithSpace)); + + // String: "Hello World" + // ^ ^ ^ + // Bytes: 0 5 15 + EXPECT_THAT(itr->ResetToTermStartingAfter(0), IsOkAndHolds(Eq(5))); + EXPECT_THAT(itr->GetTerm(), Eq(" ")); + + EXPECT_THAT(itr->ResetToTermStartingAfter(2), IsOkAndHolds(Eq(5))); + EXPECT_THAT(itr->GetTerm(), Eq(" ")); + + EXPECT_THAT(itr->ResetToTermStartingAfter(10), IsOkAndHolds(Eq(15))); + EXPECT_THAT(itr->GetTerm(), Eq("World")); + + EXPECT_THAT(itr->ResetToTermStartingAfter(5), IsOkAndHolds(Eq(15))); + EXPECT_THAT(itr->GetTerm(), Eq("World")); + + EXPECT_THAT(itr->ResetToTermStartingAfter(15), + StatusIs(libtextclassifier3::StatusCode::NOT_FOUND)); + EXPECT_THAT(itr->GetTerm(), IsEmpty()); + + EXPECT_THAT(itr->ResetToTermStartingAfter(17), + StatusIs(libtextclassifier3::StatusCode::NOT_FOUND)); + EXPECT_THAT(itr->GetTerm(), IsEmpty()); + + EXPECT_THAT(itr->ResetToTermStartingAfter(19), + StatusIs(libtextclassifier3::StatusCode::NOT_FOUND)); + EXPECT_THAT(itr->GetTerm(), IsEmpty()); +} + +TEST_P(ReverseJniLanguageSegmenterTest, ChineseResetToTermAfter) { + ICING_ASSERT_OK_AND_ASSIGN( + auto language_segmenter, + language_segmenter_factory::Create( + GetSegmenterOptions(GetLocale(), jni_cache_.get()))); + // CJKT (Chinese, Japanese, Khmer, Thai) are the 4 main languages that + // don't have whitespaces as word delimiter. 
Chinese + constexpr std::string_view kChinese = "我每天走路去上班。"; + ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<LanguageSegmenter::Iterator> itr, + language_segmenter->Segment(kChinese)); + // String: "我每天走路去上班。" + // ^ ^ ^ ^^ + // Bytes: 0 3 9 15 18 + EXPECT_THAT(itr->ResetToTermStartingAfter(0), IsOkAndHolds(Eq(3))); + EXPECT_THAT(itr->GetTerm(), Eq("每天")); + + EXPECT_THAT(itr->ResetToTermStartingAfter(7), IsOkAndHolds(Eq(9))); + EXPECT_THAT(itr->GetTerm(), Eq("走路")); + + EXPECT_THAT(itr->ResetToTermStartingAfter(19), + StatusIs(libtextclassifier3::StatusCode::NOT_FOUND)); + EXPECT_THAT(itr->GetTerm(), IsEmpty()); +} + +TEST_P(ReverseJniLanguageSegmenterTest, JapaneseResetToTermAfter) { + ICING_ASSERT_OK_AND_ASSIGN( + auto language_segmenter, + language_segmenter_factory::Create( + GetSegmenterOptions(GetLocale(), jni_cache_.get()))); + // Japanese + constexpr std::string_view kJapanese = "私は毎日仕事に歩いています。"; + ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<LanguageSegmenter::Iterator> itr, + language_segmenter->Segment(kJapanese)); + // String: "私は毎日仕事に歩いています。" + // ^ ^ ^ ^ ^ ^ ^ ^ ^ + // Bytes: 0 3 6 12 18212427 33 + EXPECT_THAT(itr->ResetToTermStartingAfter(0), IsOkAndHolds(Eq(3))); + EXPECT_THAT(itr->GetTerm(), Eq("は")); + + EXPECT_THAT(itr->ResetToTermStartingAfter(33), + StatusIs(libtextclassifier3::StatusCode::NOT_FOUND)); + EXPECT_THAT(itr->GetTerm(), IsEmpty()); + + EXPECT_THAT(itr->ResetToTermStartingAfter(7), IsOkAndHolds(Eq(12))); + EXPECT_THAT(itr->GetTerm(), Eq("仕事")); +} + +TEST_P(ReverseJniLanguageSegmenterTest, KhmerResetToTermAfter) { + ICING_ASSERT_OK_AND_ASSIGN( + auto language_segmenter, + language_segmenter_factory::Create( + GetSegmenterOptions(GetLocale(), jni_cache_.get()))); + constexpr std::string_view kKhmer = "ញុំដើរទៅធ្វើការរាល់ថ្ងៃ។"; + ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<LanguageSegmenter::Iterator> itr, + language_segmenter->Segment(kKhmer)); + // String: "ញុំដើរទៅធ្វើការរាល់ថ្ងៃ។" + // ^ ^ ^ ^ + // Bytes: 0 9 24 45 + 
EXPECT_THAT(itr->ResetToTermStartingAfter(0), IsOkAndHolds(Eq(9))); + EXPECT_THAT(itr->GetTerm(), Eq("ដើរទៅ")); + + EXPECT_THAT(itr->ResetToTermStartingAfter(47), + StatusIs(libtextclassifier3::StatusCode::NOT_FOUND)); + EXPECT_THAT(itr->GetTerm(), IsEmpty()); + + EXPECT_THAT(itr->ResetToTermStartingAfter(14), IsOkAndHolds(Eq(24))); + EXPECT_THAT(itr->GetTerm(), Eq("ធ្វើការ")); +} + +TEST_P(ReverseJniLanguageSegmenterTest, ThaiResetToTermAfter) { + ICING_ASSERT_OK_AND_ASSIGN( + auto language_segmenter, + language_segmenter_factory::Create( + GetSegmenterOptions(GetLocale(), jni_cache_.get()))); + // Thai + constexpr std::string_view kThai = "ฉันเดินไปทำงานทุกวัน"; + ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<LanguageSegmenter::Iterator> itr, + language_segmenter->Segment(kThai)); + // String: "ฉันเดินไปทำงานทุกวัน" + // ^ ^ ^ ^ ^ ^ + // Bytes: 0 9 21 27 42 51 + EXPECT_THAT(itr->ResetToTermStartingAfter(0), IsOkAndHolds(Eq(9))); + EXPECT_THAT(itr->GetTerm(), Eq("เดิน")); + + EXPECT_THAT(itr->ResetToTermStartingAfter(51), + StatusIs(libtextclassifier3::StatusCode::NOT_FOUND)); + EXPECT_THAT(itr->GetTerm(), IsEmpty()); + + EXPECT_THAT(itr->ResetToTermStartingAfter(13), IsOkAndHolds(Eq(21))); + EXPECT_THAT(itr->GetTerm(), Eq("ไป")); + + EXPECT_THAT(itr->ResetToTermStartingAfter(34), IsOkAndHolds(Eq(42))); + EXPECT_THAT(itr->GetTerm(), Eq("ทุก")); +} + +TEST_P(ReverseJniLanguageSegmenterTest, ResetToTermBeforeOutOfBounds) { + ICING_ASSERT_OK_AND_ASSIGN( + auto segmenter, language_segmenter_factory::Create( + GetSegmenterOptions(GetLocale(), jni_cache_.get()))); + constexpr std::string_view kText = "How are you你好吗お元気ですか"; + ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<LanguageSegmenter::Iterator> itr, + segmenter->Segment(kText)); + + // String: "How are you你好吗お元気ですか" + // ^ ^^ ^^ ^ ^ ^ ^ ^ ^ + // Bytes: 0 3 4 7 8 11 172023 29 35 + ASSERT_THAT(itr->ResetToTermEndingBefore(7), IsOkAndHolds(Eq(4))); + ASSERT_THAT(itr->GetTerm(), Eq("are")); + + 
EXPECT_THAT(itr->ResetToTermEndingBefore(-1), + StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT)); + EXPECT_THAT(itr->GetTerm(), Eq("are")); + + EXPECT_THAT(itr->ResetToTermEndingBefore(kText.length()), + StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT)); + EXPECT_THAT(itr->GetTerm(), Eq("are")); +} + +// Tests that ResetToTermBefore and Advance produce the same output. With the +// exception of the last term which is inacessible via ResetToTermBefore, +// the stream of terms produced by Advance calls should exacly match the +// terms produced by ResetToTermBefore calls with the current position +// provided as the argument (after their order has been reversed). +TEST_P(ReverseJniLanguageSegmenterTest, + MixedLanguagesResetToTermBeforeEquivalentToAdvance) { + ICING_ASSERT_OK_AND_ASSIGN( + auto segmenter, language_segmenter_factory::Create( + GetSegmenterOptions(GetLocale(), jni_cache_.get()))); + constexpr std::string_view kText = "How are𡔖 you你好吗お元気ですか"; + ICING_ASSERT_OK_AND_ASSIGN( + std::unique_ptr<LanguageSegmenter::Iterator> advance_itr, + segmenter->Segment(kText)); + std::vector<std::string_view> advance_terms = + GetAllTermsAdvance(advance_itr.get()); + // Can't produce the last term via calls to ResetToTermBefore. So skip + // past that one. 
+ auto itr = advance_terms.begin(); + std::advance(itr, advance_terms.size() - 1); + advance_terms.erase(itr); + + ICING_ASSERT_OK_AND_ASSIGN( + std::unique_ptr<LanguageSegmenter::Iterator> reset_to_term_itr, + segmenter->Segment(kText)); + std::vector<std::string_view> reset_terms = + GetAllTermsResetBefore(reset_to_term_itr.get()); + std::reverse(reset_terms.begin(), reset_terms.end()); + + EXPECT_THAT(reset_terms, testing::ElementsAreArray(advance_terms)); + EXPECT_THAT(reset_to_term_itr->GetTerm(), IsEmpty()); + EXPECT_THAT(advance_itr->GetTerm(), IsEmpty()); +} + +TEST_P(ReverseJniLanguageSegmenterTest, + ThaiResetToTermBeforeEquivalentToAdvance) { + ICING_ASSERT_OK_AND_ASSIGN( + auto segmenter, language_segmenter_factory::Create( + GetSegmenterOptions(GetLocale(), jni_cache_.get()))); + constexpr std::string_view kThai = "ฉันเดินไปทำงานทุกวัน"; + ICING_ASSERT_OK_AND_ASSIGN( + std::unique_ptr<LanguageSegmenter::Iterator> advance_itr, + segmenter->Segment(kThai)); + std::vector<std::string_view> advance_terms = + GetAllTermsAdvance(advance_itr.get()); + // Can't produce the last term via calls to ResetToTermBefore. So skip + // past that one. 
+ auto itr = advance_terms.begin(); + std::advance(itr, advance_terms.size() - 1); + advance_terms.erase(itr); + + ICING_ASSERT_OK_AND_ASSIGN( + std::unique_ptr<LanguageSegmenter::Iterator> reset_to_term_itr, + segmenter->Segment(kThai)); + std::vector<std::string_view> reset_terms = + GetAllTermsResetBefore(reset_to_term_itr.get()); + std::reverse(reset_terms.begin(), reset_terms.end()); + + EXPECT_THAT(reset_terms, testing::ElementsAreArray(advance_terms)); + EXPECT_THAT(reset_to_term_itr->GetTerm(), Eq(advance_itr->GetTerm())); +} + +TEST_P(ReverseJniLanguageSegmenterTest, + KoreanResetToTermBeforeEquivalentToAdvance) { + ICING_ASSERT_OK_AND_ASSIGN( + auto segmenter, language_segmenter_factory::Create( + GetSegmenterOptions(GetLocale(), jni_cache_.get()))); + constexpr std::string_view kKorean = "나는 매일 출근합니다."; + ICING_ASSERT_OK_AND_ASSIGN( + std::unique_ptr<LanguageSegmenter::Iterator> advance_itr, + segmenter->Segment(kKorean)); + std::vector<std::string_view> advance_terms = + GetAllTermsAdvance(advance_itr.get()); + // Can't produce the last term via calls to ResetToTermBefore. So skip + // past that one. 
+ auto itr = advance_terms.begin(); + std::advance(itr, advance_terms.size() - 1); + advance_terms.erase(itr); + + ICING_ASSERT_OK_AND_ASSIGN( + std::unique_ptr<LanguageSegmenter::Iterator> reset_to_term_itr, + segmenter->Segment(kKorean)); + std::vector<std::string_view> reset_terms = + GetAllTermsResetBefore(reset_to_term_itr.get()); + std::reverse(reset_terms.begin(), reset_terms.end()); + + EXPECT_THAT(reset_terms, testing::ElementsAreArray(advance_terms)); + EXPECT_THAT(reset_to_term_itr->GetTerm(), Eq(advance_itr->GetTerm())); +} + +TEST_P(ReverseJniLanguageSegmenterTest, MixedLanguagesResetToTermBefore) { + ICING_ASSERT_OK_AND_ASSIGN( + auto language_segmenter, + language_segmenter_factory::Create( + GetSegmenterOptions(GetLocale(), jni_cache_.get()))); + ICING_ASSERT_OK_AND_ASSIGN( + std::unique_ptr<LanguageSegmenter::Iterator> itr, + language_segmenter->Segment("How are you你好吗お元気ですか")); + + // String: "How are you你好吗お元気ですか" + // ^ ^^ ^^ ^ ^ ^ ^ ^ ^ + // Bytes: 0 3 4 7 8 11 172023 29 35 + EXPECT_THAT(itr->ResetToTermEndingBefore(2), + StatusIs(libtextclassifier3::StatusCode::NOT_FOUND)); + EXPECT_THAT(itr->GetTerm(), IsEmpty()); + + EXPECT_THAT(itr->ResetToTermEndingBefore(10), IsOkAndHolds(Eq(7))); + EXPECT_THAT(itr->GetTerm(), Eq(" ")); + + EXPECT_THAT(itr->ResetToTermEndingBefore(7), IsOkAndHolds(Eq(4))); + EXPECT_THAT(itr->GetTerm(), Eq("are")); + + EXPECT_THAT(itr->ResetToTermEndingBefore(32), IsOkAndHolds(Eq(23))); + EXPECT_THAT(itr->GetTerm(), Eq("元気")); + + EXPECT_THAT(itr->ResetToTermEndingBefore(14), IsOkAndHolds(Eq(8))); + EXPECT_THAT(itr->GetTerm(), Eq("you")); + + EXPECT_THAT(itr->ResetToTermEndingBefore(0), + StatusIs(libtextclassifier3::StatusCode::NOT_FOUND)); + EXPECT_THAT(itr->GetTerm(), IsEmpty()); + + EXPECT_THAT(itr->ResetToTermEndingBefore(35), IsOkAndHolds(Eq(29))); + EXPECT_THAT(itr->GetTerm(), Eq("です")); +} + +TEST_P(ReverseJniLanguageSegmenterTest, + ContinuousWhitespacesResetToTermBefore) { + ICING_ASSERT_OK_AND_ASSIGN( + auto 
language_segmenter, + language_segmenter_factory::Create( + GetSegmenterOptions(GetLocale(), jni_cache_.get()))); + // Multiple continuous whitespaces are treated as one. + constexpr std::string_view kTextWithSpace = "Hello World"; + ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<LanguageSegmenter::Iterator> itr, + language_segmenter->Segment(kTextWithSpace)); + + // String: "Hello World" + // ^ ^ ^ + // Bytes: 0 5 15 + EXPECT_THAT(itr->ResetToTermEndingBefore(0), + StatusIs(libtextclassifier3::StatusCode::NOT_FOUND)); + EXPECT_THAT(itr->GetTerm(), IsEmpty()); + + EXPECT_THAT(itr->ResetToTermEndingBefore(2), + StatusIs(libtextclassifier3::StatusCode::NOT_FOUND)); + EXPECT_THAT(itr->GetTerm(), IsEmpty()); + + EXPECT_THAT(itr->ResetToTermEndingBefore(10), IsOkAndHolds(Eq(0))); + EXPECT_THAT(itr->GetTerm(), Eq("Hello")); + + EXPECT_THAT(itr->ResetToTermEndingBefore(5), IsOkAndHolds(Eq(0))); + EXPECT_THAT(itr->GetTerm(), Eq("Hello")); + + EXPECT_THAT(itr->ResetToTermEndingBefore(15), IsOkAndHolds(Eq(5))); + EXPECT_THAT(itr->GetTerm(), Eq(" ")); + + EXPECT_THAT(itr->ResetToTermEndingBefore(17), IsOkAndHolds(Eq(5))); + EXPECT_THAT(itr->GetTerm(), Eq(" ")); + + EXPECT_THAT(itr->ResetToTermEndingBefore(19), IsOkAndHolds(Eq(5))); + EXPECT_THAT(itr->GetTerm(), Eq(" ")); +} + +TEST_P(ReverseJniLanguageSegmenterTest, ChineseResetToTermBefore) { + ICING_ASSERT_OK_AND_ASSIGN( + auto language_segmenter, + language_segmenter_factory::Create( + GetSegmenterOptions(GetLocale(), jni_cache_.get()))); + // CJKT (Chinese, Japanese, Khmer, Thai) are the 4 main languages that + // don't have whitespaces as word delimiter. 
Chinese + constexpr std::string_view kChinese = "我每天走路去上班。"; + ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<LanguageSegmenter::Iterator> itr, + language_segmenter->Segment(kChinese)); + // String: "我每天走路去上班。" + // ^ ^ ^ ^^ + // Bytes: 0 3 9 15 18 + EXPECT_THAT(itr->ResetToTermEndingBefore(0), + StatusIs(libtextclassifier3::StatusCode::NOT_FOUND)); + EXPECT_THAT(itr->GetTerm(), IsEmpty()); + + EXPECT_THAT(itr->ResetToTermEndingBefore(7), IsOkAndHolds(Eq(0))); + EXPECT_THAT(itr->GetTerm(), Eq("我")); + + EXPECT_THAT(itr->ResetToTermEndingBefore(19), IsOkAndHolds(Eq(15))); + EXPECT_THAT(itr->GetTerm(), Eq("去")); +} + +TEST_P(ReverseJniLanguageSegmenterTest, JapaneseResetToTermBefore) { + ICING_ASSERT_OK_AND_ASSIGN( + auto language_segmenter, + language_segmenter_factory::Create( + GetSegmenterOptions(GetLocale(), jni_cache_.get()))); + // Japanese + constexpr std::string_view kJapanese = "私は毎日仕事に歩いています。"; + ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<LanguageSegmenter::Iterator> itr, + language_segmenter->Segment(kJapanese)); + // String: "私は毎日仕事に歩いています。" + // ^ ^ ^ ^ ^ ^ ^ ^ ^ + // Bytes: 0 3 6 12 18212427 33 + EXPECT_THAT(itr->ResetToTermEndingBefore(0), + StatusIs(libtextclassifier3::StatusCode::NOT_FOUND)); + EXPECT_THAT(itr->GetTerm(), IsEmpty()); + + EXPECT_THAT(itr->ResetToTermEndingBefore(33), IsOkAndHolds(Eq(27))); + EXPECT_THAT(itr->GetTerm(), Eq("てい")); + + EXPECT_THAT(itr->ResetToTermEndingBefore(7), IsOkAndHolds(Eq(3))); + EXPECT_THAT(itr->GetTerm(), Eq("は")); +} + +TEST_P(ReverseJniLanguageSegmenterTest, KhmerResetToTermBefore) { + ICING_ASSERT_OK_AND_ASSIGN( + auto language_segmenter, + language_segmenter_factory::Create( + GetSegmenterOptions(GetLocale(), jni_cache_.get()))); + constexpr std::string_view kKhmer = "ញុំដើរទៅធ្វើការរាល់ថ្ងៃ។"; + ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<LanguageSegmenter::Iterator> itr, + language_segmenter->Segment(kKhmer)); + // String: "ញុំដើរទៅធ្វើការរាល់ថ្ងៃ។" + // ^ ^ ^ ^ + // Bytes: 0 9 24 45 + 
EXPECT_THAT(itr->ResetToTermEndingBefore(0), + StatusIs(libtextclassifier3::StatusCode::NOT_FOUND)); + EXPECT_THAT(itr->GetTerm(), IsEmpty()); + + EXPECT_THAT(itr->ResetToTermEndingBefore(47), IsOkAndHolds(Eq(24))); + EXPECT_THAT(itr->GetTerm(), Eq("ធ្វើការ")); + + EXPECT_THAT(itr->ResetToTermEndingBefore(14), IsOkAndHolds(Eq(0))); + EXPECT_THAT(itr->GetTerm(), Eq("ញុំ")); +} + +TEST_P(ReverseJniLanguageSegmenterTest, ThaiResetToTermBefore) { + ICING_ASSERT_OK_AND_ASSIGN( + auto language_segmenter, + language_segmenter_factory::Create( + GetSegmenterOptions(GetLocale(), jni_cache_.get()))); + // Thai + constexpr std::string_view kThai = "ฉันเดินไปทำงานทุกวัน"; + ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<LanguageSegmenter::Iterator> itr, + language_segmenter->Segment(kThai)); + // String: "ฉันเดินไปทำงานทุกวัน" + // ^ ^ ^ ^ ^ ^ + // Bytes: 0 9 21 27 42 51 + EXPECT_THAT(itr->ResetToTermEndingBefore(0), + StatusIs(libtextclassifier3::StatusCode::NOT_FOUND)); + EXPECT_THAT(itr->GetTerm(), IsEmpty()); + + EXPECT_THAT(itr->ResetToTermEndingBefore(51), IsOkAndHolds(Eq(42))); + EXPECT_THAT(itr->GetTerm(), Eq("ทุก")); + + EXPECT_THAT(itr->ResetToTermEndingBefore(13), IsOkAndHolds(Eq(0))); + EXPECT_THAT(itr->GetTerm(), Eq("ฉัน")); + + EXPECT_THAT(itr->ResetToTermEndingBefore(34), IsOkAndHolds(Eq(21))); + EXPECT_THAT(itr->GetTerm(), Eq("ไป")); +} + +INSTANTIATE_TEST_SUITE_P( + LocaleName, ReverseJniLanguageSegmenterTest, + testing::Values(ULOC_US, ULOC_UK, ULOC_CANADA, ULOC_CANADA_FRENCH, + ULOC_FRANCE, ULOC_GERMANY, ULOC_ITALY, ULOC_JAPAN, + ULOC_KOREA, ULOC_SIMPLIFIED_CHINESE, + ULOC_TRADITIONAL_CHINESE, + "es_ES", // Spanish + "hi_IN", // Hindi + "th_TH", // Thai + "lo_LA", // Lao + "km_KH", // Khmer + "ar_DZ", // Arabic + "ru_RU", // Russian + "pt_PT", // Portuguese + "en_US_POSIX" // American English (Computer) + "wrong_locale" // Will fall back to ICU default locale + "" // Will fall back to ICU default locale + )); + +} // namespace test_internal + +} // namespace lib 
+} // namespace icing diff --git a/icing/tokenization/reverse_jni/reverse-jni-language-segmenter-test.h b/icing/tokenization/reverse_jni/reverse-jni-language-segmenter-test.h new file mode 100644 index 0000000..64b68ec --- /dev/null +++ b/icing/tokenization/reverse_jni/reverse-jni-language-segmenter-test.h @@ -0,0 +1,46 @@ +// Copyright (C) 2019 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef ICING_TOKENIZATION_REVERSE_JNI_REVERSE_JNI_LANGUAGE_SEGMENTER_TEST_H_ +#define ICING_TOKENIZATION_REVERSE_JNI_REVERSE_JNI_LANGUAGE_SEGMENTER_TEST_H_ + +#include <jni.h> + +#include "icing/jni/jni-cache.h" +#include "gtest/gtest.h" + +extern JNIEnv* g_jenv; + +namespace icing { +namespace lib { + +namespace test_internal { + +class ReverseJniLanguageSegmenterTest + : public testing::TestWithParam<const char*> { + protected: + ReverseJniLanguageSegmenterTest() + : jni_cache_(std::move(JniCache::Create(g_jenv)).ValueOrDie()) {} + + static std::string GetLocale() { return GetParam(); } + + std::unique_ptr<JniCache> jni_cache_; +}; + +} // namespace test_internal + +} // namespace lib +} // namespace icing + +#endif // ICING_TOKENIZATION_REVERSE_JNI_REVERSE_JNI_LANGUAGE_SEGMENTER_TEST_H_ diff --git a/icing/tokenization/reverse_jni/reverse-jni-language-segmenter.cc b/icing/tokenization/reverse_jni/reverse-jni-language-segmenter.cc new file mode 100644 index 0000000..2256022 --- /dev/null +++ 
b/icing/tokenization/reverse_jni/reverse-jni-language-segmenter.cc @@ -0,0 +1,452 @@ +// Copyright (C) 2019 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "icing/tokenization/reverse_jni/reverse-jni-language-segmenter.h" + +#include <cctype> +#include <memory> +#include <string> +#include <string_view> + +#include "icing/jni/reverse-jni-break-iterator.h" +#include "icing/text_classifier/lib3/utils/base/statusor.h" +#include "icing/absl_ports/canonical_errors.h" +#include "icing/legacy/core/icing-string-util.h" +#include "icing/tokenization/language-segmenter.h" +#include "icing/util/i18n-utils.h" +#include "icing/util/status-macros.h" + +namespace icing { +namespace lib { + +namespace { + +// Returns the lead byte of the UTF-8 character that includes the byte at +// current_byte_index within it. +int GetUTF8StartPosition(std::string_view text, int current_byte_index) { + while (!i18n_utils::IsLeadUtf8Byte(text[current_byte_index])) { + --current_byte_index; + } + return current_byte_index; +} + +class CharacterIterator { + public: + explicit CharacterIterator(std::string_view text) + : CharacterIterator(text, 0, 0) {} + CharacterIterator(std::string_view text, int utf8_index, int utf16_index) + : text_(text), utf8_index_(utf8_index), utf16_index_(utf16_index) {} + + // Moves from current position to the character that includes the specified + // UTF-8 index. 
+ // REQUIRES: desired_utf8_index <= text_.length() + // desired_utf8_index is allowed to point one index past the end, but no + // further. + bool AdvanceToUtf8(int desired_utf8_index) { + if (desired_utf8_index > text_.length()) { + // Enforce the requirement. + return false; + } + // Need to work forwards. + while (utf8_index_ < desired_utf8_index) { + UChar32 uchar32 = + i18n_utils::GetUChar32At(text_.data(), text_.length(), utf8_index_); + if (uchar32 == i18n_utils::kInvalidUChar32) { + // Unable to retrieve a valid UTF-32 character at the previous position. + return false; + } + int utf8_length = i18n_utils::GetUtf8Length(uchar32); + if (utf8_index_ + utf8_length > desired_utf8_index) { + // Ah! Don't go too far! + break; + } + utf8_index_ += utf8_length; + utf16_index_ += i18n_utils::GetUtf16Length(uchar32); + } + return true; + } + + // Moves from current position to the character that includes the specified + // UTF-8 index. + // REQUIRES: 0 <= desired_utf8_index + bool RewindToUtf8(int desired_utf8_index) { + if (desired_utf8_index < 0) { + // Enforce the requirement. + return false; + } + // Need to work backwards. + while (utf8_index_ > desired_utf8_index) { + --utf8_index_; + utf8_index_ = GetUTF8StartPosition(text_, utf8_index_); + if (utf8_index_ < 0) { + // Somehow, there wasn't a single UTF-8 lead byte at + // requested_byte_index or an earlier byte. + return false; + } + // We've found the start of a unicode char! + UChar32 uchar32 = + i18n_utils::GetUChar32At(text_.data(), text_.length(), utf8_index_); + if (uchar32 == i18n_utils::kInvalidUChar32) { + // Unable to retrieve a valid UTF-32 character at the previous position. + return false; + } + utf16_index_ -= i18n_utils::GetUtf16Length(uchar32); + } + return true; + } + + // Advances current position to desired_utf16_index. + // REQUIRES: desired_utf16_index <= text_.utf16_length() + // desired_utf16_index is allowed to point one index past the end, but no + // further. 
+ bool AdvanceToUtf16(int desired_utf16_index) { + while (utf16_index_ < desired_utf16_index) { + UChar32 uchar32 = + i18n_utils::GetUChar32At(text_.data(), text_.length(), utf8_index_); + if (uchar32 == i18n_utils::kInvalidUChar32) { + // Unable to retrieve a valid UTF-32 character at the previous position. + return false; + } + int utf16_length = i18n_utils::GetUtf16Length(uchar32); + if (utf16_index_ + utf16_length > desired_utf16_index) { + // Ah! Don't go too far! + break; + } + int utf8_length = i18n_utils::GetUtf8Length(uchar32); + if (utf8_index_ + utf8_length > text_.length()) { + // Enforce the requirement. + return false; + } + utf8_index_ += utf8_length; + utf16_index_ += utf16_length; + } + return true; + } + + // Rewinds current position to desired_utf16_index. + // REQUIRES: 0 <= desired_utf16_index + bool RewindToUtf16(int desired_utf16_index) { + if (desired_utf16_index < 0) { + return false; + } + while (utf16_index_ > desired_utf16_index) { + --utf8_index_; + utf8_index_ = GetUTF8StartPosition(text_, utf8_index_); + // We've found the start of a unicode char! + UChar32 uchar32 = + i18n_utils::GetUChar32At(text_.data(), text_.length(), utf8_index_); + if (uchar32 == i18n_utils::kInvalidUChar32) { + // Unable to retrieve a valid UTF-32 character at the previous position. + return false; + } + utf16_index_ -= i18n_utils::GetUtf16Length(uchar32); + } + return true; + } + + bool IsValidCharacter() const { + // Rule 1: all ASCII terms will be returned. + // We know it's a ASCII term by checking the first char. + if (i18n_utils::IsAscii(text_[utf8_index_])) { + return true; + } + + // Rule 2: for non-ASCII terms, only the alphabetic terms are returned. + // We know it's an alphabetic term by checking the first unicode character. 
+ if (i18n_utils::IsAlphabeticAt(text_, utf8_index_)) { + return true; + } + + return false; + } + + int utf8_index() const { return utf8_index_; } + int utf16_index() const { return utf16_index_; } + + private: + std::string_view text_; + int utf8_index_; + int utf16_index_; +}; + +} // namespace + +class ReverseJniLanguageSegmenterIterator : public LanguageSegmenter::Iterator { + public: + explicit ReverseJniLanguageSegmenterIterator( + std::string_view text, + std::unique_ptr<ReverseJniBreakIterator> break_iterator) + : break_iterator_(std::move(break_iterator)), + text_(text), + term_start_(text), + term_end_exclusive_(text) {} + + // Advances to the next term. Returns false if it has reached the end. + bool Advance() override { + // Prerequisite check + if (term_end_exclusive_.utf16_index() == ReverseJniBreakIterator::kDone) { + return false; + } + + if (term_end_exclusive_.utf16_index() == 0) { + int first = break_iterator_->First(); + if (!term_start_.AdvanceToUtf16(first)) { + // First is guaranteed to succeed and return a position within bonds. So + // the only possible failure could be an invalid sequence. Mark as DONE + // and return. + MarkAsDone(); + return false; + } + } else { + term_start_ = term_end_exclusive_; + } + + int next_utf16_index_exclusive = break_iterator_->Next(); + // Reached the end + if (next_utf16_index_exclusive == ReverseJniBreakIterator::kDone) { + MarkAsDone(); + return false; + } + if (!term_end_exclusive_.AdvanceToUtf16(next_utf16_index_exclusive)) { + // next_utf16_index_exclusive is guaranteed to be within bonds thanks to + // the check for kDone above. So the only possible failure could be an + // invalid sequence. Mark as DONE and return. + MarkAsDone(); + return false; + } + + // Check if the current term is valid. We consider any term valid if its + // first character is valid. If it's not valid, then we need to advance to + // the next term. 
+ if (term_start_.IsValidCharacter()) { + return true; + } + return Advance(); + } + + // Returns the current term. It can be called only when Advance() returns + // true. + std::string_view GetTerm() const override { + int term_length = + term_end_exclusive_.utf8_index() - term_start_.utf8_index(); + if (term_length > 0 && std::isspace(text_[term_start_.utf8_index()])) { + // Rule 3: multiple continuous whitespaces are treated as one. + term_length = 1; + } + return text_.substr(term_start_.utf8_index(), term_length); + } + + // Resets the iterator to point to the first term that starts after offset. + // GetTerm will now return that term. + // + // Returns: + // On success, the starting position of the first term that starts after + // offset. + // NOT_FOUND if an error occurred or there are no terms that start after + // offset. + // INVALID_ARGUMENT if offset is out of bounds for the provided text. + // ABORTED if an invalid unicode character is encountered while + // traversing the text. + libtextclassifier3::StatusOr<int32_t> ResetToTermStartingAfter( + int32_t offset) override { + if (offset < 0 || offset >= text_.length()) { + return absl_ports::InvalidArgumentError(IcingStringUtil::StringPrintf( + "Illegal offset provided! Offset %d is not within bounds of string " + "of length %zu", + offset, text_.length())); + } + if (term_end_exclusive_.utf16_index() == ReverseJniBreakIterator::kDone) { + // We're done. Need to start from the beginning if we're going to reset + // properly. + term_start_ = CharacterIterator(text_); + term_end_exclusive_ = CharacterIterator(text_); + } + + // 1. Find the unicode character that contains the byte at offset. + CharacterIterator offset_iterator = term_end_exclusive_; + bool success = (offset > offset_iterator.utf8_index()) + ? offset_iterator.AdvanceToUtf8(offset) + : offset_iterator.RewindToUtf8(offset); + if (!success) { + // Offset is guaranteed to be within bounds thanks to the check above. 
So + // the only possible failure could be an invalid sequence. Mark as DONE + // and return. + MarkAsDone(); + return absl_ports::AbortedError("Encountered invalid UTF sequence!"); + } + + // 2. We've got the unicode character containing byte offset. Now, we need + // to point to the segment that starts after this character. + int following_utf16_index = + break_iterator_->Following(offset_iterator.utf16_index()); + if (following_utf16_index == ReverseJniBreakIterator::kDone) { + MarkAsDone(); + return absl_ports::NotFoundError(IcingStringUtil::StringPrintf( + "No segments begin after provided offset %d.", offset)); + } + if (!offset_iterator.AdvanceToUtf16(following_utf16_index)) { + // following_utf16_index is guaranteed to be within bonds thanks to the + // check for kDone above. So the only possible failure could be an invalid + // sequence. Mark as DONE and return. + MarkAsDone(); + return absl_ports::AbortedError("Encountered invalid UTF sequence!"); + } + term_end_exclusive_ = offset_iterator; + + // 3. The term_end_exclusive_ points to the term that we want to return. We + // need to Advance so that term_start_ will now point to this term. + if (!Advance()) { + return absl_ports::NotFoundError(IcingStringUtil::StringPrintf( + "No segments begin after provided offset %d.", offset)); + } + return term_start_.utf8_index(); + } + + // Resets the iterator to point to the first term that ends before offset. + // GetTerm will now return that term. + // + // Returns: + // On success, the starting position of the first term that ends before + // offset. + // NOT_FOUND if an error occurred or there are no terms that end before + // offset. + // INVALID_ARGUMENT if offset is out of bounds for the provided text. + // ABORTED if an invalid unicode character is encountered while + // traversing the text. 
+ libtextclassifier3::StatusOr<int32_t> ResetToTermEndingBefore( + int32_t offset) override { + if (offset < 0 || offset >= text_.length()) { + return absl_ports::InvalidArgumentError(IcingStringUtil::StringPrintf( + "Illegal offset provided! Offset %d is not within bounds of string " + "of length %zu", + offset, text_.length())); + } + if (term_end_exclusive_.utf16_index() == ReverseJniBreakIterator::kDone) { + // We're done. Need to start from the beginning if we're going to reset + // properly. + term_start_ = CharacterIterator(text_); + term_end_exclusive_ = CharacterIterator(text_); + } + + // 1. Find the unicode character that contains the byte at offset. + CharacterIterator offset_iterator = term_end_exclusive_; + bool success = (offset > offset_iterator.utf8_index()) + ? offset_iterator.AdvanceToUtf8(offset) + : offset_iterator.RewindToUtf8(offset); + if (!success) { + // Offset is guaranteed to be within bounds thanks to the check above. So + // the only possible failure could be an invalid sequence. Mark as DONE + // and return. + MarkAsDone(); + return absl_ports::AbortedError( + "Could not retrieve valid utf8 character!"); + } + + // 2. We've got the unicode character containing byte offset. Now, we need + // to point to the segment that starts before this character. + int starting_utf16_index = + break_iterator_->Preceding(offset_iterator.utf16_index()); + if (starting_utf16_index == ReverseJniBreakIterator::kDone) { + // Rewind the end indices. + MarkAsDone(); + return absl_ports::NotFoundError(IcingStringUtil::StringPrintf( + "No segments end before provided offset %d.", offset)); + } + if (!offset_iterator.RewindToUtf16(starting_utf16_index)) { + // starting_utf16_index is guaranteed to be within bonds thanks to the + // check for kDone above. So the only possible failure could be an invalid + // sequence. Mark as DONE and return. 
+ MarkAsDone(); + return absl_ports::AbortedError("Encountered invalid UTF sequence!"); + } + term_start_ = offset_iterator; + + // 3. We've correctly set the start index and the iterator currently points + // to that position. Now we need to find the correct end position and + // advance the iterator to that position. + int end_utf16_index = break_iterator_->Next(); + term_end_exclusive_ = term_start_; + term_end_exclusive_.AdvanceToUtf16(end_utf16_index); + + // 4. The start and end indices point to a segment, but we need to ensure + // that this segment is 1) valid and 2) ends before offset. Otherwise, we'll + // need a segment prior to this one. + if (term_end_exclusive_.utf8_index() > offset || + !term_start_.IsValidCharacter()) { + return ResetToTermEndingBefore(term_start_.utf8_index()); + } + return term_start_.utf8_index(); + } + + libtextclassifier3::StatusOr<int32_t> ResetToStart() override { + term_start_ = CharacterIterator(text_); + term_end_exclusive_ = CharacterIterator(text_); + if (!Advance()) { + return absl_ports::NotFoundError(""); + } + return term_start_.utf8_index(); + } + + private: + // Ensures that all members are consistent with the 'Done' state. + // In the 'Done' state, both term_start_.utf8_index() and + // term_end_exclusive_.utf8_index() will point to the same character, causing + // GetTerm() to return an empty string and term_start_.utf16_index() and + // term_end_exclusive_.utf16_index() will be marked with the kDone value. + // break_iterator_ may be in any state. + void MarkAsDone() { + term_start_ = + CharacterIterator(text_, /*utf8_index=*/0, + /*utf16_index=*/ReverseJniBreakIterator::kDone); + term_end_exclusive_ = + CharacterIterator(text_, /*utf8_index=*/0, + /*utf16_index=*/ReverseJniBreakIterator::kDone); + } + + // All of ReverseJniBreakIterator's functions return UTF-16 boundaries. So + // this class needs to maintain state to convert between UTF-16 and UTF-8. 
+ std::unique_ptr<ReverseJniBreakIterator> break_iterator_; + + // Text to be segmented + std::string_view text_; + + // Index used to track the start position of current term. + CharacterIterator term_start_; + + // Index used to track the end position of current term. + CharacterIterator term_end_exclusive_; +}; + +libtextclassifier3::StatusOr<std::unique_ptr<LanguageSegmenter::Iterator>> +ReverseJniLanguageSegmenter::Segment(const std::string_view text) const { + ICING_ASSIGN_OR_RETURN( + std::unique_ptr<ReverseJniBreakIterator> break_iterator, + ReverseJniBreakIterator::Create(jni_cache_, text, locale_)); + return std::make_unique<ReverseJniLanguageSegmenterIterator>( + text, std::move(break_iterator)); +} + +libtextclassifier3::StatusOr<std::vector<std::string_view>> +ReverseJniLanguageSegmenter::GetAllTerms(const std::string_view text) const { + ICING_ASSIGN_OR_RETURN(std::unique_ptr<LanguageSegmenter::Iterator> iterator, + Segment(text)); + std::vector<std::string_view> terms; + while (iterator->Advance()) { + terms.push_back(iterator->GetTerm()); + } + return terms; +} + +} // namespace lib +} // namespace icing diff --git a/icing/tokenization/reverse_jni/reverse-jni-language-segmenter.h b/icing/tokenization/reverse_jni/reverse-jni-language-segmenter.h new file mode 100644 index 0000000..f06dac9 --- /dev/null +++ b/icing/tokenization/reverse_jni/reverse-jni-language-segmenter.h @@ -0,0 +1,51 @@ +// Copyright (C) 2019 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef ICING_TOKENIZATION_REVERSE_JNI_REVERSE_JNI_LANGUAGE_SEGMENTER_H_ +#define ICING_TOKENIZATION_REVERSE_JNI_REVERSE_JNI_LANGUAGE_SEGMENTER_H_ + +#include <cstdint> +#include <memory> +#include <string> +#include <string_view> +#include <vector> + +#include "icing/jni/jni-cache.h" +#include "icing/text_classifier/lib3/utils/base/statusor.h" +#include "icing/tokenization/language-segmenter.h" + +namespace icing { +namespace lib { + +class ReverseJniLanguageSegmenter : public LanguageSegmenter { + public: + ReverseJniLanguageSegmenter(std::string locale, const JniCache* jni_cache) + : locale_(std::move(locale)), jni_cache_(jni_cache) {} + + libtextclassifier3::StatusOr<std::unique_ptr<LanguageSegmenter::Iterator>> + Segment(std::string_view text) const override; + + libtextclassifier3::StatusOr<std::vector<std::string_view>> GetAllTerms( + std::string_view text) const override; + + private: + std::string locale_; + + const JniCache* jni_cache_; // does not own! +}; + +} // namespace lib +} // namespace icing + +#endif // ICING_TOKENIZATION_REVERSE_JNI_REVERSE_JNI_LANGUAGE_SEGMENTER_H_ diff --git a/icing/tokenization/simple/space-language-segmenter-factory.cc b/icing/tokenization/simple/space-language-segmenter-factory.cc new file mode 100644 index 0000000..1cca603 --- /dev/null +++ b/icing/tokenization/simple/space-language-segmenter-factory.cc @@ -0,0 +1,41 @@ +// Copyright (C) 2019 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +#include "icing/tokenization/language-segmenter-factory.h" +#include "icing/tokenization/simple/space-language-segmenter.h" +#include "icing/util/logging.h" + +namespace icing { +namespace lib { + +namespace language_segmenter_factory { + +// Creates a language segmenter with the given locale. +// +// Returns: +// A LanguageSegmenter on success +// INVALID_ARGUMENT if locale string is invalid +// +// TODO(samzheng): Figure out if we want to verify locale strings and notify +// users. Right now illegal locale strings will be ignored by ICU. ICU +// components will be created with its default locale. +libtextclassifier3::StatusOr<std::unique_ptr<LanguageSegmenter>> Create( + SegmenterOptions) { + return std::make_unique<SpaceLanguageSegmenter>(); +} + +} // namespace language_segmenter_factory + +} // namespace lib +} // namespace icing diff --git a/icing/tokenization/space-language-segmenter.cc b/icing/tokenization/simple/space-language-segmenter.cc index 3d5c7cf..7e301ec 100644 --- a/icing/tokenization/space-language-segmenter.cc +++ b/icing/tokenization/simple/space-language-segmenter.cc @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "icing/tokenization/space-language-segmenter.h" +#include "icing/tokenization/simple/space-language-segmenter.h" #include <cstdint> #include <memory> @@ -40,7 +40,7 @@ class SpaceLanguageSegmenterIterator : public LanguageSegmenter::Iterator { : text_(text), term_start_index_(0), term_end_index_exclusive_(0) {} // Advances to the next term. Returns false if it has reached the end. 
- bool Advance() { + bool Advance() override { if (term_end_index_exclusive_ >= text_.size() || term_start_index_ >= text_.size()) { // Reached the end @@ -74,7 +74,7 @@ class SpaceLanguageSegmenterIterator : public LanguageSegmenter::Iterator { // Returns the current term. It can be called only when Advance() returns // true. - std::string_view GetTerm() const { + std::string_view GetTerm() const override { if (text_[term_start_index_] == kASCIISpace) { // Rule: multiple continuous whitespaces are treated as one. return std::string_view(&text_[term_start_index_], 1); @@ -84,7 +84,7 @@ class SpaceLanguageSegmenterIterator : public LanguageSegmenter::Iterator { } libtextclassifier3::StatusOr<int32_t> ResetToTermStartingAfter( - int32_t offset) { + int32_t offset) override { if (offset < 0) { // Start over from the beginning to find the first term. term_start_index_ = 0; @@ -111,7 +111,7 @@ class SpaceLanguageSegmenterIterator : public LanguageSegmenter::Iterator { } libtextclassifier3::StatusOr<int32_t> ResetToTermEndingBefore( - int32_t offset) { + int32_t offset) override { if (offset <= 0 || offset > text_.size()) { return absl_ports::NotFoundError(IcingStringUtil::StringPrintf( "No term found in '%s' that ends before offset %d", @@ -146,6 +146,15 @@ class SpaceLanguageSegmenterIterator : public LanguageSegmenter::Iterator { return term_start_index_; } + libtextclassifier3::StatusOr<int32_t> ResetToStart() override { + term_start_index_ = 0; + term_end_index_exclusive_ = 0; + if (!Advance()) { + return absl_ports::NotFoundError(""); + } + return term_start_index_; + } + private: // Return the start offset of the term starting right before the given offset. 
libtextclassifier3::StatusOr<int32_t> GetTermStartingBefore(int32_t offset) { diff --git a/icing/tokenization/space-language-segmenter.h b/icing/tokenization/simple/space-language-segmenter.h index 73f8f30..de0a6d3 100644 --- a/icing/tokenization/space-language-segmenter.h +++ b/icing/tokenization/simple/space-language-segmenter.h @@ -12,8 +12,8 @@ // See the License for the specific language governing permissions and // limitations under the License. -#ifndef ICING_TOKENIZATION_SPACE_LANGUAGE_SEGMENTER_H_ -#define ICING_TOKENIZATION_SPACE_LANGUAGE_SEGMENTER_H_ +#ifndef ICING_TOKENIZATION_SIMPLE_SPACE_LANGUAGE_SEGMENTER_H_ +#define ICING_TOKENIZATION_SIMPLE_SPACE_LANGUAGE_SEGMENTER_H_ #include <cstdint> #include <memory> @@ -55,4 +55,4 @@ class SpaceLanguageSegmenter : public LanguageSegmenter { } // namespace lib } // namespace icing -#endif // ICING_TOKENIZATION_SPACE_LANGUAGE_SEGMENTER_H_ +#endif // ICING_TOKENIZATION_SIMPLE_SPACE_LANGUAGE_SEGMENTER_H_ diff --git a/icing/tokenization/space-language-segmenter_test.cc b/icing/tokenization/simple/space-language-segmenter_test.cc index ef6f54f..8ed38b2 100644 --- a/icing/tokenization/space-language-segmenter_test.cc +++ b/icing/tokenization/simple/space-language-segmenter_test.cc @@ -28,24 +28,21 @@ using ::testing::Eq; using ::testing::IsEmpty; TEST(SpaceLanguageSegmenterTest, EmptyText) { - ICING_ASSERT_OK_AND_ASSIGN( - auto language_segmenter, - language_segmenter_factory::Create(language_segmenter_factory::SPACE)); + ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter, + language_segmenter_factory::Create()); EXPECT_THAT(language_segmenter->GetAllTerms(""), IsOkAndHolds(IsEmpty())); } TEST(SpaceLanguageSegmenterTest, SimpleText) { - ICING_ASSERT_OK_AND_ASSIGN( - auto language_segmenter, - language_segmenter_factory::Create(language_segmenter_factory::SPACE)); + ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter, + language_segmenter_factory::Create()); EXPECT_THAT(language_segmenter->GetAllTerms("Hello World"), 
IsOkAndHolds(ElementsAre("Hello", " ", "World"))); } TEST(SpaceLanguageSegmenterTest, Punctuation) { - ICING_ASSERT_OK_AND_ASSIGN( - auto language_segmenter, - language_segmenter_factory::Create(language_segmenter_factory::SPACE)); + ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter, + language_segmenter_factory::Create()); EXPECT_THAT(language_segmenter->GetAllTerms("Hello, World!!!"), IsOkAndHolds(ElementsAre("Hello,", " ", "World!!!"))); @@ -58,9 +55,8 @@ TEST(SpaceLanguageSegmenterTest, Punctuation) { } TEST(SpaceLanguageSegmenterTest, Alphanumeric) { - ICING_ASSERT_OK_AND_ASSIGN( - auto language_segmenter, - language_segmenter_factory::Create(language_segmenter_factory::SPACE)); + ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter, + language_segmenter_factory::Create()); // Alphanumeric terms are allowed EXPECT_THAT(language_segmenter->GetAllTerms("Se7en A4 3a"), @@ -68,9 +64,8 @@ TEST(SpaceLanguageSegmenterTest, Alphanumeric) { } TEST(SpaceLanguageSegmenterTest, Number) { - ICING_ASSERT_OK_AND_ASSIGN( - auto language_segmenter, - language_segmenter_factory::Create(language_segmenter_factory::SPACE)); + ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter, + language_segmenter_factory::Create()); // Alphanumeric terms are allowed EXPECT_THAT( @@ -85,9 +80,8 @@ TEST(SpaceLanguageSegmenterTest, Number) { } TEST(SpaceLanguageSegmenterTest, ContinuousWhitespaces) { - ICING_ASSERT_OK_AND_ASSIGN( - auto language_segmenter, - language_segmenter_factory::Create(language_segmenter_factory::SPACE)); + ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter, + language_segmenter_factory::Create()); // Multiple continuous whitespaces are treated as one. 
const int kNumSeparators = 256; @@ -98,9 +92,8 @@ TEST(SpaceLanguageSegmenterTest, ContinuousWhitespaces) { } TEST(SpaceLanguageSegmenterTest, NotCopyStrings) { - ICING_ASSERT_OK_AND_ASSIGN( - auto language_segmenter, - language_segmenter_factory::Create(language_segmenter_factory::SPACE)); + ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter, + language_segmenter_factory::Create()); // Validates that the input strings are not copied const std::string text = "Hello World"; const char* word1_address = text.c_str(); diff --git a/icing/tokenization/tokenizer.h b/icing/tokenization/tokenizer.h index 3ad61fb..38c4745 100644 --- a/icing/tokenization/tokenizer.h +++ b/icing/tokenization/tokenizer.h @@ -85,6 +85,8 @@ class Tokenizer { // // "foo". // PrintToken(iterator.GetToken()); // prints "foo" virtual bool ResetToTokenBefore(int32_t offset) { return false; } + + virtual bool ResetToStart() { return false; } }; // Tokenizes the input text. The input text should outlive the returned diff --git a/icing/transform/icu-normalizer_test.cc b/icing/transform/icu-normalizer_test.cc deleted file mode 100644 index 5e822d2..0000000 --- a/icing/transform/icu-normalizer_test.cc +++ /dev/null @@ -1,179 +0,0 @@ -// Copyright (C) 2019 Google LLC -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#include <memory> - -#include "gmock/gmock.h" -#include "gtest/gtest.h" -#include "icing/icu-data-file-helper.h" -#include "icing/testing/common-matchers.h" -#include "icing/testing/icu-i18n-test-utils.h" -#include "icing/testing/test-data.h" -#include "icing/transform/normalizer-factory.h" -#include "icing/transform/normalizer.h" - -namespace icing { -namespace lib { -namespace { -using ::testing::Eq; - -class IcuNormalizerTest : public testing::Test { - protected: - void SetUp() override { - ICING_ASSERT_OK( - // File generated via icu_data_file rule in //icing/BUILD. - icu_data_file_helper::SetUpICUDataFile( - GetTestFilePath("icing/icu.dat"))); - - ICING_ASSERT_OK_AND_ASSIGN( - normalizer_, - normalizer_factory::Create(normalizer_factory::NormalizerType::ICU4C, - /*max_term_byte_size=*/1024)); - } - - std::unique_ptr<Normalizer> normalizer_; -}; - -TEST_F(IcuNormalizerTest, Creation) { - EXPECT_THAT( - normalizer_factory::Create(normalizer_factory::NormalizerType::ICU4C, - /*max_term_byte_size=*/5), - IsOk()); - EXPECT_THAT( - normalizer_factory::Create(normalizer_factory::NormalizerType::ICU4C, - /*max_term_byte_size=*/0), - StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT)); - EXPECT_THAT( - normalizer_factory::Create(normalizer_factory::NormalizerType::ICU4C, - /*max_term_byte_size=*/-1), - StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT)); -} - -// Strings that are already normalized won't change if normalized again. 
-TEST_F(IcuNormalizerTest, AlreadyNormalized) { - EXPECT_THAT(normalizer_->NormalizeTerm(""), Eq("")); - EXPECT_THAT(normalizer_->NormalizeTerm("hello world"), Eq("hello world")); - EXPECT_THAT(normalizer_->NormalizeTerm("你好"), Eq("你好")); - EXPECT_THAT(normalizer_->NormalizeTerm("キャンパス"), Eq("キャンパス")); - EXPECT_THAT(normalizer_->NormalizeTerm("안녕하세요"), Eq("안녕하세요")); -} - -TEST_F(IcuNormalizerTest, UppercaseToLowercase) { - EXPECT_THAT(normalizer_->NormalizeTerm("MDI"), Eq("mdi")); - EXPECT_THAT(normalizer_->NormalizeTerm("Icing"), Eq("icing")); -} - -TEST_F(IcuNormalizerTest, LatinLetterRemoveAccent) { - EXPECT_THAT(normalizer_->NormalizeTerm("Zürich"), Eq("zurich")); - EXPECT_THAT(normalizer_->NormalizeTerm("après-midi"), Eq("apres-midi")); - EXPECT_THAT(normalizer_->NormalizeTerm("Buenos días"), Eq("buenos dias")); - EXPECT_THAT(normalizer_->NormalizeTerm("āăąḃḅḇčćç"), Eq("aaabbbccc")); - EXPECT_THAT(normalizer_->NormalizeTerm("ÁȦÄḂḄḆĆČḈ"), Eq("aaabbbccc")); -} - -// Accent / diacritic marks won't be removed in non-latin chars, e.g. in -// Japanese and Greek -TEST_F(IcuNormalizerTest, NonLatinLetterNotRemoveAccent) { - EXPECT_THAT(normalizer_->NormalizeTerm("ダヂヅデド"), Eq("ダヂヅデド")); - EXPECT_THAT(normalizer_->NormalizeTerm("kαλημέρα"), Eq("kαλημέρα")); -} - -TEST_F(IcuNormalizerTest, FullWidthCharsToASCII) { - // Full-width punctuation to ASCII punctuation - EXPECT_THAT(normalizer_->NormalizeTerm("。,!?:”"), Eq(".,!?:\"")); - // 0xff10 is the full-width number 0 - EXPECT_THAT(normalizer_->NormalizeTerm(UCharToString(0xff10)), Eq("0")); - // 0xff21 is the full-width letter A - EXPECT_THAT(normalizer_->NormalizeTerm(UCharToString(0xff21)), Eq("a")); - // 0xff41 is the full-width letter a - EXPECT_THAT(normalizer_->NormalizeTerm(UCharToString(0xff41)), Eq("a")); -} - -// For Katakana, each character is normalized to its full-width version. 
-TEST_F(IcuNormalizerTest, KatakanaHalfWidthToFullWidth) { - EXPECT_THAT(normalizer_->NormalizeTerm("カ"), Eq("カ")); - EXPECT_THAT(normalizer_->NormalizeTerm("ォ"), Eq("ォ")); - EXPECT_THAT(normalizer_->NormalizeTerm("サ"), Eq("サ")); - EXPECT_THAT(normalizer_->NormalizeTerm("ホ"), Eq("ホ")); -} - -TEST_F(IcuNormalizerTest, HiraganaToKatakana) { - EXPECT_THAT(normalizer_->NormalizeTerm("あいうえお"), Eq("アイウエオ")); - EXPECT_THAT(normalizer_->NormalizeTerm("かきくけこ"), Eq("カキクケコ")); - EXPECT_THAT(normalizer_->NormalizeTerm("ばびぶべぼ"), Eq("バビブベボ")); - EXPECT_THAT(normalizer_->NormalizeTerm("がぎぐげご"), Eq("ガギグゲゴ")); - EXPECT_THAT(normalizer_->NormalizeTerm("ぎゃぎゅぎょ"), Eq("ギャギュギョ")); -} - -TEST_F(IcuNormalizerTest, SuperscriptAndSubscriptToASCII) { - EXPECT_THAT(normalizer_->NormalizeTerm("⁹"), Eq("9")); - EXPECT_THAT(normalizer_->NormalizeTerm("₉"), Eq("9")); -} - -TEST_F(IcuNormalizerTest, CircledCharsToASCII) { - EXPECT_THAT(normalizer_->NormalizeTerm("①"), Eq("1")); - EXPECT_THAT(normalizer_->NormalizeTerm("Ⓐ"), Eq("a")); -} - -TEST_F(IcuNormalizerTest, RotatedCharsToASCII) { - EXPECT_THAT(normalizer_->NormalizeTerm("︷"), Eq("{")); - EXPECT_THAT(normalizer_->NormalizeTerm("︸"), Eq("}")); -} - -TEST_F(IcuNormalizerTest, SquaredCharsToASCII) { - EXPECT_THAT(normalizer_->NormalizeTerm("㌀"), Eq("アパート")); -} - -TEST_F(IcuNormalizerTest, FractionsToASCII) { - EXPECT_THAT(normalizer_->NormalizeTerm("¼"), Eq(" 1/4")); - EXPECT_THAT(normalizer_->NormalizeTerm("⅚"), Eq(" 5/6")); -} - -TEST_F(IcuNormalizerTest, Truncate) { - { - ICING_ASSERT_OK_AND_ASSIGN( - auto normalizer, - normalizer_factory::Create(normalizer_factory::NormalizerType::ICU4C, - /*max_term_byte_size=*/5)); - - // Won't be truncated - EXPECT_THAT(normalizer->NormalizeTerm("hi"), Eq("hi")); - EXPECT_THAT(normalizer->NormalizeTerm("hello"), Eq("hello")); - - // Truncated to length 5. 
- EXPECT_THAT(normalizer->NormalizeTerm("hello!"), Eq("hello")); - - // Each Japanese character has 3 bytes, so truncating to length 5 results in - // only 1 character. - EXPECT_THAT(normalizer->NormalizeTerm("キャンパス"), Eq("キ")); - - // Each Greek character has 2 bytes, so truncating to length 5 results in 2 - // character. - EXPECT_THAT(normalizer->NormalizeTerm("αβγδε"), Eq("αβ")); - } - - { - ICING_ASSERT_OK_AND_ASSIGN( - auto normalizer, - normalizer_factory::Create(normalizer_factory::NormalizerType::ICU4C, - /*max_term_byte_size=*/2)); - // The Japanese character has 3 bytes, truncating it results in an empty - // string. - EXPECT_THAT(normalizer->NormalizeTerm("キ"), Eq("")); - } -} - -} // namespace -} // namespace lib -} // namespace icing diff --git a/icing/transform/icu/icu-normalizer-factory.cc b/icing/transform/icu/icu-normalizer-factory.cc new file mode 100644 index 0000000..493aeb5 --- /dev/null +++ b/icing/transform/icu/icu-normalizer-factory.cc @@ -0,0 +1,52 @@ +// Copyright (C) 2019 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#ifndef ICING_TRANSFORM_ICU_ICU_NORMALIZER_FACTORY_H_ +#define ICING_TRANSFORM_ICU_ICU_NORMALIZER_FACTORY_H_ + +#include <memory> +#include <string_view> + +#include "icing/text_classifier/lib3/utils/base/statusor.h" +#include "icing/absl_ports/canonical_errors.h" +#include "icing/transform/icu/icu-normalizer.h" +#include "icing/transform/normalizer.h" + +namespace icing { +namespace lib { + +namespace normalizer_factory { + +// Creates an ICU-based normalizer. max_term_byte_size enforces the max size of +// text after normalization, text will be truncated if exceeds the max size. +// +// Returns: +// A normalizer on success +// INVALID_ARGUMENT if max_term_byte_size <= 0 +// INTERNAL_ERROR on errors +libtextclassifier3::StatusOr<std::unique_ptr<Normalizer>> Create( + int max_term_byte_size) { + if (max_term_byte_size <= 0) { + return absl_ports::InvalidArgumentError( + "max_term_byte_size must be greater than zero."); + } + return IcuNormalizer::Create(max_term_byte_size); +} + +} // namespace normalizer_factory + +} // namespace lib +} // namespace icing + +#endif // ICING_TRANSFORM_ICU_ICU_NORMALIZER_FACTORY_H_ diff --git a/icing/transform/icu-normalizer.cc b/icing/transform/icu/icu-normalizer.cc index c7cfd99..0bb8326 100644 --- a/icing/transform/icu-normalizer.cc +++ b/icing/transform/icu/icu-normalizer.cc @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#include "icing/transform/icu-normalizer.h"
+#include "icing/transform/icu/icu-normalizer.h"
 
 #include <cctype>
 #include <memory>
@@ -24,7 +24,7 @@
 #include "icing/absl_ports/canonical_errors.h"
 #include "icing/absl_ports/str_cat.h"
 #include "icing/transform/normalizer.h"
-#include "icing/util/icu-i18n-utils.h"
+#include "icing/util/i18n-utils.h"
 #include "icing/util/logging.h"
 #include "icing/util/status-macros.h"
 #include "unicode/umachine.h"
@@ -55,6 +55,46 @@ constexpr UChar kTransformRulesUtf16[] =
 constexpr int kTransformRulesLength =
     sizeof(kTransformRulesUtf16) / sizeof(kTransformRulesUtf16[0]) - 1;
 
+// Transforms a Unicode character with diacritics to its counterpart in ASCII
+// range. E.g. "ü" -> "u". Result will be set to char_out. Returns true if
+// the transformation is successful.
+//
+// NOTE: According to our convention this function should have returned
+// StatusOr<char>. However, this function is performance-sensitive because it
+// could be called on every Latin character in normalization, so we make it
+// return a bool here to save a bit more time and memory.
+bool DiacriticCharToAscii(const UNormalizer2* normalizer2, UChar32 uchar32_in,
+                          char* char_out) {
+  if (i18n_utils::IsAscii(uchar32_in)) {
+    // The Unicode character is within ASCII range
+    if (char_out != nullptr) {
+      *char_out = uchar32_in;
+    }
+    return true;
+  }
+
+  // Maximum number of pieces a Unicode character can be decomposed into.
+  // TODO(samzheng) figure out if this number is proper.
+  constexpr int kDecompositionBufferCapacity = 5;
+
+  // A buffer used to store Unicode decomposition mappings of only one
+  // character.
+  UChar decomposition_buffer[kDecompositionBufferCapacity];
+
+  // Decomposes the Unicode character, trying to get an ASCII char and some
+  // diacritic chars.
+ UErrorCode status = U_ZERO_ERROR; + if (unorm2_getDecomposition(normalizer2, uchar32_in, &decomposition_buffer[0], + kDecompositionBufferCapacity, &status) > 0 && + !U_FAILURE(status) && i18n_utils::IsAscii(decomposition_buffer[0])) { + if (char_out != nullptr) { + *char_out = decomposition_buffer[0]; + } + return true; + } + return false; +} + } // namespace // Creates a IcuNormalizer with a valid TermTransformer instance. @@ -96,11 +136,9 @@ std::string IcuNormalizer::NormalizeTerm(const std::string_view term) const { // into an ASCII char. Since the term is tokenized, we know that the whole // term can be transformed into ASCII if the first character can. UChar32 first_uchar32 = - icu_i18n_utils::GetUChar32At(term.data(), term.length(), 0); - if (normalizer2 != nullptr && - first_uchar32 != icu_i18n_utils::kInvalidUChar32 && - icu_i18n_utils::DiacriticCharToAscii(normalizer2, first_uchar32, - nullptr)) { + i18n_utils::GetUChar32At(term.data(), term.length(), 0); + if (normalizer2 != nullptr && first_uchar32 != i18n_utils::kInvalidUChar32 && + DiacriticCharToAscii(normalizer2, first_uchar32, nullptr)) { // This is a faster method to normalize Latin terms. 
normalized_text = NormalizeLatin(normalizer2, term); } else { @@ -108,7 +146,7 @@ std::string IcuNormalizer::NormalizeTerm(const std::string_view term) const { } if (normalized_text.length() > max_term_byte_size_) { - icu_i18n_utils::SafeTruncateUtf8(&normalized_text, max_term_byte_size_); + i18n_utils::SafeTruncateUtf8(&normalized_text, max_term_byte_size_); } return normalized_text; @@ -119,19 +157,17 @@ std::string IcuNormalizer::NormalizeLatin(const UNormalizer2* normalizer2, std::string result; result.reserve(term.length()); for (int i = 0; i < term.length(); i++) { - if (icu_i18n_utils::IsAscii(term[i])) { + if (i18n_utils::IsAscii(term[i])) { result.push_back(std::tolower(term[i])); - } else if (icu_i18n_utils::IsLeadUtf8Byte(term[i])) { - UChar32 uchar32 = - icu_i18n_utils::GetUChar32At(term.data(), term.length(), i); - if (uchar32 == icu_i18n_utils::kInvalidUChar32) { + } else if (i18n_utils::IsLeadUtf8Byte(term[i])) { + UChar32 uchar32 = i18n_utils::GetUChar32At(term.data(), term.length(), i); + if (uchar32 == i18n_utils::kInvalidUChar32) { ICING_LOG(WARNING) << "Unable to get uchar32 from " << term << " at position" << i; continue; } char ascii_char; - if (icu_i18n_utils::DiacriticCharToAscii(normalizer2, uchar32, - &ascii_char)) { + if (DiacriticCharToAscii(normalizer2, uchar32, &ascii_char)) { result.push_back(std::tolower(ascii_char)); } else { // We don't know how to transform / decompose this Unicode character, it @@ -139,7 +175,7 @@ std::string IcuNormalizer::NormalizeLatin(const UNormalizer2* normalizer2, // Latin characters. This shouldn't happen if input term is properly // tokenized. We handle it here in case there're something wrong with // the tokenizers. 
- int utf8_length = icu_i18n_utils::GetUtf8Length(uchar32); + int utf8_length = i18n_utils::GetUtf8Length(uchar32); absl_ports::StrAppend(&result, term.substr(i, utf8_length)); } } @@ -175,7 +211,7 @@ IcuNormalizer::TermTransformer::~TermTransformer() { std::string IcuNormalizer::TermTransformer::Transform( const std::string_view term) const { - auto utf16_term_or = icu_i18n_utils::Utf8ToUtf16(term); + auto utf16_term_or = i18n_utils::Utf8ToUtf16(term); if (!utf16_term_or.ok()) { ICING_VLOG(0) << "Failed to convert UTF8 term '" << term << "' to UTF16"; return std::string(term); @@ -216,7 +252,7 @@ std::string IcuNormalizer::TermTransformer::Transform( return std::string(term); } - auto utf8_term_or = icu_i18n_utils::Utf16ToUtf8(utf16_term); + auto utf8_term_or = i18n_utils::Utf16ToUtf8(utf16_term); if (!utf8_term_or.ok()) { ICING_VLOG(0) << "Failed to convert UTF16 term '" << term << "' to UTF8"; return std::string(term); diff --git a/icing/transform/icu-normalizer.h b/icing/transform/icu/icu-normalizer.h index 86d4a64..f20a9fb 100644 --- a/icing/transform/icu-normalizer.h +++ b/icing/transform/icu/icu-normalizer.h @@ -12,8 +12,8 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#ifndef ICING_TRANSFORM_ICU_NORMALIZER_H_ -#define ICING_TRANSFORM_ICU_NORMALIZER_H_ +#ifndef ICING_TRANSFORM_ICU_ICU_NORMALIZER_H_ +#define ICING_TRANSFORM_ICU_ICU_NORMALIZER_H_ #include <memory> #include <string> @@ -102,4 +102,4 @@ class IcuNormalizer : public Normalizer { } // namespace lib } // namespace icing -#endif // ICING_TRANSFORM_ICU_NORMALIZER_H_ +#endif // ICING_TRANSFORM_ICU_ICU_NORMALIZER_H_ diff --git a/icing/transform/icu-normalizer_benchmark.cc b/icing/transform/icu/icu-normalizer_benchmark.cc index 2fce32b..b037538 100644 --- a/icing/transform/icu-normalizer_benchmark.cc +++ b/icing/transform/icu/icu-normalizer_benchmark.cc @@ -14,7 +14,7 @@ #include "testing/base/public/benchmark.h" #include "gmock/gmock.h" -#include "icing/icu-data-file-helper.h" +#include "icing/helpers/icu/icu-data-file-helper.h" #include "icing/testing/common-matchers.h" #include "icing/testing/test-data.h" #include "icing/transform/normalizer-factory.h" @@ -22,9 +22,9 @@ // Run on a Linux workstation: // $ blaze build -c opt --dynamic_mode=off --copt=-gmlt -// //icing/transform:icu-normalizer_benchmark +// //icing/transform/icu:icu-normalizer_benchmark // -// $ blaze-bin/icing/transform/icu-normalizer_benchmark +// $ blaze-bin/icing/transform/icu/icu-normalizer_benchmark // --benchmarks=all // // Run on an Android device: @@ -33,9 +33,10 @@ // // $ blaze build --copt="-DGOOGLE_COMMANDLINEFLAGS_FULL_API=1" // --config=android_arm64 -c opt --dynamic_mode=off --copt=-gmlt -// //icing/transform:icu-normalizer_benchmark +// //icing/transform/icu:icu-normalizer_benchmark // -// $ adb push blaze-bin/icing/transform/icu-normalizer_benchmark +// $ adb push +// blaze-bin/icing/transform/icu/icu-normalizer_benchmark // /data/local/tmp/ // // $ adb shell /data/local/tmp/icu-normalizer_benchmark --benchmarks=all @@ -60,7 +61,7 @@ void BM_NormalizeUppercase(benchmark::State& state) { ICING_ASSERT_OK_AND_ASSIGN( std::unique_ptr<Normalizer> normalizer, normalizer_factory::Create( - 
normalizer_factory::NormalizerType::ICU4C, + /*max_term_byte_size=*/std::numeric_limits<int>::max())); std::string input_string(state.range(0), 'A'); @@ -94,7 +95,7 @@ void BM_NormalizeAccent(benchmark::State& state) { ICING_ASSERT_OK_AND_ASSIGN( std::unique_ptr<Normalizer> normalizer, normalizer_factory::Create( - normalizer_factory::NormalizerType::ICU4C, + /*max_term_byte_size=*/std::numeric_limits<int>::max())); std::string input_string; @@ -132,7 +133,7 @@ void BM_NormalizeHiragana(benchmark::State& state) { ICING_ASSERT_OK_AND_ASSIGN( std::unique_ptr<Normalizer> normalizer, normalizer_factory::Create( - normalizer_factory::NormalizerType::ICU4C, + /*max_term_byte_size=*/std::numeric_limits<int>::max())); std::string input_string; diff --git a/icing/transform/icu/icu-normalizer_test.cc b/icing/transform/icu/icu-normalizer_test.cc new file mode 100644 index 0000000..83fa972 --- /dev/null +++ b/icing/transform/icu/icu-normalizer_test.cc @@ -0,0 +1,237 @@ +// Copyright (C) 2019 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include <memory> + +#include "gmock/gmock.h" +#include "gtest/gtest.h" +#include "icing/helpers/icu/icu-data-file-helper.h" +#include "icing/testing/common-matchers.h" +#include "icing/testing/icu-i18n-test-utils.h" +#include "icing/testing/test-data.h" +#include "icing/transform/normalizer-factory.h" +#include "icing/transform/normalizer.h" + +namespace icing { +namespace lib { +namespace { +using ::testing::Eq; + +class IcuNormalizerTest : public testing::Test { + protected: + void SetUp() override { + ICING_ASSERT_OK( + // File generated via icu_data_file rule in //icing/BUILD. + icu_data_file_helper::SetUpICUDataFile( + GetTestFilePath("icing/icu.dat"))); + + ICING_ASSERT_OK_AND_ASSIGN(normalizer_, normalizer_factory::Create( + /*max_term_byte_size=*/1024)); + } + + std::unique_ptr<Normalizer> normalizer_; +}; + +TEST_F(IcuNormalizerTest, Creation) { + EXPECT_THAT(normalizer_factory::Create( + /*max_term_byte_size=*/5), + IsOk()); + EXPECT_THAT(normalizer_factory::Create( + /*max_term_byte_size=*/0), + StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT)); + EXPECT_THAT(normalizer_factory::Create( + /*max_term_byte_size=*/-1), + StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT)); +} + +// Strings that are already normalized won't change if normalized again. 
+TEST_F(IcuNormalizerTest, AlreadyNormalized) { + EXPECT_THAT(normalizer_->NormalizeTerm(""), Eq("")); + EXPECT_THAT(normalizer_->NormalizeTerm("hello world"), Eq("hello world")); + EXPECT_THAT(normalizer_->NormalizeTerm("你好"), Eq("你好")); + EXPECT_THAT(normalizer_->NormalizeTerm("キャンパス"), Eq("キャンパス")); + EXPECT_THAT(normalizer_->NormalizeTerm("안녕하세요"), Eq("안녕하세요")); +} + +TEST_F(IcuNormalizerTest, UppercaseToLowercase) { + EXPECT_THAT(normalizer_->NormalizeTerm("MDI"), Eq("mdi")); + EXPECT_THAT(normalizer_->NormalizeTerm("Icing"), Eq("icing")); +} + +TEST_F(IcuNormalizerTest, LatinLetterRemoveAccent) { + EXPECT_THAT(normalizer_->NormalizeTerm("Zürich"), Eq("zurich")); + EXPECT_THAT(normalizer_->NormalizeTerm("après-midi"), Eq("apres-midi")); + EXPECT_THAT(normalizer_->NormalizeTerm("Buenos días"), Eq("buenos dias")); + EXPECT_THAT(normalizer_->NormalizeTerm("ÀÁÂÃÄÅĀĂĄḀḁàáâãäåāăą"), + Eq("aaaaaaaaaaaaaaaaaaaa")); + EXPECT_THAT(normalizer_->NormalizeTerm("ḂḄḆḃḅḇ"), Eq("bbbbbb")); + EXPECT_THAT(normalizer_->NormalizeTerm("ÇĆĈĊČḈḉćĉċčç"), Eq("cccccccccccc")); + EXPECT_THAT(normalizer_->NormalizeTerm("ÐĎĐḊḌḎḐḒḋḍḏḑḓďđ"), + Eq("ddddddddddddddd")); + EXPECT_THAT(normalizer_->NormalizeTerm("ÈÉÊËĒĔĖĘḔḖḘḚḜḕḗḙḛḝèéêëēĕėęě"), + Eq("eeeeeeeeeeeeeeeeeeeeeeeeeee")); + EXPECT_THAT(normalizer_->NormalizeTerm("Ḟḟ"), Eq("ff")); + EXPECT_THAT(normalizer_->NormalizeTerm("ĜĞĠĢḠḡĝğġģ"), Eq("gggggggggg")); + EXPECT_THAT(normalizer_->NormalizeTerm("ĤḢḤḦḨḪḣḥḧḩḫĥẖ"), + Eq("hhhhhhhhhhhhh")); + EXPECT_THAT(normalizer_->NormalizeTerm("ÌÍÎÏĨĪĬḬḭḯìíîïĩīĭ"), + Eq("iiiiiiiiiiiiiiiii")); + EXPECT_THAT(normalizer_->NormalizeTerm("Ĵĵ"), Eq("jj")); + EXPECT_THAT(normalizer_->NormalizeTerm("ĶḰḲḴḵḱḳķ"), Eq("kkkkkkkk")); + EXPECT_THAT(normalizer_->NormalizeTerm("ĹĻĽḶḸḼḷḹḻḽĺļľ"), + Eq("lllllllllllll")); + EXPECT_THAT(normalizer_->NormalizeTerm("ḾṀṂḿṁṃ"), Eq("mmmmmm")); + EXPECT_THAT(normalizer_->NormalizeTerm("ÑŃŅŇṄṆṈṊṅṇṉṋñńņň"), + Eq("nnnnnnnnnnnnnnnn")); + 
EXPECT_THAT(normalizer_->NormalizeTerm("ŌŎŐÒÓÔÕÖṌṎṐṒṍṏṑṓòóôõöōŏő"), + Eq("oooooooooooooooooooooooo")); + EXPECT_THAT(normalizer_->NormalizeTerm("ṔṖṕṗ"), Eq("pppp")); + EXPECT_THAT(normalizer_->NormalizeTerm("ŔŖŘṘṚṜṞṙṛṝṟŕŗř"), + Eq("rrrrrrrrrrrrrr")); + EXPECT_THAT(normalizer_->NormalizeTerm("ŚŜŞŠȘṠṢṤṦṨṡṣṥṧṩșśŝşš"), + Eq("ssssssssssssssssssss")); + EXPECT_THAT(normalizer_->NormalizeTerm("ŢŤȚṪṬṮṰṫṭṯṱțţť"), + Eq("tttttttttttttt")); + EXPECT_THAT(normalizer_->NormalizeTerm("ŨŪŬÙÚÛÜṲṴṶṸṺṳṵṷṹṻùúûüũūŭ"), + Eq("uuuuuuuuuuuuuuuuuuuuuuuu")); + EXPECT_THAT(normalizer_->NormalizeTerm("ṼṾṽṿ"), Eq("vvvv")); + EXPECT_THAT(normalizer_->NormalizeTerm("ŴẀẂẄẆẈẁẃẅẇẉŵ"), Eq("wwwwwwwwwwww")); + EXPECT_THAT(normalizer_->NormalizeTerm("ẊẌẋẍ"), Eq("xxxx")); + EXPECT_THAT(normalizer_->NormalizeTerm("ÝŶŸẎẏŷýÿ"), Eq("yyyyyyyy")); + EXPECT_THAT(normalizer_->NormalizeTerm("ŹŻŽẐẒẔẑẓẕźżž"), + Eq("zzzzzzzzzzzz")); +} + +// Accent / diacritic marks won't be removed in non-latin chars, e.g. in +// Japanese and Greek +TEST_F(IcuNormalizerTest, NonLatinLetterNotRemoveAccent) { + // Katakana + EXPECT_THAT(normalizer_->NormalizeTerm("ダヂヅデド"), Eq("ダヂヅデド")); + // Greek + EXPECT_THAT(normalizer_->NormalizeTerm("kαλημέρα"), Eq("kαλημέρα")); + EXPECT_THAT(normalizer_->NormalizeTerm("εγγραφή"), Eq("εγγραφή")); + + // Our current ICU rules can't handle Hebrew properly, e.g. the accents in + // "אָלֶף־בֵּית עִבְרִי" + // will be removed. + // TODO (samzheng): figure out how we should handle Hebrew. 
+} + +TEST_F(IcuNormalizerTest, FullWidthCharsToASCII) { + // Full-width punctuation to ASCII punctuation + EXPECT_THAT(normalizer_->NormalizeTerm("‘’.,!?:“”"), Eq("''.,!?:\"\"")); + // Full-width 0-9 + EXPECT_THAT(normalizer_->NormalizeTerm("0123456789"), + Eq("0123456789")); + // Full-width A-Z + EXPECT_THAT(normalizer_->NormalizeTerm( + "ABCDEFGHIJKLMNOPQRSTUVWXYZ"), + Eq("abcdefghijklmnopqrstuvwxyz")); + // Full-width a-z + EXPECT_THAT(normalizer_->NormalizeTerm( + "abcdefghijklmnopqrstuvwxyz"), + Eq("abcdefghijklmnopqrstuvwxyz")); +} + +TEST_F(IcuNormalizerTest, IdeographicToASCII) { + ICING_ASSERT_OK_AND_ASSIGN(auto normalizer, normalizer_factory::Create( + /*max_term_byte_size=*/1000)); + + EXPECT_THAT(normalizer->NormalizeTerm(",。"), Eq(",.")); +} + +// For Katakana, each character is normalized to its full-width version. +TEST_F(IcuNormalizerTest, KatakanaHalfWidthToFullWidth) { + EXPECT_THAT(normalizer_->NormalizeTerm("カ"), Eq("カ")); + EXPECT_THAT(normalizer_->NormalizeTerm("ォ"), Eq("ォ")); + EXPECT_THAT(normalizer_->NormalizeTerm("サ"), Eq("サ")); + EXPECT_THAT(normalizer_->NormalizeTerm("ホ"), Eq("ホ")); +} + +TEST_F(IcuNormalizerTest, HiraganaToKatakana) { + EXPECT_THAT(normalizer_->NormalizeTerm("あいうえお"), Eq("アイウエオ")); + EXPECT_THAT(normalizer_->NormalizeTerm("かきくけこ"), Eq("カキクケコ")); + EXPECT_THAT(normalizer_->NormalizeTerm("さしすせそ"), Eq("サシスセソ")); + EXPECT_THAT(normalizer_->NormalizeTerm("たちつてと"), Eq("タチツテト")); + EXPECT_THAT(normalizer_->NormalizeTerm("なにぬねの"), Eq("ナニヌネノ")); + EXPECT_THAT(normalizer_->NormalizeTerm("はひふへほ"), Eq("ハヒフヘホ")); + EXPECT_THAT(normalizer_->NormalizeTerm("まみむめも"), Eq("マミムメモ")); + EXPECT_THAT(normalizer_->NormalizeTerm("やゆよ"), Eq("ヤユヨ")); + EXPECT_THAT(normalizer_->NormalizeTerm("らりるれろ"), Eq("ラリルレロ")); + EXPECT_THAT(normalizer_->NormalizeTerm("わゐゑを"), Eq("ワヰヱヲ")); + EXPECT_THAT(normalizer_->NormalizeTerm("ん"), Eq("ン")); + EXPECT_THAT(normalizer_->NormalizeTerm("がぎぐげご"), Eq("ガギグゲゴ")); + EXPECT_THAT(normalizer_->NormalizeTerm("ざじずぜぞ"), 
Eq("ザジズゼゾ")); + EXPECT_THAT(normalizer_->NormalizeTerm("だぢづでど"), Eq("ダヂヅデド")); + EXPECT_THAT(normalizer_->NormalizeTerm("ばびぶべぼ"), Eq("バビブベボ")); + EXPECT_THAT(normalizer_->NormalizeTerm("ぱぴぷぺぽ"), Eq("パピプペポ")); +} + +TEST_F(IcuNormalizerTest, SuperscriptAndSubscriptToASCII) { + EXPECT_THAT(normalizer_->NormalizeTerm("⁹"), Eq("9")); + EXPECT_THAT(normalizer_->NormalizeTerm("₉"), Eq("9")); +} + +TEST_F(IcuNormalizerTest, CircledCharsToASCII) { + EXPECT_THAT(normalizer_->NormalizeTerm("①"), Eq("1")); + EXPECT_THAT(normalizer_->NormalizeTerm("Ⓐ"), Eq("a")); +} + +TEST_F(IcuNormalizerTest, RotatedCharsToASCII) { + EXPECT_THAT(normalizer_->NormalizeTerm("︷"), Eq("{")); + EXPECT_THAT(normalizer_->NormalizeTerm("︸"), Eq("}")); +} + +TEST_F(IcuNormalizerTest, SquaredCharsToASCII) { + EXPECT_THAT(normalizer_->NormalizeTerm("㌀"), Eq("アパート")); +} + +TEST_F(IcuNormalizerTest, FractionsToASCII) { + EXPECT_THAT(normalizer_->NormalizeTerm("¼"), Eq(" 1/4")); + EXPECT_THAT(normalizer_->NormalizeTerm("⅚"), Eq(" 5/6")); +} + +TEST_F(IcuNormalizerTest, Truncate) { + { + ICING_ASSERT_OK_AND_ASSIGN(auto normalizer, normalizer_factory::Create( + /*max_term_byte_size=*/5)); + + // Won't be truncated + EXPECT_THAT(normalizer->NormalizeTerm("hi"), Eq("hi")); + EXPECT_THAT(normalizer->NormalizeTerm("hello"), Eq("hello")); + + // Truncated to length 5. + EXPECT_THAT(normalizer->NormalizeTerm("hello!"), Eq("hello")); + + // Each Japanese character has 3 bytes, so truncating to length 5 results in + // only 1 character. + EXPECT_THAT(normalizer->NormalizeTerm("キャンパス"), Eq("キ")); + + // Each Greek character has 2 bytes, so truncating to length 5 results in 2 + // character. + EXPECT_THAT(normalizer->NormalizeTerm("αβγδε"), Eq("αβ")); + } + + { + ICING_ASSERT_OK_AND_ASSIGN(auto normalizer, normalizer_factory::Create( + /*max_term_byte_size=*/2)); + // The Japanese character has 3 bytes, truncating it results in an empty + // string. 
+ EXPECT_THAT(normalizer->NormalizeTerm("キ"), Eq("")); + } +} + +} // namespace +} // namespace lib +} // namespace icing diff --git a/icing/transform/map/map-normalizer-factory.cc b/icing/transform/map/map-normalizer-factory.cc new file mode 100644 index 0000000..3bf84b3 --- /dev/null +++ b/icing/transform/map/map-normalizer-factory.cc @@ -0,0 +1,48 @@ +// Copyright (C) 2019 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include <memory> + +#include "icing/text_classifier/lib3/utils/base/statusor.h" +#include "icing/absl_ports/canonical_errors.h" +#include "icing/transform/map/map-normalizer.h" +#include "icing/transform/normalizer.h" +#include "icing/util/status-macros.h" + +namespace icing { +namespace lib { + +namespace normalizer_factory { + +// Creates a map-based normalizer. max_term_byte_size enforces the max size of +// text after normalization, text will be truncated if exceeds the max size. 
+// +// Returns: +// A normalizer on success +// INVALID_ARGUMENT if max_term_byte_size <= 0 +// INTERNAL_ERROR on errors +libtextclassifier3::StatusOr<std::unique_ptr<Normalizer>> Create( + int max_term_byte_size) { + if (max_term_byte_size <= 0) { + return absl_ports::InvalidArgumentError( + "max_term_byte_size must be greater than zero."); + } + + return std::make_unique<MapNormalizer>(max_term_byte_size); +} + +} // namespace normalizer_factory + +} // namespace lib +} // namespace icing diff --git a/icing/transform/map/map-normalizer.cc b/icing/transform/map/map-normalizer.cc new file mode 100644 index 0000000..c888551 --- /dev/null +++ b/icing/transform/map/map-normalizer.cc @@ -0,0 +1,86 @@ +// Copyright (C) 2019 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "icing/transform/map/map-normalizer.h" + +#include <ctype.h> + +#include <string> +#include <string_view> +#include <unordered_map> +#include <utility> + +#include "icing/absl_ports/str_cat.h" +#include "icing/transform/map/normalization-map.h" +#include "icing/util/i18n-utils.h" +#include "icing/util/logging.h" +#include "unicode/utypes.h" + +namespace icing { +namespace lib { + +std::string MapNormalizer::NormalizeTerm(std::string_view term) const { + std::string normalized_text; + normalized_text.reserve(term.length()); + + for (int i = 0; i < term.length(); ++i) { + if (i18n_utils::IsAscii(term[i])) { + // The original character has 1 byte. 
+ normalized_text.push_back(std::tolower(term[i])); + } else if (i18n_utils::IsLeadUtf8Byte(term[i])) { + UChar32 uchar32 = i18n_utils::GetUChar32At(term.data(), term.length(), i); + if (uchar32 == i18n_utils::kInvalidUChar32) { + ICING_LOG(WARNING) << "Unable to get uchar32 from " << term + << " at position" << i; + continue; + } + int utf8_length = i18n_utils::GetUtf8Length(uchar32); + if (i18n_utils::GetUtf16Length(uchar32) > 1) { + // All the characters we need to normalize can be encoded into a + // single char16_t. If this character needs more than 1 char16_t code + // unit, we can skip normalization and append it directly. + absl_ports::StrAppend(&normalized_text, term.substr(i, utf8_length)); + continue; + } + // The original character can be encoded into a single char16_t. + const std::unordered_map<char16_t, char16_t>& normalization_map = + GetNormalizationMap(); + auto iterator = normalization_map.find(static_cast<char16_t>(uchar32)); + if (iterator != normalization_map.end()) { + // Found a normalization mapping. The normalized character (stored in a + // char16_t) can have 1 or 2 bytes. + if (i18n_utils::IsAscii(iterator->second)) { + // The normalized character has 1 byte. + normalized_text.push_back( + std::tolower(static_cast<char>(iterator->second))); + } else { + // The normalized character has 2 bytes. + i18n_utils::AppendUchar32ToUtf8(&normalized_text, iterator->second); + } + } else { + // Normalization mapping not found, append the original character. 
+ absl_ports::StrAppend(&normalized_text, term.substr(i, utf8_length)); + } + } + } + + if (normalized_text.length() > max_term_byte_size_) { + i18n_utils::SafeTruncateUtf8(&normalized_text, max_term_byte_size_); + } + + return normalized_text; +} + +} // namespace lib +} // namespace icing diff --git a/icing/transform/map/map-normalizer.h b/icing/transform/map/map-normalizer.h new file mode 100644 index 0000000..f9c0e42 --- /dev/null +++ b/icing/transform/map/map-normalizer.h @@ -0,0 +1,50 @@ +// Copyright (C) 2019 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef ICING_TRANSFORM_MAP_MAP_NORMALIZER_H_ +#define ICING_TRANSFORM_MAP_MAP_NORMALIZER_H_ + +#include <string> +#include <string_view> + +#include "icing/transform/normalizer.h" + +namespace icing { +namespace lib { + +class MapNormalizer : public Normalizer { + public: + explicit MapNormalizer(int max_term_byte_size) + : max_term_byte_size_(max_term_byte_size){}; + + // Normalizes the input term based on character mappings. The mappings + // contain the following categories: + // - Uppercase -> lowercase + // - Hiragana -> Katakana + // - Common full-width characters -> ASCII + // - Common ideographic punctuation marks -> ASCII + // - Common diacritic Latin characters -> ASCII + // + // Read more mapping details in normalization-map.cc + std::string NormalizeTerm(std::string_view term) const override; + + private: + // The maximum term length allowed after normalization. 
+ int max_term_byte_size_; +}; + +} // namespace lib +} // namespace icing + +#endif // ICING_TRANSFORM_MAP_MAP_NORMALIZER_H_ diff --git a/icing/transform/map/map-normalizer_benchmark.cc b/icing/transform/map/map-normalizer_benchmark.cc new file mode 100644 index 0000000..691afc6 --- /dev/null +++ b/icing/transform/map/map-normalizer_benchmark.cc @@ -0,0 +1,149 @@ +// Copyright (C) 2019 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include <memory> + +#include "testing/base/public/benchmark.h" +#include "icing/testing/common-matchers.h" +#include "icing/transform/normalizer-factory.h" +#include "icing/transform/normalizer.h" + +// Run on a Linux workstation: +// $ blaze build -c opt --dynamic_mode=off --copt=-gmlt +// //icing/transform/map:map-normalizer_benchmark +// +// $ blaze-bin/icing/transform/map/map-normalizer_benchmark +// --benchmarks=all +// +// Run on an Android device: +// $ blaze build --copt="-DGOOGLE_COMMANDLINEFLAGS_FULL_API=1" +// --config=android_arm64 -c opt --dynamic_mode=off --copt=-gmlt +// //icing/transform/map:map-normalizer_benchmark +// +// $ adb push +// blaze-bin/icing/transform/map/map-normalizer_benchmark +// /data/local/tmp/ +// +// $ adb shell /data/local/tmp/map-normalizer_benchmark --benchmarks=all +namespace icing { +namespace lib { + +namespace { + +void BM_NormalizeUppercase(benchmark::State& state) { + ICING_ASSERT_OK_AND_ASSIGN( + std::unique_ptr<Normalizer> normalizer, + normalizer_factory::Create( + 
/*max_term_byte_size=*/std::numeric_limits<int>::max())); + + std::string input_string(state.range(0), 'A'); + + // Warms up. map-normalizer may need to load a static map when being invoked + // the first time. It takes about 0.05ms on a Pixel3 XL. + normalizer->NormalizeTerm(input_string); + + for (auto _ : state) { + normalizer->NormalizeTerm(input_string); + } +} +BENCHMARK(BM_NormalizeUppercase) + ->Arg(1000) + ->Arg(2000) + ->Arg(4000) + ->Arg(8000) + ->Arg(16000) + ->Arg(32000) + ->Arg(64000) + ->Arg(128000) + ->Arg(256000) + ->Arg(384000) + ->Arg(512000) + ->Arg(1024000) + ->Arg(2048000) + ->Arg(4096000); + +void BM_NormalizeAccent(benchmark::State& state) { + ICING_ASSERT_OK_AND_ASSIGN( + std::unique_ptr<Normalizer> normalizer, + normalizer_factory::Create( + /*max_term_byte_size=*/std::numeric_limits<int>::max())); + + std::string input_string; + while (input_string.length() < state.range(0)) { + input_string.append("àáâãā"); + } + + // Warms up. map-normalizer may need to load a static map when being invoked + // the first time. It takes about 0.05ms on a Pixel3 XL. + normalizer->NormalizeTerm(input_string); + + for (auto _ : state) { + normalizer->NormalizeTerm(input_string); + } +} +BENCHMARK(BM_NormalizeAccent) + ->Arg(1000) + ->Arg(2000) + ->Arg(4000) + ->Arg(8000) + ->Arg(16000) + ->Arg(32000) + ->Arg(64000) + ->Arg(128000) + ->Arg(256000) + ->Arg(384000) + ->Arg(512000) + ->Arg(1024000) + ->Arg(2048000) + ->Arg(4096000); + +void BM_NormalizeHiragana(benchmark::State& state) { + ICING_ASSERT_OK_AND_ASSIGN( + std::unique_ptr<Normalizer> normalizer, + normalizer_factory::Create( + /*max_term_byte_size=*/std::numeric_limits<int>::max())); + + std::string input_string; + while (input_string.length() < state.range(0)) { + input_string.append("あいうえお"); + } + + // Warms up. map-normalizer may need to load a static map when being invoked + // the first time. It takes about 0.05ms on a Pixel3 XL. 
+ normalizer->NormalizeTerm(input_string); + + for (auto _ : state) { + normalizer->NormalizeTerm(input_string); + } +} +BENCHMARK(BM_NormalizeHiragana) + ->Arg(1000) + ->Arg(2000) + ->Arg(4000) + ->Arg(8000) + ->Arg(16000) + ->Arg(32000) + ->Arg(64000) + ->Arg(128000) + ->Arg(256000) + ->Arg(384000) + ->Arg(512000) + ->Arg(1024000) + ->Arg(2048000) + ->Arg(4096000); + +} // namespace + +} // namespace lib +} // namespace icing diff --git a/icing/transform/map/map-normalizer_test.cc b/icing/transform/map/map-normalizer_test.cc new file mode 100644 index 0000000..b62ae0e --- /dev/null +++ b/icing/transform/map/map-normalizer_test.cc @@ -0,0 +1,205 @@ +// Copyright (C) 2019 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include <memory> +#include <string> + +#include "icing/text_classifier/lib3/utils/base/status.h" +#include "icing/text_classifier/lib3/utils/base/statusor.h" +#include "gmock/gmock.h" +#include "gtest/gtest.h" +#include "icing/testing/common-matchers.h" +#include "icing/testing/icu-i18n-test-utils.h" +#include "icing/transform/normalizer-factory.h" +#include "icing/transform/normalizer.h" + +namespace icing { +namespace lib { + +namespace { +using ::testing::Eq; + +TEST(MapNormalizerTest, Creation) { + EXPECT_THAT(normalizer_factory::Create( + /*max_term_byte_size=*/5), + IsOk()); + EXPECT_THAT(normalizer_factory::Create( + /*max_term_byte_size=*/0), + StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT)); + EXPECT_THAT(normalizer_factory::Create( + /*max_term_byte_size=*/-1), + StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT)); +} + +// Strings that are already normalized won't change if normalized again. +TEST(MapNormalizerTest, AlreadyNormalized) { + ICING_ASSERT_OK_AND_ASSIGN(auto normalizer, normalizer_factory::Create( + /*max_term_byte_size=*/1000)); + + EXPECT_THAT(normalizer->NormalizeTerm(""), Eq("")); + EXPECT_THAT(normalizer->NormalizeTerm("hello world"), Eq("hello world")); + EXPECT_THAT(normalizer->NormalizeTerm("你好"), Eq("你好")); + EXPECT_THAT(normalizer->NormalizeTerm("キャンパス"), Eq("キャンパス")); + EXPECT_THAT(normalizer->NormalizeTerm("안녕하세요"), Eq("안녕하세요")); +} + +TEST(MapNormalizerTest, UppercaseToLowercase) { + ICING_ASSERT_OK_AND_ASSIGN(auto normalizer, normalizer_factory::Create( + /*max_term_byte_size=*/1000)); + + EXPECT_THAT(normalizer->NormalizeTerm("MDI"), Eq("mdi")); + EXPECT_THAT(normalizer->NormalizeTerm("Icing"), Eq("icing")); +} + +TEST(MapNormalizerTest, LatinLetterRemoveAccent) { + ICING_ASSERT_OK_AND_ASSIGN(auto normalizer, normalizer_factory::Create( + /*max_term_byte_size=*/1000)); + + EXPECT_THAT(normalizer->NormalizeTerm("Zürich"), Eq("zurich")); + EXPECT_THAT(normalizer->NormalizeTerm("après-midi"), 
Eq("apres-midi")); + EXPECT_THAT(normalizer->NormalizeTerm("Buenos días"), Eq("buenos dias")); + EXPECT_THAT(normalizer->NormalizeTerm("ÀÁÂÃÄÅĀĂĄḀḁàáâãäåāăą"), + Eq("aaaaaaaaaaaaaaaaaaaa")); + EXPECT_THAT(normalizer->NormalizeTerm("ḂḄḆḃḅḇ"), Eq("bbbbbb")); + EXPECT_THAT(normalizer->NormalizeTerm("ÇĆĈĊČḈḉćĉċčç"), Eq("cccccccccccc")); + EXPECT_THAT(normalizer->NormalizeTerm("ÐĎĐḊḌḎḐḒḋḍḏḑḓďđ"), + Eq("ddddddddddddddd")); + EXPECT_THAT(normalizer->NormalizeTerm("ÈÉÊËĒĔĖĘḔḖḘḚḜḕḗḙḛḝèéêëēĕėęě"), + Eq("eeeeeeeeeeeeeeeeeeeeeeeeeee")); + EXPECT_THAT(normalizer->NormalizeTerm("Ḟḟ"), Eq("ff")); + EXPECT_THAT(normalizer->NormalizeTerm("ĜĞĠĢḠḡĝğġģ"), Eq("gggggggggg")); + EXPECT_THAT(normalizer->NormalizeTerm("ĤḢḤḦḨḪḣḥḧḩḫĥẖ"), Eq("hhhhhhhhhhhhh")); + EXPECT_THAT(normalizer->NormalizeTerm("ÌÍÎÏĨĪĬḬḭḯìíîïĩīĭ"), + Eq("iiiiiiiiiiiiiiiii")); + EXPECT_THAT(normalizer->NormalizeTerm("Ĵĵ"), Eq("jj")); + EXPECT_THAT(normalizer->NormalizeTerm("ĶḰḲḴḵḱḳķ"), Eq("kkkkkkkk")); + EXPECT_THAT(normalizer->NormalizeTerm("ĹĻĽḶḸḼḷḹḻḽĺļľ"), Eq("lllllllllllll")); + EXPECT_THAT(normalizer->NormalizeTerm("ḾṀṂḿṁṃ"), Eq("mmmmmm")); + EXPECT_THAT(normalizer->NormalizeTerm("ÑŃŅŇṄṆṈṊṅṇṉṋñńņň"), + Eq("nnnnnnnnnnnnnnnn")); + EXPECT_THAT(normalizer->NormalizeTerm("ŌŎŐÒÓÔÕÖṌṎṐṒṍṏṑṓòóôõöōŏő"), + Eq("oooooooooooooooooooooooo")); + EXPECT_THAT(normalizer->NormalizeTerm("ṔṖṕṗ"), Eq("pppp")); + EXPECT_THAT(normalizer->NormalizeTerm("ŔŖŘṘṚṜṞṙṛṝṟŕŗř"), + Eq("rrrrrrrrrrrrrr")); + EXPECT_THAT(normalizer->NormalizeTerm("ŚŜŞŠȘṠṢṤṦṨṡṣṥṧṩșśŝşš"), + Eq("ssssssssssssssssssss")); + EXPECT_THAT(normalizer->NormalizeTerm("ŢŤȚṪṬṮṰṫṭṯṱțţť"), + Eq("tttttttttttttt")); + EXPECT_THAT(normalizer->NormalizeTerm("ŨŪŬÙÚÛÜṲṴṶṸṺṳṵṷṹṻùúûüũūŭ"), + Eq("uuuuuuuuuuuuuuuuuuuuuuuu")); + EXPECT_THAT(normalizer->NormalizeTerm("ṼṾṽṿ"), Eq("vvvv")); + EXPECT_THAT(normalizer->NormalizeTerm("ŴẀẂẄẆẈẁẃẅẇẉŵ"), Eq("wwwwwwwwwwww")); + EXPECT_THAT(normalizer->NormalizeTerm("ẊẌẋẍ"), Eq("xxxx")); + EXPECT_THAT(normalizer->NormalizeTerm("ÝŶŸẎẏŷýÿ"), 
Eq("yyyyyyyy")); + EXPECT_THAT(normalizer->NormalizeTerm("ŹŻŽẐẒẔẑẓẕźżž"), Eq("zzzzzzzzzzzz")); +} + +// Accent / diacritic marks won't be removed in non-latin chars, e.g. in +// Japanese and Greek +TEST(MapNormalizerTest, NonLatinLetterNotRemoveAccent) { + ICING_ASSERT_OK_AND_ASSIGN(auto normalizer, normalizer_factory::Create( + /*max_term_byte_size=*/1000)); + + // Katakana + EXPECT_THAT(normalizer->NormalizeTerm("ダヂヅデド"), Eq("ダヂヅデド")); + // Greek + EXPECT_THAT(normalizer->NormalizeTerm("kαλημέρα"), Eq("kαλημέρα")); + EXPECT_THAT(normalizer->NormalizeTerm("εγγραφή"), Eq("εγγραφή")); + // Hebrew + EXPECT_THAT(normalizer->NormalizeTerm("אָלֶף־בֵּית עִבְרִי"), Eq("אָלֶף־בֵּית עִבְרִי")); +} + +TEST(MapNormalizerTest, FullWidthCharsToASCII) { + ICING_ASSERT_OK_AND_ASSIGN(auto normalizer, normalizer_factory::Create( + /*max_term_byte_size=*/1000)); + + // Full-width punctuation to ASCII punctuation + EXPECT_THAT(normalizer->NormalizeTerm("‘’.,!?:“”"), Eq("''.,!?:\"\"")); + // Full-width 0-9 + EXPECT_THAT(normalizer->NormalizeTerm("0123456789"), + Eq("0123456789")); + // Full-width A-Z + EXPECT_THAT(normalizer->NormalizeTerm( + "ABCDEFGHIJKLMNOPQRSTUVWXYZ"), + Eq("abcdefghijklmnopqrstuvwxyz")); + // Full-width a-z + EXPECT_THAT(normalizer->NormalizeTerm( + "abcdefghijklmnopqrstuvwxyz"), + Eq("abcdefghijklmnopqrstuvwxyz")); +} + +TEST(MapNormalizerTest, IdeographicToASCII) { + ICING_ASSERT_OK_AND_ASSIGN(auto normalizer, normalizer_factory::Create( + /*max_term_byte_size=*/1000)); + + EXPECT_THAT(normalizer->NormalizeTerm(",。"), Eq(",.")); +} + +TEST(MapNormalizerTest, HiraganaToKatakana) { + ICING_ASSERT_OK_AND_ASSIGN(auto normalizer, normalizer_factory::Create( + /*max_term_byte_size=*/1000)); + + EXPECT_THAT(normalizer->NormalizeTerm("あいうえお"), Eq("アイウエオ")); + EXPECT_THAT(normalizer->NormalizeTerm("かきくけこ"), Eq("カキクケコ")); + EXPECT_THAT(normalizer->NormalizeTerm("さしすせそ"), Eq("サシスセソ")); + EXPECT_THAT(normalizer->NormalizeTerm("たちつてと"), Eq("タチツテト")); + 
EXPECT_THAT(normalizer->NormalizeTerm("なにぬねの"), Eq("ナニヌネノ")); + EXPECT_THAT(normalizer->NormalizeTerm("はひふへほ"), Eq("ハヒフヘホ")); + EXPECT_THAT(normalizer->NormalizeTerm("まみむめも"), Eq("マミムメモ")); + EXPECT_THAT(normalizer->NormalizeTerm("やゆよ"), Eq("ヤユヨ")); + EXPECT_THAT(normalizer->NormalizeTerm("らりるれろ"), Eq("ラリルレロ")); + EXPECT_THAT(normalizer->NormalizeTerm("わゐゑを"), Eq("ワヰヱヲ")); + EXPECT_THAT(normalizer->NormalizeTerm("ん"), Eq("ン")); + EXPECT_THAT(normalizer->NormalizeTerm("がぎぐげご"), Eq("ガギグゲゴ")); + EXPECT_THAT(normalizer->NormalizeTerm("ざじずぜぞ"), Eq("ザジズゼゾ")); + EXPECT_THAT(normalizer->NormalizeTerm("だぢづでど"), Eq("ダヂヅデド")); + EXPECT_THAT(normalizer->NormalizeTerm("ばびぶべぼ"), Eq("バビブベボ")); + EXPECT_THAT(normalizer->NormalizeTerm("ぱぴぷぺぽ"), Eq("パピプペポ")); +} + +TEST(MapNormalizerTest, Truncate) { + { + ICING_ASSERT_OK_AND_ASSIGN(auto normalizer, normalizer_factory::Create( + /*max_term_byte_size=*/5)); + + // Won't be truncated + EXPECT_THAT(normalizer->NormalizeTerm("hi"), Eq("hi")); + EXPECT_THAT(normalizer->NormalizeTerm("hello"), Eq("hello")); + + // Truncated to length 5. + EXPECT_THAT(normalizer->NormalizeTerm("hello!"), Eq("hello")); + + // Each Japanese character has 3 bytes, so truncating to length 5 results in + // only 1 character. + EXPECT_THAT(normalizer->NormalizeTerm("キャンパス"), Eq("キ")); + + // Each Greek character has 2 bytes, so truncating to length 5 results in 2 + // character. + EXPECT_THAT(normalizer->NormalizeTerm("αβγδε"), Eq("αβ")); + } + + { + ICING_ASSERT_OK_AND_ASSIGN(auto normalizer, normalizer_factory::Create( + /*max_term_byte_size=*/2)); + // The Japanese character has 3 bytes, truncating it results in an empty + // string. 
+ EXPECT_THAT(normalizer->NormalizeTerm("キ"), Eq("")); + } +} + +} // namespace + +} // namespace lib +} // namespace icing diff --git a/icing/transform/map/normalization-map.cc b/icing/transform/map/normalization-map.cc new file mode 100644 index 0000000..c318036 --- /dev/null +++ b/icing/transform/map/normalization-map.cc @@ -0,0 +1,712 @@ +// Copyright (C) 2019 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "icing/transform/map/normalization-map.h" + +#include <cstdint> +#include "icing/legacy/core/icing-packed-pod.h" + +namespace icing { +namespace lib { + +namespace { +// A pair representing the mapping of the 'from' character to 'to' character. +struct NormalizationPair { + // All the mapped characters can be stored in 2 bytes. + char16_t from; + char16_t to; +} __attribute__((packed)); + +// The following mappings contain multiple categories: +// 1. Hiragana -> Katakana, listed in the order of Hiragana chart rows. +// All regular and small Hiragana characters are mapped to Katakana. Note +// that half-width Katakana characters are not handled here. +// 2. Common full-width characters -> ASCII characters. +// Full-width characters in the Unicode range of [0xff01, 0xff5e] are mapped +// to the corresponding ASCII forms. +// 3. Common ideographic punctuation marks -> ASCII characters. +// Ideographic characters are in the Unicode range of [0x3000, 0x303f]. 
Here +// we list two that are frequently used in CJK and can be converted to ASCII. +// 4. Common diacritic Latin characters -> ASCII characters. +// We list most diacritic Latin characters within the Unicode range of +// [0x00c0, 0x017e], some from [0x01a0, 0x021b], and most from [0x1e00, +// 0x1ef9]. +// +// All the characters can be stored in a single UTF16 code unit, so we use +// char16_t to store them. Size of the following array is about 2.5KiB. +constexpr NormalizationPair kNormalizationMappings[] = { + // Part 1: Hiragana -> Katakana + // 'a' row + {0x3042, 0x30a2}, // Hiragana letter A -> Katakana letter A + {0x3044, 0x30a4}, // Hiragana letter I -> Katakana letter I + {0x3046, 0x30a6}, // Hiragana letter U -> Katakana letter U + {0x3048, 0x30a8}, // Hiragana letter E -> Katakana letter E + {0x304a, 0x30aa}, // Hiragana letter O -> Katakana letter O + {0x3041, 0x30a2}, // Hiragana letter small A -> Katakana letter A + {0x3043, 0x30a4}, // Hiragana letter small I -> Katakana letter I + {0x3045, 0x30a6}, // Hiragana letter small U -> Katakana letter U + {0x3047, 0x30a8}, // Hiragana letter small E -> Katakana letter E + {0x3049, 0x30aa}, // Hiragana letter small O -> Katakana letter O + // 'ka' row + {0x304b, 0x30ab}, // Hiragana letter KA -> Katakana letter KA + {0x304d, 0x30ad}, // Hiragana letter KI -> Katakana letter KI + {0x304f, 0x30af}, // Hiragana letter KU -> Katakana letter KU + {0x3051, 0x30b1}, // Hiragana letter KE -> Katakana letter KE + {0x3053, 0x30b3}, // Hiragana letter KO -> Katakana letter KO + {0x3095, 0x30ab}, // Hiragana letter small KA -> Katakana letter KA + {0x3096, 0x30b1}, // Hiragana letter small KE -> Katakana letter KE + // 'sa' row + {0x3055, 0x30b5}, // Hiragana letter SA -> Katakana letter SA + {0x3057, 0x30b7}, // Hiragana letter SI -> Katakana letter SI + {0x3059, 0x30b9}, // Hiragana letter SU -> Katakana letter SU + {0x305b, 0x30bb}, // Hiragana letter SE -> Katakana letter SE + {0x305d, 0x30bd}, // Hiragana letter SO 
-> Katakana letter SO + // 'ta' row + {0x305f, 0x30bf}, // Hiragana letter TA -> Katakana letter TA + {0x3061, 0x30c1}, // Hiragana letter TI -> Katakana letter TI + {0x3063, 0x30c4}, // Hiragana letter small TU -> Katakana letter TU + {0x3064, 0x30c4}, // Hiragana letter TU -> Katakana letter TU + {0x3066, 0x30c6}, // Hiragana letter TE -> Katakana letter TE + {0x3068, 0x30c8}, // Hiragana letter TO -> Katakana letter TO + // 'na' row + {0x306a, 0x30ca}, // Hiragana letter NA -> Katakana letter NA + {0x306b, 0x30cb}, // Hiragana letter NI -> Katakana letter NI + {0x306c, 0x30cc}, // Hiragana letter NU -> Katakana letter NU + {0x306d, 0x30cd}, // Hiragana letter NE -> Katakana letter NE + {0x306e, 0x30ce}, // Hiragana letter NO -> Katakana letter NO + // 'ha' row + {0x306f, 0x30cf}, // Hiragana letter HA -> Katakana letter HA + {0x3072, 0x30d2}, // Hiragana letter HI -> Katakana letter HI + {0x3075, 0x30d5}, // Hiragana letter HU -> Katakana letter HU + {0x3078, 0x30d8}, // Hiragana letter HE -> Katakana letter HE + {0x307b, 0x30db}, // Hiragana letter HO -> Katakana letter HO + // 'ma' row + {0x307e, 0x30de}, // Hiragana letter MA -> Katakana letter MA + {0x307f, 0x30df}, // Hiragana letter MI -> Katakana letter MI + {0x3080, 0x30e0}, // Hiragana letter MU -> Katakana letter MU + {0x3081, 0x30e1}, // Hiragana letter ME -> Katakana letter ME + {0x3082, 0x30e2}, // Hiragana letter MO -> Katakana letter MO + // 'ya' row + {0x3083, 0x30e4}, // Hiragana letter small YA -> Katakana letter YA + {0x3084, 0x30e4}, // Hiragana letter YA -> Katakana letter YA + {0x3085, 0x30e6}, // Hiragana letter small YU -> Katakana letter YU + {0x3086, 0x30e6}, // Hiragana letter YU -> Katakana letter YU + {0x3087, 0x30e8}, // Hiragana letter small YO -> Katakana letter YO + {0x3088, 0x30e8}, // Hiragana letter YO -> Katakana letter YO + // 'ra' row + {0x3089, 0x30e9}, // Hiragana letter RA -> Katakana letter RA + {0x308a, 0x30ea}, // Hiragana letter RI -> Katakana letter RI + {0x308b, 
0x30eb}, // Hiragana letter RU -> Katakana letter RU + {0x308c, 0x30ec}, // Hiragana letter RE -> Katakana letter RE + {0x308d, 0x30ed}, // Hiragana letter RO -> Katakana letter RO + // 'wa' row + {0x308e, 0x30ef}, // Hiragana letter small WA -> Katakana letter WA + {0x308f, 0x30ef}, // Hiragana letter WA -> Katakana letter WA + {0x3090, 0x30f0}, // Hiragana letter WI -> Katakana letter WI + {0x3091, 0x30f1}, // Hiragana letter WE -> Katakana letter WE + {0x3092, 0x30f2}, // Hiragana letter WO -> Katakana letter WO + // 'n' + {0x3093, 0x30f3}, // Hiragana letter N -> Katakana letter N + // 'ga' row + {0x304c, 0x30ac}, // Hiragana letter GA -> Katakana letter GA + {0x304e, 0x30ae}, // Hiragana letter GI -> Katakana letter GI + {0x3050, 0x30b0}, // Hiragana letter GU -> Katakana letter GU + {0x3052, 0x30b2}, // Hiragana letter GE -> Katakana letter GE + {0x3054, 0x30b4}, // Hiragana letter GO -> Katakana letter GO + // 'za' row + {0x3056, 0x30b6}, // Hiragana letter ZA -> Katakana letter ZA + {0x3058, 0x30b8}, // Hiragana letter ZI -> Katakana letter ZI + {0x305a, 0x30ba}, // Hiragana letter ZU -> Katakana letter ZU + {0x305c, 0x30bc}, // Hiragana letter ZE -> Katakana letter ZE + {0x305e, 0x30be}, // Hiragana letter ZO -> Katakana letter ZO + // 'da' row + {0x3060, 0x30c0}, // Hiragana letter DA -> Katakana letter DA + {0x3062, 0x30c2}, // Hiragana letter DI -> Katakana letter DI + {0x3065, 0x30c5}, // Hiragana letter DU -> Katakana letter DU + {0x3067, 0x30c7}, // Hiragana letter DE -> Katakana letter DE + {0x3069, 0x30c9}, // Hiragana letter DO -> Katakana letter DO + // 'ba' row + {0x3070, 0x30d0}, // Hiragana letter BA -> Katakana letter BA + {0x3073, 0x30d3}, // Hiragana letter BI -> Katakana letter BI + {0x3076, 0x30d6}, // Hiragana letter BU -> Katakana letter BU + {0x3079, 0x30d9}, // Hiragana letter BE -> Katakana letter BE + {0x307c, 0x30dc}, // Hiragana letter BO -> Katakana letter BO + // 'pa' row + {0x3071, 0x30d1}, // Hiragana letter PA -> Katakana 
letter PA + {0x3074, 0x30d4}, // Hiragana letter PI -> Katakana letter PI + {0x3077, 0x30d7}, // Hiragana letter PU -> Katakana letter PU + {0x307a, 0x30da}, // Hiragana letter PE -> Katakana letter PE + {0x307d, 0x30dd}, // Hiragana letter PO -> Katakana letter PO + // Additional Hiragana + {0x3094, 0x30f4}, // Hiragana letter VU -> Katakana letter VU + // Part 2: Common full-width characters -> ASCII characters. + {0xff01, 33}, // ASCII ! + {0xff02, 34}, // ASCII " + {0xff03, 35}, // ASCII # + {0xff04, 36}, // ASCII $ + {0xff05, 37}, // ASCII % + {0xff06, 38}, // ASCII & + {0xff07, 39}, // ASCII ' + {0xff08, 40}, // ASCII ( + {0xff09, 41}, // ASCII ) + {0xff0a, 42}, // ASCII * + {0xff0b, 43}, // ASCII + + {0xff0c, 44}, // ASCII , + {0xff0d, 45}, // ASCII - + {0xff0e, 46}, // ASCII . + {0xff0f, 47}, // ASCII / + {0xff10, 48}, // ASCII 0 + {0xff11, 49}, // ASCII 1 + {0xff12, 50}, // ASCII 2 + {0xff13, 51}, // ASCII 3 + {0xff14, 52}, // ASCII 4 + {0xff15, 53}, // ASCII 5 + {0xff16, 54}, // ASCII 6 + {0xff17, 55}, // ASCII 7 + {0xff18, 56}, // ASCII 8 + {0xff19, 57}, // ASCII 9 + {0xff1a, 58}, // ASCII : + {0xff1b, 59}, // ASCII ; + {0xff1c, 60}, // ASCII < + {0xff1d, 61}, // ASCII = + {0xff1e, 62}, // ASCII > + {0xff1f, 63}, // ASCII ? 
+ {0xff20, 64}, // ASCII @ + {0xff21, 65}, // ASCII A + {0xff22, 66}, // ASCII B + {0xff23, 67}, // ASCII C + {0xff24, 68}, // ASCII D + {0xff25, 69}, // ASCII E + {0xff26, 70}, // ASCII F + {0xff27, 71}, // ASCII G + {0xff28, 72}, // ASCII H + {0xff29, 73}, // ASCII I + {0xff2a, 74}, // ASCII J + {0xff2b, 75}, // ASCII K + {0xff2c, 76}, // ASCII L + {0xff2d, 77}, // ASCII M + {0xff2e, 78}, // ASCII N + {0xff2f, 79}, // ASCII O + {0xff30, 80}, // ASCII P + {0xff31, 81}, // ASCII Q + {0xff32, 82}, // ASCII R + {0xff33, 83}, // ASCII S + {0xff34, 84}, // ASCII T + {0xff35, 85}, // ASCII U + {0xff36, 86}, // ASCII V + {0xff37, 87}, // ASCII W + {0xff38, 88}, // ASCII X + {0xff39, 89}, // ASCII Y + {0xff3a, 90}, // ASCII Z + {0xff3b, 91}, // ASCII [ + {0xff3c, 92}, // ASCII forward slash + {0xff3d, 93}, // ASCII ] + {0xff3e, 94}, // ASCII ^ + {0xff3f, 95}, // ASCII _ + {0xff40, 96}, // ASCII ` + {0xff41, 97}, // ASCII a + {0xff42, 98}, // ASCII b + {0xff43, 99}, // ASCII c + {0xff44, 100}, // ASCII d + {0xff45, 101}, // ASCII e + {0xff46, 102}, // ASCII f + {0xff47, 103}, // ASCII g + {0xff48, 104}, // ASCII h + {0xff49, 105}, // ASCII i + {0xff4a, 106}, // ASCII j + {0xff4b, 107}, // ASCII k + {0xff4c, 108}, // ASCII l + {0xff4d, 109}, // ASCII m + {0xff4e, 110}, // ASCII n + {0xff4f, 111}, // ASCII o + {0xff50, 112}, // ASCII p + {0xff51, 113}, // ASCII q + {0xff52, 114}, // ASCII r + {0xff53, 115}, // ASCII s + {0xff54, 116}, // ASCII t + {0xff55, 117}, // ASCII u + {0xff56, 118}, // ASCII v + {0xff57, 119}, // ASCII w + {0xff58, 120}, // ASCII x + {0xff59, 121}, // ASCII y + {0xff5a, 122}, // ASCII z + {0xff5b, 123}, // ASCII { + {0xff5c, 124}, // ASCII | + {0xff5d, 125}, // ASCII } + {0xff5e, 126}, // ASCII ~ + {0x2018, 39}, // Left single quote -> ASCII apostrophe + {0x2019, 39}, // Right single quote -> ASCII apostrophe + {0x201c, 34}, // Left double quote -> ASCII quote + {0x201d, 34}, // Right double quote -> ASCII quote + // Part 3: Common ideographic 
punctuation marks -> ASCII. + // Usually used in CJK. + {0x3001, 44}, // ASCII , + {0x3002, 46}, // ASCII . + // Part 4: Common diacritic Latin characters -> ASCII characters. + {0x00c0, 65}, // À -> A + {0x00c1, 65}, // Á -> A + {0x00c2, 65}, // Â -> A + {0x00c3, 65}, // Ã -> A + {0x00c4, 65}, // Ä -> A + {0x00c5, 65}, // Å -> A + {0x00c7, 67}, // Ç -> C + {0x00c8, 69}, // È -> E + {0x00c9, 69}, // É -> E + {0x00ca, 69}, // Ê -> E + {0x00cb, 69}, // Ë -> E + {0x00cc, 73}, // Ì -> I + {0x00cd, 73}, // Í -> I + {0x00ce, 73}, // Î -> I + {0x00cf, 73}, // Ï -> I + {0x00d0, 68}, // Ð -> D + {0x00d1, 78}, // Ñ -> N + {0x00d2, 79}, // Ò -> O + {0x00d3, 79}, // Ó -> O + {0x00d4, 79}, // Ô -> O + {0x00d5, 79}, // Õ -> O + {0x00d6, 79}, // Ö -> O + {0x00d8, 79}, // Ø -> O + {0x00d9, 85}, // Ù -> U + {0x00da, 85}, // Ú -> U + {0x00db, 85}, // Û -> U + {0x00dc, 85}, // Ü -> U + {0x00dd, 89}, // Ý -> Y + {0x00e0, 97}, // à -> a + {0x00e1, 97}, // á -> a + {0x00e2, 97}, // â -> a + {0x00e3, 97}, // ã -> a + {0x00e4, 97}, // ä -> a + {0x00e5, 97}, // å -> a + {0x00e7, 99}, // ç -> c + {0x00e8, 101}, // è -> e + {0x00e9, 101}, // é -> e + {0x00ea, 101}, // ê -> e + {0x00eb, 101}, // ë -> e + {0x00ec, 105}, // ì -> i + {0x00ed, 105}, // í -> i + {0x00ee, 105}, // î -> i + {0x00ef, 105}, // ï -> i + {0x00f0, 100}, // ð -> d + {0x00f1, 110}, // ñ -> n + {0x00f2, 111}, // ò -> o + {0x00f3, 111}, // ó -> o + {0x00f4, 111}, // ô -> o + {0x00f5, 111}, // õ -> o + {0x00f6, 111}, // ö -> o + {0x00f8, 111}, // ø -> o + {0x00f9, 117}, // ù -> u + {0x00fa, 117}, // ú -> u + {0x00fb, 117}, // û -> u + {0x00fc, 117}, // ü -> u + {0x00fd, 121}, // ý -> y + {0x00ff, 121}, // ÿ -> y + {0x0100, 65}, // Ā -> A + {0x0101, 97}, // ā -> a + {0x0102, 65}, // Ă -> A + {0x0103, 97}, // ă -> a + {0x0104, 65}, // Ą -> A + {0x0105, 97}, // ą -> a + {0x0106, 67}, // Ć -> C + {0x0107, 99}, // ć -> c + {0x0108, 67}, // Ĉ -> C + {0x0109, 99}, // ĉ -> c + {0x010a, 67}, // Ċ -> C + {0x010b, 99}, // ċ -> c + 
{0x010c, 67}, // Č -> C + {0x010d, 99}, // č -> c + {0x010e, 68}, // Ď -> D + {0x010f, 100}, // ď -> d + {0x0110, 68}, // Đ -> D + {0x0111, 100}, // đ -> d + {0x0112, 69}, // Ē -> E + {0x0113, 101}, // ē -> e + {0x0114, 69}, // Ĕ -> E + {0x0115, 101}, // ĕ -> e + {0x0116, 69}, // Ė -> E + {0x0117, 101}, // ė -> e + {0x0118, 69}, // Ę -> E + {0x0119, 101}, // ę -> e + {0x011a, 69}, // Ě -> E + {0x011b, 101}, // ě -> e + {0x011c, 71}, // Ĝ -> G + {0x011d, 103}, // ĝ -> g + {0x011e, 71}, // Ğ -> G + {0x011f, 103}, // ğ -> g + {0x0120, 71}, // Ġ -> G + {0x0121, 103}, // ġ -> g + {0x0122, 71}, // Ģ -> G + {0x0123, 103}, // ģ -> g + {0x0124, 72}, // Ĥ -> H + {0x0125, 104}, // ĥ -> h + {0x0126, 72}, // Ħ -> H + {0x0127, 104}, // ħ -> h + {0x0128, 73}, // Ĩ -> I + {0x0129, 105}, // ĩ -> i + {0x012a, 73}, // Ī -> I + {0x012b, 105}, // ī -> i + {0x012c, 73}, // Ĭ -> I + {0x012d, 105}, // ĭ -> i + {0x012e, 73}, // Į -> I + {0x012f, 105}, // į -> i + {0x0130, 73}, // İ -> I + {0x0131, 105}, // ı -> i + {0x0134, 74}, // Ĵ -> J + {0x0135, 106}, // ĵ -> j + {0x0136, 75}, // Ķ -> K + {0x0137, 107}, // ķ -> k + {0x0139, 76}, // Ĺ -> L + {0x013a, 108}, // ĺ -> l + {0x013b, 76}, // Ļ -> L + {0x013c, 108}, // ļ -> l + {0x013d, 76}, // Ľ -> L + {0x013e, 108}, // ľ -> l + {0x013f, 76}, // Ŀ -> L + {0x0140, 108}, // ŀ -> l + {0x0141, 76}, // Ł -> L + {0x0142, 108}, // ł -> l + {0x0143, 78}, // Ń -> N + {0x0144, 110}, // ń -> n + {0x0145, 78}, // Ņ -> N + {0x0146, 110}, // ņ -> n + {0x0147, 78}, // Ň -> N + {0x0148, 110}, // ň -> n + {0x014a, 78}, // Ŋ -> N + {0x014b, 110}, // ŋ -> n + {0x014c, 79}, // Ō -> O + {0x014d, 111}, // ō -> o + {0x014e, 79}, // Ŏ -> O + {0x014f, 111}, // ŏ -> o + {0x0150, 79}, // Ő -> O + {0x0151, 111}, // ő -> o + {0x0154, 82}, // Ŕ -> R + {0x0155, 114}, // ŕ -> r + {0x0156, 82}, // Ŗ -> R + {0x0157, 114}, // ŗ -> r + {0x0158, 82}, // Ř -> R + {0x0159, 114}, // ř -> r + {0x015a, 83}, // Ś -> S + {0x015b, 115}, // ś -> s + {0x015c, 83}, // Ŝ -> S + {0x015d, 
115}, // ŝ -> s + {0x015e, 83}, // Ş -> S + {0x015f, 115}, // ş -> s + {0x0160, 83}, // Š -> S + {0x0161, 115}, // š -> s + {0x0162, 84}, // Ţ -> T + {0x0163, 116}, // ţ -> t + {0x0164, 84}, // Ť -> T + {0x0165, 116}, // ť -> t + {0x0166, 84}, // Ŧ -> T + {0x0167, 116}, // ŧ -> t + {0x0168, 85}, // Ũ -> U + {0x0169, 117}, // ũ -> u + {0x016a, 85}, // Ū -> U + {0x016b, 117}, // ū -> u + {0x016c, 85}, // Ŭ -> U + {0x016d, 117}, // ŭ -> u + {0x016e, 85}, // Ů -> U + {0x016f, 117}, // ů -> u + {0x0170, 85}, // Ű -> U + {0x0171, 117}, // ű -> u + {0x0172, 85}, // Ų -> U + {0x0173, 117}, // ų -> u + {0x0174, 87}, // Ŵ -> W + {0x0175, 119}, // ŵ -> w + {0x0176, 89}, // Ŷ -> Y + {0x0177, 121}, // ŷ -> y + {0x0178, 89}, // Ÿ -> Y + {0x0179, 90}, // Ź -> Z + {0x017a, 122}, // ź -> z + {0x017b, 90}, // Ż -> Z + {0x017c, 122}, // ż -> z + {0x017d, 90}, // Ž -> Z + {0x017e, 122}, // ž -> z + {0x01a0, 79}, // Ơ -> O + {0x01a1, 111}, // ơ -> o + {0x01af, 85}, // Ư -> U + {0x01b0, 117}, // ư -> u + {0x01b5, 90}, // Ƶ -> Z + {0x01b6, 122}, // ƶ -> z + {0x0218, 83}, // Ș -> S + {0x0219, 115}, // ș -> s + {0x021a, 84}, // Ț -> T + {0x021b, 116}, // ț -> t + {0x1e00, 65}, // Ḁ -> A + {0x1e01, 97}, // ḁ -> a + {0x1e02, 66}, // Ḃ -> B + {0x1e03, 98}, // ḃ -> b + {0x1e04, 66}, // Ḅ -> B + {0x1e05, 98}, // ḅ -> b + {0x1e06, 66}, // Ḇ -> B + {0x1e07, 98}, // ḇ -> b + {0x1e08, 67}, // Ḉ -> C + {0x1e09, 99}, // ḉ -> c + {0x1e0a, 68}, // Ḋ -> D + {0x1e0b, 100}, // ḋ -> d + {0x1e0c, 68}, // Ḍ -> D + {0x1e0d, 100}, // ḍ -> d + {0x1e0e, 68}, // Ḏ -> D + {0x1e0f, 100}, // ḏ -> d + {0x1e10, 68}, // Ḑ -> D + {0x1e11, 100}, // ḑ -> d + {0x1e12, 68}, // Ḓ -> D + {0x1e13, 100}, // ḓ -> d + {0x1e14, 69}, // Ḕ -> E + {0x1e15, 101}, // ḕ -> e + {0x1e16, 69}, // Ḗ -> E + {0x1e17, 101}, // ḗ -> e + {0x1e18, 69}, // Ḙ -> E + {0x1e19, 101}, // ḙ -> e + {0x1e1a, 69}, // Ḛ -> E + {0x1e1b, 101}, // ḛ -> e + {0x1e1c, 69}, // Ḝ -> E + {0x1e1d, 101}, // ḝ -> e + {0x1e1e, 70}, // Ḟ -> F + {0x1e1f, 102}, // ḟ -> f + 
{0x1e20, 71}, // Ḡ -> G + {0x1e21, 103}, // ḡ -> g + {0x1e22, 72}, // Ḣ -> H + {0x1e23, 104}, // ḣ -> h + {0x1e24, 72}, // Ḥ -> H + {0x1e25, 104}, // ḥ -> h + {0x1e26, 72}, // Ḧ -> H + {0x1e27, 104}, // ḧ -> h + {0x1e28, 72}, // Ḩ -> H + {0x1e29, 104}, // ḩ -> h + {0x1e2a, 72}, // Ḫ -> H + {0x1e2b, 104}, // ḫ -> h + {0x1e2c, 73}, // Ḭ -> I + {0x1e2d, 105}, // ḭ -> i + {0x1e2e, 73}, // Ḯ -> I + {0x1e2f, 105}, // ḯ -> i + {0x1e30, 75}, // Ḱ -> K + {0x1e31, 107}, // ḱ -> k + {0x1e32, 75}, // Ḳ -> K + {0x1e33, 107}, // ḳ -> k + {0x1e34, 75}, // Ḵ -> K + {0x1e35, 107}, // ḵ -> k + {0x1e36, 76}, // Ḷ -> L + {0x1e37, 108}, // ḷ -> l + {0x1e38, 76}, // Ḹ -> L + {0x1e39, 108}, // ḹ -> l + {0x1e3b, 108}, // ḻ -> l + {0x1e3c, 76}, // Ḽ -> L + {0x1e3d, 108}, // ḽ -> l + {0x1e3e, 77}, // Ḿ -> M + {0x1e3f, 109}, // ḿ -> m + {0x1e40, 77}, // Ṁ -> M + {0x1e41, 109}, // ṁ -> m + {0x1e42, 77}, // Ṃ -> M + {0x1e43, 109}, // ṃ -> m + {0x1e44, 78}, // Ṅ -> N + {0x1e45, 110}, // ṅ -> n + {0x1e46, 78}, // Ṇ -> N + {0x1e47, 110}, // ṇ -> n + {0x1e48, 78}, // Ṉ -> N + {0x1e49, 110}, // ṉ -> n + {0x1e4a, 78}, // Ṋ -> N + {0x1e4b, 110}, // ṋ -> n + {0x1e4c, 79}, // Ṍ -> O + {0x1e4d, 111}, // ṍ -> o + {0x1e4e, 79}, // Ṏ -> O + {0x1e4f, 111}, // ṏ -> o + {0x1e50, 79}, // Ṑ -> O + {0x1e51, 111}, // ṑ -> o + {0x1e52, 79}, // Ṓ -> O + {0x1e53, 111}, // ṓ -> o + {0x1e54, 80}, // Ṕ -> P + {0x1e55, 112}, // ṕ -> p + {0x1e56, 80}, // Ṗ -> P + {0x1e57, 112}, // ṗ -> p + {0x1e58, 82}, // Ṙ -> R + {0x1e59, 114}, // ṙ -> r + {0x1e5a, 82}, // Ṛ -> R + {0x1e5b, 114}, // ṛ -> r + {0x1e5c, 82}, // Ṝ -> R + {0x1e5d, 114}, // ṝ -> r + {0x1e5e, 82}, // Ṟ -> R + {0x1e5f, 114}, // ṟ -> r + {0x1e60, 83}, // Ṡ -> S + {0x1e61, 115}, // ṡ -> s + {0x1e62, 83}, // Ṣ -> S + {0x1e63, 115}, // ṣ -> s + {0x1e64, 83}, // Ṥ -> S + {0x1e65, 115}, // ṥ -> s + {0x1e66, 83}, // Ṧ -> S + {0x1e67, 115}, // ṧ -> s + {0x1e68, 83}, // Ṩ -> S + {0x1e69, 115}, // ṩ -> s + {0x1e6a, 84}, // Ṫ -> T + {0x1e6b, 116}, // ṫ -> t + {0x1e6c, 
84}, // Ṭ -> T + {0x1e6d, 116}, // ṭ -> t + {0x1e6e, 84}, // Ṯ -> T + {0x1e6f, 116}, // ṯ -> t + {0x1e70, 84}, // Ṱ -> T + {0x1e71, 116}, // ṱ -> t + {0x1e72, 85}, // Ṳ -> U + {0x1e73, 117}, // ṳ -> u + {0x1e74, 85}, // Ṵ -> U + {0x1e75, 117}, // ṵ -> u + {0x1e76, 85}, // Ṷ -> U + {0x1e77, 117}, // ṷ -> u + {0x1e78, 85}, // Ṹ -> U + {0x1e79, 117}, // ṹ -> u + {0x1e7a, 85}, // Ṻ -> U + {0x1e7b, 117}, // ṻ -> u + {0x1e7c, 86}, // Ṽ -> V + {0x1e7d, 118}, // ṽ -> v + {0x1e7e, 86}, // Ṿ -> V + {0x1e7f, 118}, // ṿ -> v + {0x1e80, 87}, // Ẁ -> W + {0x1e81, 119}, // ẁ -> w + {0x1e82, 87}, // Ẃ -> W + {0x1e83, 119}, // ẃ -> w + {0x1e84, 87}, // Ẅ -> W + {0x1e85, 119}, // ẅ -> w + {0x1e86, 87}, // Ẇ -> W + {0x1e87, 119}, // ẇ -> w + {0x1e88, 87}, // Ẉ -> W + {0x1e89, 119}, // ẉ -> w + {0x1e8a, 88}, // Ẋ -> X + {0x1e8b, 120}, // ẋ -> x + {0x1e8c, 88}, // Ẍ -> X + {0x1e8d, 120}, // ẍ -> x + {0x1e8e, 89}, // Ẏ -> Y + {0x1e8f, 121}, // ẏ -> y + {0x1e90, 90}, // Ẑ -> Z + {0x1e91, 122}, // ẑ -> z + {0x1e92, 90}, // Ẓ -> Z + {0x1e93, 122}, // ẓ -> z + {0x1e94, 90}, // Ẕ -> Z + {0x1e95, 122}, // ẕ -> z + {0x1e96, 104}, // ẖ -> h + {0x1e97, 116}, // ẗ -> t + {0x1e98, 119}, // ẘ -> w + {0x1e99, 121}, // ẙ -> y + {0x1e9a, 97}, // ẚ -> a + {0x1e9b, 102}, // ẛ -> f + {0x1ea0, 65}, // Ạ -> A + {0x1ea1, 97}, // ạ -> a + {0x1ea2, 65}, // Ả -> A + {0x1ea3, 97}, // ả -> a + {0x1ea4, 65}, // Ấ -> A + {0x1ea5, 97}, // ấ -> a + {0x1ea6, 65}, // Ầ -> A + {0x1ea7, 97}, // ầ -> a + {0x1ea8, 65}, // Ẩ -> A + {0x1ea9, 97}, // ẩ -> a + {0x1eaa, 65}, // Ẫ -> A + {0x1eab, 97}, // ẫ -> a + {0x1eac, 65}, // Ậ -> A + {0x1ead, 97}, // ậ -> a + {0x1eae, 65}, // Ắ -> A + {0x1eaf, 97}, // ắ -> a + {0x1eb0, 65}, // Ằ -> A + {0x1eb1, 97}, // ằ -> a + {0x1eb2, 65}, // Ẳ -> A + {0x1eb3, 97}, // ẳ -> a + {0x1eb4, 65}, // Ẵ -> A + {0x1eb5, 97}, // ẵ -> a + {0x1eb6, 65}, // Ặ -> A + {0x1eb7, 97}, // ặ -> a + {0x1eb8, 69}, // Ẹ -> E + {0x1eb9, 101}, // ẹ -> e + {0x1eba, 69}, // Ẻ -> E + {0x1ebb, 101}, // ẻ -> e + 
{0x1ebc, 69}, // Ẽ -> E + {0x1ebd, 101}, // ẽ -> e + {0x1ebe, 69}, // Ế -> E + {0x1ebf, 101}, // ế -> e + {0x1ec0, 69}, // Ề -> E + {0x1ec1, 101}, // ề -> e + {0x1ec2, 69}, // Ể -> E + {0x1ec3, 101}, // ể -> e + {0x1ec4, 69}, // Ễ -> E + {0x1ec5, 101}, // ễ -> e + {0x1ec6, 69}, // Ệ -> E + {0x1ec7, 101}, // ệ -> e + {0x1ec8, 73}, // Ỉ -> I + {0x1ec9, 105}, // ỉ -> i + {0x1eca, 73}, // Ị -> I + {0x1ecb, 105}, // ị -> i + {0x1ecc, 79}, // Ọ -> O + {0x1ecd, 111}, // ọ -> o + {0x1ece, 79}, // Ỏ -> O + {0x1ecf, 111}, // ỏ -> o + {0x1ed0, 79}, // Ố -> O + {0x1ed1, 111}, // ố -> o + {0x1ed2, 79}, // Ồ -> O + {0x1ed3, 111}, // ồ -> o + {0x1ed4, 79}, // Ổ -> O + {0x1ed5, 111}, // ổ -> o + {0x1ed6, 79}, // Ỗ -> O + {0x1ed7, 111}, // ỗ -> o + {0x1ed8, 79}, // Ộ -> O + {0x1ed9, 111}, // ộ -> o + {0x1eda, 79}, // Ớ -> O + {0x1edb, 111}, // ớ -> o + {0x1edc, 79}, // Ờ -> O + {0x1edd, 111}, // ờ -> o + {0x1ede, 79}, // Ở -> O + {0x1edf, 111}, // ở -> o + {0x1ee0, 79}, // Ỡ -> O + {0x1ee1, 111}, // ỡ -> o + {0x1ee2, 79}, // Ợ -> O + {0x1ee3, 111}, // ợ -> o + {0x1ee4, 85}, // Ụ -> U + {0x1ee5, 117}, // ụ -> u + {0x1ee6, 85}, // Ủ -> U + {0x1ee7, 117}, // ủ -> u + {0x1ee8, 85}, // Ứ -> U + {0x1ee9, 117}, // ứ -> u + {0x1eea, 85}, // Ừ -> U + {0x1eeb, 117}, // ừ -> u + {0x1eec, 85}, // Ử -> U + {0x1eed, 117}, // ử -> u + {0x1eee, 85}, // Ữ -> U + {0x1eef, 117}, // ữ -> u + {0x1ef0, 85}, // Ự -> U + {0x1ef1, 117}, // ự -> u + {0x1ef2, 89}, // Ỳ -> Y + {0x1ef3, 121}, // ỳ -> y + {0x1ef4, 89}, // Ỵ -> Y + {0x1ef5, 121}, // ỵ -> y + {0x1ef6, 89}, // Ỷ -> Y + {0x1ef7, 121}, // ỷ -> y + {0x1ef8, 89}, // Ỹ -> Y + {0x1ef9, 121}, // ỹ -> y +}; + +} // namespace + +const std::unordered_map<char16_t, char16_t>& GetNormalizationMap() { + // The map is allocated dynamically the first time this function is executed. + static const std::unordered_map<char16_t, char16_t> normalization_map = [] { + std::unordered_map<char16_t, char16_t> map; + // Size of all the mappings is about 2.5 KiB. 
+ constexpr int numMappings = + sizeof(kNormalizationMappings) / sizeof(NormalizationPair); + map.reserve(numMappings); + for (size_t i = 0; i < numMappings; ++i) { + map.emplace(kNormalizationMappings[i].from, kNormalizationMappings[i].to); + } + return map; + }(); + + return normalization_map; +} + +} // namespace lib +} // namespace icing diff --git a/icing/util/icu-i18n-utils_test.cc b/icing/transform/map/normalization-map.h index f5864df..aea85bd 100644 --- a/icing/util/icu-i18n-utils_test.cc +++ b/icing/transform/map/normalization-map.h @@ -12,31 +12,20 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "icing/util/icu-i18n-utils.h" +#ifndef ICING_TRANSFORM_MAP_NORMALIZATION_MAP_H_ +#define ICING_TRANSFORM_MAP_NORMALIZATION_MAP_H_ -#include <memory> - -#include "gmock/gmock.h" -#include "gtest/gtest.h" -#include "unicode/uchar.h" +#include <unordered_map> namespace icing { namespace lib { -namespace icu_i18n_utils { -namespace { - -TEST(IcuI18nUtilsTest, IsPunctuationAtSameAsIcuIsPunct) { - // Iterate through ASCII values - for (int i = 0; i <= 127; ++i) { - char ascii = i; - std::string ascii_string = ""; - ascii_string.push_back(ascii); +// Returns a map containing normalization mappings. A mapping (A -> B) means +// that we'll transform every character 'A' into 'B'. See normalization-map.cc +// for mapping details. 
+const std::unordered_map<char16_t, char16_t>& GetNormalizationMap(); - EXPECT_EQ(IsPunctuationAt(ascii_string, /*position=*/0), u_ispunct(ascii)); - } -} -} // namespace -} // namespace icu_i18n_utils } // namespace lib } // namespace icing + +#endif // ICING_TRANSFORM_MAP_NORMALIZATION_MAP_H_ diff --git a/icing/transform/normalizer-factory.h b/icing/transform/normalizer-factory.h index 9119897..f1f3f62 100644 --- a/icing/transform/normalizer-factory.h +++ b/icing/transform/normalizer-factory.h @@ -16,12 +16,9 @@ #define ICING_TRANSFORM_NORMALIZER_FACTORY_H_ #include <memory> -#include <string_view> #include "icing/text_classifier/lib3/utils/base/statusor.h" #include "icing/absl_ports/canonical_errors.h" -#include "icing/transform/icu-normalizer.h" -#include "icing/transform/none-normalizer.h" #include "icing/transform/normalizer.h" namespace icing { @@ -29,11 +26,6 @@ namespace lib { namespace normalizer_factory { -enum NormalizerType { - ICU4C, // Normalizes using the ICU library. - NONE, // Doesn't perform normalization. Not for use in production. -}; - // Creates a normalizer. max_term_byte_size enforces the max size of text after // normalization, text will be truncated if exceeds the max size. 
// @@ -42,19 +34,7 @@ enum NormalizerType { // INVALID_ARGUMENT if max_term_byte_size <= 0 // INTERNAL_ERROR on errors libtextclassifier3::StatusOr<std::unique_ptr<Normalizer>> Create( - NormalizerType type, int max_term_byte_size) { - if (max_term_byte_size <= 0) { - return absl_ports::InvalidArgumentError( - "max_term_byte_size must be greater than zero."); - } - - switch (type) { - case ICU4C: - return IcuNormalizer::Create(max_term_byte_size); - case NONE: - return std::make_unique<NoneNormalizer>(max_term_byte_size); - } -} + int max_term_byte_size); } // namespace normalizer_factory diff --git a/icing/transform/normalizer.h b/icing/transform/normalizer.h index 817f530..4cbfa63 100644 --- a/icing/transform/normalizer.h +++ b/icing/transform/normalizer.h @@ -28,8 +28,7 @@ namespace lib { // // Example use: // ICING_ASSIGN_OR_RETURN(auto normalizer, -// normalizer_factory::Create(normalizer_factory::NormalizerType::ICU4C, -// /*max_term_byte_size=*/5); +// normalizer_factory::Create(/*max_term_byte_size=*/5); // // std::string normalized_text = normalizer->NormalizeText("HELLO!"); // ICING_LOG(INFO) << normalized_text; // prints "hello" diff --git a/icing/transform/simple/none-normalizer-factory.cc b/icing/transform/simple/none-normalizer-factory.cc new file mode 100644 index 0000000..6b35270 --- /dev/null +++ b/icing/transform/simple/none-normalizer-factory.cc @@ -0,0 +1,53 @@ +// Copyright (C) 2019 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef ICING_TRANSFORM_SIMPLE_NONE_NORMALIZER_FACTORY_H_ +#define ICING_TRANSFORM_SIMPLE_NONE_NORMALIZER_FACTORY_H_ + +#include <memory> +#include <string_view> + +#include "icing/text_classifier/lib3/utils/base/statusor.h" +#include "icing/absl_ports/canonical_errors.h" +#include "icing/transform/normalizer.h" +#include "icing/transform/simple/none-normalizer.h" + +namespace icing { +namespace lib { + +namespace normalizer_factory { + +// Creates a dummy normalizer. The term is not normalized, but +// the text will be truncated to max_term_byte_size if it exceeds the max size. +// +// Returns: +// A normalizer on success +// INVALID_ARGUMENT if max_term_byte_size <= 0 +// INTERNAL_ERROR on errors +libtextclassifier3::StatusOr<std::unique_ptr<Normalizer>> Create( + int max_term_byte_size) { + if (max_term_byte_size <= 0) { + return absl_ports::InvalidArgumentError( + "max_term_byte_size must be greater than zero."); + } + + return std::make_unique<NoneNormalizer>(max_term_byte_size); +} + +} // namespace normalizer_factory + +} // namespace lib +} // namespace icing + +#endif // ICING_TRANSFORM_SIMPLE_NONE_NORMALIZER_FACTORY_H_ diff --git a/icing/transform/none-normalizer.h b/icing/transform/simple/none-normalizer.h index b734bef..47085e1 100644 --- a/icing/transform/none-normalizer.h +++ b/icing/transform/simple/none-normalizer.h @@ -12,8 +12,8 @@ // See the License for the specific language governing permissions and // limitations under the License. -#ifndef ICING_TRANSFORM_NONE_NORMALIZER_H_ -#define ICING_TRANSFORM_NONE_NORMALIZER_H_ +#ifndef ICING_TRANSFORM_SIMPLE_NONE_NORMALIZER_H_ +#define ICING_TRANSFORM_SIMPLE_NONE_NORMALIZER_H_ #include <string> #include <string_view> @@ -30,7 +30,7 @@ namespace lib { // max_term_byte_size. 
class NoneNormalizer : public Normalizer { public: - NoneNormalizer(int max_term_byte_size) + explicit NoneNormalizer(int max_term_byte_size) : max_term_byte_size_(max_term_byte_size){}; std::string NormalizeTerm(std::string_view term) const override { @@ -48,4 +48,4 @@ class NoneNormalizer : public Normalizer { } // namespace lib } // namespace icing -#endif // ICING_TRANSFORM_NONE_NORMALIZER_H_ +#endif // ICING_TRANSFORM_SIMPLE_NONE_NORMALIZER_H_ diff --git a/icing/transform/none-normalizer_test.cc b/icing/transform/simple/none-normalizer_test.cc index e322258..e074828 100644 --- a/icing/transform/none-normalizer_test.cc +++ b/icing/transform/simple/none-normalizer_test.cc @@ -27,25 +27,20 @@ namespace { using ::testing::Eq; TEST(NoneNormalizerTest, Creation) { - EXPECT_THAT( - normalizer_factory::Create(normalizer_factory::NormalizerType::NONE, - /*max_term_byte_size=*/5), - IsOk()); - EXPECT_THAT( - normalizer_factory::Create(normalizer_factory::NormalizerType::NONE, - /*max_term_byte_size=*/0), - StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT)); - EXPECT_THAT( - normalizer_factory::Create(normalizer_factory::NormalizerType::NONE, - /*max_term_byte_size=*/-1), - StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT)); + EXPECT_THAT(normalizer_factory::Create( + /*max_term_byte_size=*/5), + IsOk()); + EXPECT_THAT(normalizer_factory::Create( + /*max_term_byte_size=*/0), + StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT)); + EXPECT_THAT(normalizer_factory::Create( + /*max_term_byte_size=*/-1), + StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT)); } TEST(IcuNormalizerTest, NoNormalizationDone) { - ICING_ASSERT_OK_AND_ASSIGN( - auto normalizer, - normalizer_factory::Create(normalizer_factory::NormalizerType::NONE, - /*max_term_byte_size=*/1000)); + ICING_ASSERT_OK_AND_ASSIGN(auto normalizer, normalizer_factory::Create( + /*max_term_byte_size=*/1000)); EXPECT_THAT(normalizer->NormalizeTerm(""), Eq("")); 
EXPECT_THAT(normalizer->NormalizeTerm("hello world"), Eq("hello world")); @@ -63,10 +58,8 @@ TEST(IcuNormalizerTest, NoNormalizationDone) { } TEST(NoneNormalizerTest, Truncate) { - ICING_ASSERT_OK_AND_ASSIGN( - auto normalizer, - normalizer_factory::Create(normalizer_factory::NormalizerType::NONE, - /*max_term_byte_size=*/5)); + ICING_ASSERT_OK_AND_ASSIGN(auto normalizer, normalizer_factory::Create( + /*max_term_byte_size=*/5)); // Won't be truncated EXPECT_THAT(normalizer->NormalizeTerm("hi"), Eq("hi")); diff --git a/icing/util/document-validator.cc b/icing/util/document-validator.cc index 5b588e7..36b84f8 100644 --- a/icing/util/document-validator.cc +++ b/icing/util/document-validator.cc @@ -72,11 +72,9 @@ libtextclassifier3::Status DocumentValidator::Validate( const SchemaTypeConfigProto* type_config = std::move(type_config_or).ValueOrDie(); - int32_t num_required_properties_expected = 0; int32_t num_required_properties_actual = 0; - PropertyConfigMap property_config_map; - SchemaUtil::BuildPropertyConfigMap(*type_config, &property_config_map, - &num_required_properties_expected); + SchemaUtil::ParsedPropertyConfigs parsed_property_configs = + SchemaUtil::ParsePropertyConfigs(*type_config); std::unordered_set<std::string_view> unique_properties; for (const PropertyProto& property : document.properties()) { @@ -93,8 +91,9 @@ libtextclassifier3::Status DocumentValidator::Validate( document.namespace_(), ", ", document.uri(), ").")); } - const auto& property_iter = property_config_map.find(property.name()); - if (property_iter == property_config_map.end()) { + const auto& property_iter = + parsed_property_configs.property_config_map.find(property.name()); + if (property_iter == parsed_property_configs.property_config_map.end()) { return absl_ports::NotFoundError(absl_ports::StrCat( "Property config '", property.name(), "' not found for key: (", document.namespace_(), ", ", document.uri(), ").")); @@ -165,7 +164,8 @@ libtextclassifier3::Status 
DocumentValidator::Validate( } } } - if (num_required_properties_actual < num_required_properties_expected) { + if (num_required_properties_actual < + parsed_property_configs.num_required_properties) { return absl_ports::InvalidArgumentError( absl_ports::StrCat("One or more required fields missing for key: (", document.namespace_(), ", ", document.uri(), ").")); diff --git a/icing/util/i18n-utils.cc b/icing/util/i18n-utils.cc index 2345339..9cf992f 100644 --- a/icing/util/i18n-utils.cc +++ b/icing/util/i18n-utils.cc @@ -17,6 +17,17 @@ #include <cctype> #include <string_view> +#include "icing/text_classifier/lib3/utils/base/statusor.h" +#include "icing/absl_ports/canonical_errors.h" +#include "icing/absl_ports/str_cat.h" +#include "icing/util/logging.h" +#include "unicode/uchar.h" +#include "unicode/umachine.h" +#include "unicode/ustring.h" +#include "unicode/utf16.h" +#include "unicode/utf8.h" +#include "unicode/utypes.h" + namespace icing { namespace lib { namespace i18n_utils { @@ -31,12 +42,84 @@ const std::string ascii_icu_punctuation = "!\"#%&'*,./:;?@\\_-([{}])"; } // namespace +libtextclassifier3::StatusOr<std::string> Utf16ToUtf8( + const std::u16string& utf16_string) { + std::string utf8_string; + // Allocates the maximum possible UTF8 string length: + // 3 UTF-8 bytes per UTF16 code unit, plus one for the terminating NUL. + // + // NOTE: we need to call resize() but not reserve() because values can't be + // set at positions after length(). 
+ utf8_string.resize(utf16_string.length() * 3 + 1); + + int result_length = 0; + UErrorCode status = U_ZERO_ERROR; + u_strToUTF8(&utf8_string[0], utf8_string.length(), &result_length, + utf16_string.data(), utf16_string.length(), &status); + // Corrects the length + utf8_string.resize(result_length); + + if (U_FAILURE(status)) { + return absl_ports::InternalError("Failed to convert UTF16 string to UTF8"); + } + return utf8_string; +} + +libtextclassifier3::StatusOr<std::u16string> Utf8ToUtf16( + std::string_view utf8_string) { + std::u16string utf16_result; + // The UTF16 string won't be longer than its UTF8 format + // + // NOTE: we need to call resize() but not reserve() because values can't be + // set at positions after length(). + utf16_result.resize(utf8_string.length()); + + int result_length = 0; + UErrorCode status = U_ZERO_ERROR; + u_strFromUTF8(&utf16_result[0], utf16_result.length(), &result_length, + utf8_string.data(), utf8_string.length(), &status); + // Corrects the length + utf16_result.resize(result_length); + + if (U_FAILURE(status)) { + return absl_ports::InternalError(absl_ports::StrCat( + "Failed to convert UTF8 string '", utf8_string, "' to UTF16")); + } + return utf16_result; +} + UChar32 GetUChar32At(const char* data, int length, int position) { - // We don't handle Unicode, i.e. anything more than 1 byte. 
- return data[position]; + UChar32 uchar32; + U8_NEXT_OR_FFFD(data, position, length, uchar32); + return uchar32; } -bool IsAscii(char c) { return (c & 0x80) == 0; } +void SafeTruncateUtf8(std::string* str, int truncate_to_length) { + if (str == nullptr || truncate_to_length >= str->length()) { + return; + } + + while (truncate_to_length > 0) { + if (IsLeadUtf8Byte(str->at(truncate_to_length))) { + str->resize(truncate_to_length); + return; + } + truncate_to_length--; + } + + // Truncates to an empty string + str->resize(0); +} + +bool IsAscii(char c) { return U8_IS_SINGLE((uint8_t)c); } + +bool IsAscii(UChar32 c) { return U8_LENGTH(c) == 1; } + +int GetUtf8Length(UChar32 c) { return U8_LENGTH(c); } + +int GetUtf16Length(UChar32 c) { return U16_LENGTH(c); } + +bool IsLeadUtf8Byte(char c) { return IsAscii(c) || U8_IS_LEAD((uint8_t)c); } bool IsPunctuationAt(std::string_view input, int position, int* char_len_out) { if (IsAscii(input[position])) { @@ -45,18 +128,43 @@ bool IsPunctuationAt(std::string_view input, int position, int* char_len_out) { } return ascii_icu_punctuation.find(input[position]) != std::string::npos; } - - // If it's not ASCII, we can't process Unicode so we don't know. - return false; + UChar32 c = GetUChar32At(input.data(), input.length(), position); + if (char_len_out != nullptr) { + *char_len_out = U8_LENGTH(c); + } + return u_ispunct(c); } bool IsWhitespaceAt(std::string_view input, int position) { if (IsAscii(input[position])) { return std::isspace(input[position]); } + UChar32 c = GetUChar32At(input.data(), input.length(), position); + return u_isUWhiteSpace(c); +} + +bool IsAlphabeticAt(std::string_view input, int position) { + if (IsAscii(input[position])) { + return std::isalpha(input[position]); + } + UChar32 c = GetUChar32At(input.data(), input.length(), position); + return u_isUAlphabetic(c); +} - // If it's not ASCII, we can't process Unicode so we don't know. 
- return false; +void AppendUchar32ToUtf8(std::string* utf8_string, UChar32 uchar) { + uint8_t utf8_buffer[4]; // U8_APPEND writes 0 to 4 bytes + + int utf8_index = 0; + UBool has_error = FALSE; + + // utf8_index is advanced to the end of the contents if successful + U8_APPEND(utf8_buffer, utf8_index, sizeof(utf8_buffer), uchar, has_error); + + if (has_error) { + ICING_LOG(WARNING) << "Error appending UChar32 to the UTF8 string."; + return; + } + utf8_string->append(reinterpret_cast<char*>(utf8_buffer), utf8_index); } } // namespace i18n_utils diff --git a/icing/util/i18n-utils.h b/icing/util/i18n-utils.h index 141b9af..e103bab 100644 --- a/icing/util/i18n-utils.h +++ b/icing/util/i18n-utils.h @@ -18,43 +18,60 @@ #include <string> #include <string_view> +#include "icing/text_classifier/lib3/utils/base/statusor.h" +#include "unicode/umachine.h" + namespace icing { namespace lib { -// These are included for uses when we don't have access to ICU. -// -// Defined in ICU; -// https://unicode-org.github.io/icu-docs/apidoc/released/icu4c/umachine_8h.html#a09fff5c3b5a5b015324dc3ec3cf92809 -using UChar32 = int32_t; - -// Defined in ICU: -// https://unicode-org.github.io/icu-docs/apidoc/released/icu4c/utf8_8h.html#aa2298b48749d9f45772c8f5a6885464a -#define U8_MAX_LENGTH 4 - -// Defined in ICU: -// https://unicode-org.github.io/icu-docs/apidoc/released/icu4c/uloc_8h.html#aa55404d3c725af4e05e65e5b40a6e13d -#define ULOC_US "en_US" - // Internationalization utils that use standard utilities or custom code. Does -// not require any special dependencies, i.e. for use when the library is NOT -// guaranteed to have access to ICU. -// -// Note: This does not handle Unicode. -// -// TODO(cassiewang): Figure out if we want to keep this file as a non-ICU -// solution long-term, or if we'll do something along the lines of reverse-jni, -// etc. +// not require any special dependencies, such as data files for ICU. namespace i18n_utils { // An invalid value defined by Unicode. 
static constexpr UChar32 kInvalidUChar32 = 0xFFFD; +// Converts a UTF16 string to a UTF8 string. +// +// Returns: +// A UTF8 string on success +// INTERNAL_ERROR on any failures +libtextclassifier3::StatusOr<std::string> Utf16ToUtf8( + const std::u16string& utf16_string); + +// Converts a UTF8 string to a UTF16 string. +// +// Returns: +// A UTF16 string on success +// INTERNAL_ERROR on any failures +libtextclassifier3::StatusOr<std::u16string> Utf8ToUtf16( + std::string_view utf8_string); + // Returns the char at the given position. UChar32 GetUChar32At(const char* data, int length, int position); +// Safely truncates a UTF8 string so that multi-byte UTF8 characters are not cut +// in the middle. The string will be truncated in place. +void SafeTruncateUtf8(std::string* str, int truncate_to_length); + // Checks if the single char is within ASCII range. bool IsAscii(char c); +// Checks if the Unicode char is within ASCII range. +bool IsAscii(UChar32 c); + +// Returns how many code units (char) are used for the UTF-8 encoding of this +// Unicode character. Returns 0 if not valid. +int GetUtf8Length(UChar32 c); + +// Returns how many code units (char16_t) are used for the UTF-16 encoding of +// this Unicode character. Returns 0 if not valid. +int GetUtf16Length(UChar32 c); + +// Checks if the single char is the first byte of a UTF8 character, note +// that a single ASCII char is also considered a lead byte. +bool IsLeadUtf8Byte(char c); + // Checks if the character at position is punctuation. Assigns the length of the // character at position to *char_len_out if the character at position is valid // punctuation and char_len_out is not null. @@ -64,6 +81,11 @@ bool IsPunctuationAt(std::string_view input, int position, // Checks if the character at position is a whitespace. bool IsWhitespaceAt(std::string_view input, int position); +// Checks if the character at position is a whitespace. 
+bool IsAlphabeticAt(std::string_view input, int position); + +void AppendUchar32ToUtf8(std::string* utf8_string, UChar32 uchar); + } // namespace i18n_utils } // namespace lib } // namespace icing diff --git a/icing/util/i18n-utils_test.cc b/icing/util/i18n-utils_test.cc new file mode 100644 index 0000000..a1e8d4e --- /dev/null +++ b/icing/util/i18n-utils_test.cc @@ -0,0 +1,141 @@ +// Copyright (C) 2019 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "icing/util/i18n-utils.h" + +#include <memory> + +#include "gmock/gmock.h" +#include "gtest/gtest.h" +#include "unicode/uchar.h" + +namespace icing { +namespace lib { +namespace { + +using ::testing::Eq; + +TEST(IcuI18nUtilsTest, IsPunctuationAtSameAsIcuIsPunct) { + // Iterate through ASCII values + for (int i = 0; i <= 127; ++i) { + char ascii = i; + + std::string ascii_string = ""; + ascii_string.push_back(ascii); + + EXPECT_EQ(i18n_utils::IsPunctuationAt(ascii_string, /*position=*/0), + + u_ispunct(ascii)); + } +} + +TEST(IcuI18nUtilsTest, IsAlphabeticAt) { + // Test alphabetic and non-alphabetic ascii characters + constexpr std::string_view kSomeAscii = "iJ?9"; + EXPECT_TRUE(i18n_utils::IsAlphabeticAt(kSomeAscii, /*position=*/0)); // 'i' + EXPECT_TRUE(i18n_utils::IsAlphabeticAt(kSomeAscii, /*position=*/1)); // 'J' + EXPECT_FALSE(i18n_utils::IsAlphabeticAt(kSomeAscii, /*position=*/2)); // '?' 
+ EXPECT_FALSE(i18n_utils::IsAlphabeticAt(kSomeAscii, /*position=*/3)); // '9' + + constexpr std::string_view kSomeNonAscii = "👏ñ①カ"; + EXPECT_FALSE( + i18n_utils::IsAlphabeticAt(kSomeNonAscii, /*position=*/0)); // '👏' + EXPECT_EQ(i18n_utils::GetUtf8Length(i18n_utils::GetUChar32At( + kSomeNonAscii.data(), kSomeNonAscii.length(), 0)), + 4); + EXPECT_TRUE( + i18n_utils::IsAlphabeticAt(kSomeNonAscii, /*position=*/4)); // 'ñ' + EXPECT_EQ(i18n_utils::GetUtf8Length(i18n_utils::GetUChar32At( + kSomeNonAscii.data(), kSomeNonAscii.length(), 4)), + 2); + EXPECT_FALSE( + i18n_utils::IsAlphabeticAt(kSomeNonAscii, /*position=*/6)); // '①' + EXPECT_EQ(i18n_utils::GetUtf8Length(i18n_utils::GetUChar32At( + kSomeNonAscii.data(), kSomeNonAscii.length(), 6)), + 3); + EXPECT_TRUE( + i18n_utils::IsAlphabeticAt(kSomeNonAscii, /*position=*/9)); // 'カ' + EXPECT_EQ(i18n_utils::GetUtf8Length(i18n_utils::GetUChar32At( + kSomeNonAscii.data(), kSomeNonAscii.length(), 9)), + 3); +} + +TEST(IcuI18nUtilsTest, GetUtf8Length) { + // Test alphabetic and non-alphabetic ascii characters + constexpr std::string_view kSomeAscii = "iJ?9"; + EXPECT_EQ(i18n_utils::GetUtf8Length(i18n_utils::GetUChar32At( + kSomeAscii.data(), kSomeAscii.length(), 0)), + 1); // 'i' + EXPECT_EQ(i18n_utils::GetUtf8Length(i18n_utils::GetUChar32At( + kSomeAscii.data(), kSomeAscii.length(), 1)), + 1); // 'J' + EXPECT_EQ(i18n_utils::GetUtf8Length(i18n_utils::GetUChar32At( + kSomeAscii.data(), kSomeAscii.length(), 2)), + 1); // '?' 
+ EXPECT_EQ(i18n_utils::GetUtf8Length(i18n_utils::GetUChar32At( + kSomeAscii.data(), kSomeAscii.length(), 3)), + 1); // '9' + + constexpr std::string_view kSomeNonAscii = "👏ñ①カ"; + EXPECT_EQ(i18n_utils::GetUtf8Length(i18n_utils::GetUChar32At( + kSomeNonAscii.data(), kSomeNonAscii.length(), 0)), + 4); // '👏' + EXPECT_EQ(i18n_utils::GetUtf8Length(i18n_utils::GetUChar32At( + kSomeNonAscii.data(), kSomeNonAscii.length(), 4)), + 2); // 'ñ' + EXPECT_EQ(i18n_utils::GetUtf8Length(i18n_utils::GetUChar32At( + kSomeNonAscii.data(), kSomeNonAscii.length(), 6)), + 3); // '①' + EXPECT_EQ(i18n_utils::GetUtf8Length(i18n_utils::GetUChar32At( + kSomeNonAscii.data(), kSomeNonAscii.length(), 9)), + 3); // 'カ' +} + +TEST(IcuI18nUtilsTest, SafeTruncate) { + // Test alphabetic and non-alphabetic ascii characters + constexpr std::string_view kSomeAscii = "iJ?9"; + std::string truncated(kSomeAscii); + i18n_utils::SafeTruncateUtf8(&truncated, kSomeAscii.length() + 1); + EXPECT_THAT(truncated, Eq("iJ?9")); + truncated = kSomeAscii; + i18n_utils::SafeTruncateUtf8(&truncated, kSomeAscii.length()); + EXPECT_THAT(truncated, Eq("iJ?9")); + truncated = kSomeAscii; + i18n_utils::SafeTruncateUtf8(&truncated, kSomeAscii.length() - 1); + EXPECT_THAT(truncated, Eq("iJ?")); + + constexpr std::string_view kSomeNonAscii = "👏ñ①カ"; + truncated = kSomeNonAscii; + i18n_utils::SafeTruncateUtf8(&truncated, kSomeNonAscii.length() + 1); + EXPECT_THAT(truncated, Eq("👏ñ①カ")); + truncated = kSomeNonAscii; + i18n_utils::SafeTruncateUtf8(&truncated, kSomeNonAscii.length()); + EXPECT_THAT(truncated, Eq("👏ñ①カ")); + truncated = kSomeNonAscii; + i18n_utils::SafeTruncateUtf8(&truncated, kSomeNonAscii.length() - 1); + EXPECT_THAT(truncated, Eq("👏ñ①")); + truncated = kSomeNonAscii; + i18n_utils::SafeTruncateUtf8(&truncated, kSomeNonAscii.length() - 2); + EXPECT_THAT(truncated, Eq("👏ñ①")); + truncated = kSomeNonAscii; + i18n_utils::SafeTruncateUtf8(&truncated, kSomeNonAscii.length() - 3); + EXPECT_THAT(truncated, Eq("👏ñ①")); 
+ truncated = kSomeNonAscii; + i18n_utils::SafeTruncateUtf8(&truncated, kSomeNonAscii.length() - 4); + EXPECT_THAT(truncated, Eq("👏ñ")); +} + +} // namespace +} // namespace lib +} // namespace icing diff --git a/icing/util/icu-i18n-utils.cc b/icing/util/icu-i18n-utils.cc deleted file mode 100644 index 89e4eec..0000000 --- a/icing/util/icu-i18n-utils.cc +++ /dev/null @@ -1,176 +0,0 @@ -// Copyright (C) 2019 Google LLC -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "icing/util/icu-i18n-utils.h" - -#include <cctype> -#include <string> -#include <string_view> - -#include "icing/text_classifier/lib3/utils/base/statusor.h" -#include "icing/absl_ports/canonical_errors.h" -#include "icing/absl_ports/str_cat.h" -#include "unicode/umachine.h" -#include "unicode/unorm2.h" -#include "unicode/ustring.h" -#include "unicode/utf8.h" - -namespace icing { -namespace lib { -namespace icu_i18n_utils { - -namespace { - -// All ASCII punctuation that's also in a Unicode Punctuation category -// (https://www.fileformat.info/info/unicode/category/index.htm). The set of -// characters that are regarded as punctuation is not the same for std::ispunct -// and u_ispunct. 
-const std::string ascii_icu_punctuation = "!\"#%&'*,./:;?@\\_-([{}])"; - -} // namespace - -libtextclassifier3::StatusOr<std::string> Utf16ToUtf8( - const std::u16string& utf16_string) { - std::string utf8_string; - // Allocates the maximum possible UTF8 string length: - // 3 UTF-8 bytes per UTF16 code unit, plus one for the terminating NUL. - // - // NOTE: we need to call resize() but not reserve() because values can't be - // set at positions after length(). - utf8_string.resize(utf16_string.length() * 3 + 1); - - int result_length = 0; - UErrorCode status = U_ZERO_ERROR; - u_strToUTF8(&utf8_string[0], utf8_string.length(), &result_length, - utf16_string.data(), utf16_string.length(), &status); - // Corrects the length - utf8_string.resize(result_length); - - if (U_FAILURE(status)) { - return absl_ports::InternalError("Failed to convert UTF16 string to UTF8"); - } - return utf8_string; -} - -libtextclassifier3::StatusOr<std::u16string> Utf8ToUtf16( - std::string_view utf8_string) { - std::u16string utf16_result; - // The UTF16 string won't be longer than its UTF8 format - // - // NOTE: we need to call resize() but not reserve() because values can't be - // set at positions after length(). 
- utf16_result.resize(utf8_string.length()); - - int result_length = 0; - UErrorCode status = U_ZERO_ERROR; - u_strFromUTF8(&utf16_result[0], utf16_result.length(), &result_length, - utf8_string.data(), utf8_string.length(), &status); - // Corrects the length - utf16_result.resize(result_length); - - if (U_FAILURE(status)) { - return absl_ports::InternalError(absl_ports::StrCat( - "Failed to convert UTF8 string '", utf8_string, "' to UTF16")); - } - return utf16_result; -} - -UChar32 GetUChar32At(const char* data, int length, int position) { - UChar32 uchar32; - U8_NEXT_OR_FFFD(data, position, length, uchar32); - return uchar32; -} - -void SafeTruncateUtf8(std::string* str, int truncate_to_length) { - if (str == nullptr || truncate_to_length >= str->length()) { - return; - } - - while (truncate_to_length > 0) { - if (IsLeadUtf8Byte(str->at(truncate_to_length))) { - str->resize(truncate_to_length); - return; - } - truncate_to_length--; - } - - // Truncates to an empty string - str->resize(0); -} - -bool IsAscii(char c) { return U8_IS_SINGLE((uint8_t)c); } - -bool IsAscii(UChar32 c) { return U8_LENGTH(c) == 1; } - -int GetUtf8Length(UChar32 c) { return U8_LENGTH(c); } - -bool IsLeadUtf8Byte(char c) { return IsAscii(c) || U8_IS_LEAD((uint8_t)c); } - -bool IsPunctuationAt(std::string_view input, int position, int* char_len_out) { - if (IsAscii(input[position])) { - if (char_len_out != nullptr) { - *char_len_out = 1; - } - return ascii_icu_punctuation.find(input[position]) != std::string::npos; - } - UChar32 c = GetUChar32At(input.data(), input.length(), position); - if (char_len_out != nullptr) { - *char_len_out = U8_LENGTH(c); - } - return u_ispunct(c); -} - -bool IsWhitespaceAt(std::string_view input, int position) { - if (IsAscii(input[position])) { - return std::isspace(input[position]); - } - UChar32 c = GetUChar32At(input.data(), input.length(), position); - return u_isUWhiteSpace(c); -} - -bool DiacriticCharToAscii(const UNormalizer2* normalizer2, UChar32 
uchar32_in, - char* char_out) { - if (IsAscii(uchar32_in)) { - // The Unicode character is within ASCII range - if (char_out != nullptr) { - *char_out = uchar32_in; - } - return true; - } - - // Maximum number of pieces a Unicode character can be decomposed into. - // TODO(samzheng) figure out if this number is proper. - constexpr int kDecompositionBufferCapacity = 5; - - // A buffer used to store Unicode decomposition mappings of only one - // character. - UChar decomposition_buffer[kDecompositionBufferCapacity]; - - // Decomposes the Unicode character, trying to get an ASCII char and some - // diacritic chars. - UErrorCode status = U_ZERO_ERROR; - if (unorm2_getDecomposition(normalizer2, uchar32_in, &decomposition_buffer[0], - kDecompositionBufferCapacity, &status) > 0 && - !U_FAILURE(status) && icu_i18n_utils::IsAscii(decomposition_buffer[0])) { - if (char_out != nullptr) { - *char_out = decomposition_buffer[0]; - } - return true; - } - return false; -} - -} // namespace icu_i18n_utils -} // namespace lib -} // namespace icing diff --git a/icing/util/icu-i18n-utils.h b/icing/util/icu-i18n-utils.h deleted file mode 100644 index 4d29cf0..0000000 --- a/icing/util/icu-i18n-utils.h +++ /dev/null @@ -1,105 +0,0 @@ -// Copyright (C) 2019 Google LLC -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#ifndef ICING_UTIL_ICU_I18N_UTILS_H_ -#define ICING_UTIL_ICU_I18N_UTILS_H_ - -#include <string> -#include <string_view> - -#include "icing/text_classifier/lib3/utils/base/statusor.h" -#include "unicode/umachine.h" -#include "unicode/unorm2.h" - -// Rely on this to transitively have access to U8_MAX_LENGTH, so all users can -// depend on either icu-i18n-utils or i18n-utils. -#include "unicode/utf8.h" - -// Rely on this to transitively have access to ULOC_US, so all users can depend -// on either icu-i18n-utils or i18n-utils. -#include "unicode/uloc.h" - -namespace icing { -namespace lib { - -// Internationalization utils that use ICU methods under the hood. For use when -// the library is guaranteed to have access to ICU. -namespace icu_i18n_utils { - -// An invalid value defined by Unicode. -static constexpr UChar32 kInvalidUChar32 = 0xFFFD; - -// Converts a UTF16 string to a UTF8 string. -// -// Returns: -// A UTF8 string on success -// INTERNAL_ERROR on any failures -libtextclassifier3::StatusOr<std::string> Utf16ToUtf8( - const std::u16string& utf16_string); - -// Converts a UTF8 string to a UTF16 string. -// -// Returns: -// A UTF16 string on success -// INTERNAL_ERROR on any failures -libtextclassifier3::StatusOr<std::u16string> Utf8ToUtf16( - std::string_view utf8_string); - -// Returns the Unicode char at the given position. If anything wrong happens, an -// invalid value 0xFFFD is returned. -UChar32 GetUChar32At(const char* data, int length, int position); - -// Safely truncates a UTF8 string so that multi-byte UTF8 characters are not cut -// in the middle. The string will be truncated in place. -void SafeTruncateUtf8(std::string* str, int truncate_to_length); - -// Checks if the single char is within ASCII range. -bool IsAscii(char c); - -// Checks if the Unicode char is within ASCII range. -bool IsAscii(UChar32 c); - -// Returns how many code units (bytes) are used for the UTF-8 encoding of this -// Unicode character. Returns 0 if not valid. 
-int GetUtf8Length(UChar32 c); - -// Checks if the single char is the first byte of a UTF8 character, note -// that a single ASCII char is also considered a lead byte. -bool IsLeadUtf8Byte(char c); - -// Checks if the character at position is punctuation. Assigns the length of the -// character at position to *char_len_out if the character at position is valid -// punctuation and char_len_out is not null. -bool IsPunctuationAt(std::string_view input, int position, - int* char_len_out = nullptr); - -// Checks if the character at position is a whitespace. -bool IsWhitespaceAt(std::string_view input, int position); - -// Transforms a Unicode character with diacritics to its counterpart in ASCII -// range. E.g. "ü" -> "u". Result will be set to char_out. Returns true if -// the transformation is successful. -// -// NOTE: According to our convention this function should have returned -// StatusOr<char>. However, this function is performance-sensitive because is -// could be called on every Latin character in normalization, so we make it -// return a bool here to save a bit more time and memory. -bool DiacriticCharToAscii(const UNormalizer2* normalizer2, UChar32 uchar32_in, - char* char_out); - -} // namespace icu_i18n_utils -} // namespace lib -} // namespace icing - -#endif // ICING_UTIL_ICU_I18N_UTILS_H_ |