diff options
author | My Name <dsaadati@google.com> | 2021-09-09 11:11:03 -0700 |
---|---|---|
committer | Dan Saadati <dsaadati@google.com> | 2021-09-09 11:27:04 -0700 |
commit | 34fc8c85b9f690ffd0a095a4bbcac9aaacfa387b (patch) | |
tree | 0b9bbc9ca107551327b396e7c355b65c4baa43ef | |
parent | 14ee9a8eb8f3ed47f68117208626045878c943ac (diff) | |
parent | 39f59853b980d94a55e9b0f76185b0d3fff88455 (diff) | |
download | icing-34fc8c85b9f690ffd0a095a4bbcac9aaacfa387b.tar.gz |
Merge remote-tracking branch 'aosp/upstream-master' into androidx-main
* aosp/upstream-master:
Sync from upstream.
Descriptions:
================
Remove no-longer-used write paths for file-backed-proto-log.
================
Modify segmentation rules to consider any segment that begins with a non-ASCII
alphanumeric character as valid.
=================
Implement CalculateNormalizedMatchLength for IcuNormalizer.
================
Add additional benchmark cases that were useful in developing
submatching and CalculateNormalizedMatchLength for IcuNormalizer.
=================
Switch NormalizationMap from
static const std::unordered_map<char16_t, char16_t>& to
static const std::unordered_map<char16_t, char16_t> *const.
==================
Bug: 147509515
Bug: 149610413
Bug: 195720764
Bug: 196257995
Change-Id: I8e8d7a7fcceb8eaae1fdcb45a92ea4399d47f343
27 files changed, 665 insertions, 1320 deletions
diff --git a/icing/file/file-backed-proto-log.h b/icing/file/file-backed-proto-log.h index b2b37e8..cf16b4f 100644 --- a/icing/file/file-backed-proto-log.h +++ b/icing/file/file-backed-proto-log.h @@ -14,16 +14,14 @@ // File-backed log of protos with append-only writes and position based reads. // -// There should only be one instance of a FileBackedProtoLog of the same file at -// a time; using multiple instances at the same time may lead to undefined -// behavior. +// The implementation in this file is deprecated and replaced by +// portable-file-backed-proto-log.h. // -// The entire checksum is computed on initialization to verify the contents are -// valid. On failure, the log will be truncated to the last verified state when -// PersistToDisk() was called. If the log cannot successfully restore the last -// state due to disk corruption or some other inconsistency, then the entire log -// will be lost. +// This deprecated implementation has been made read-only for the purposes of +// migration; writing and erasing this format of log is no longer supported and +// the methods to accomplish this have been removed. // +// The details of this format follow below: // Each proto written to the file will have a metadata written just before it. // The metadata consists of // { @@ -31,37 +29,16 @@ // 3 bytes of the proto size // n bytes of the proto itself // } -// -// Example usage: -// ICING_ASSERT_OK_AND_ASSIGN(auto create_result, -// FileBackedProtoLog<DocumentProto>::Create(filesystem, file_path_, -// options)); -// auto proto_log = create_result.proto_log; -// -// Document document; -// document.set_namespace("com.google.android.example"); -// document.set_uri("www.google.com"); -// -// int64_t document_offset = proto_log->WriteProto(document)); -// Document same_document = proto_log->ReadProto(document_offset)); -// proto_log->PersistToDisk(); -// // TODO(b/136514769): Add versioning to the header and a UpgradeToVersion // migration method. 
- #ifndef ICING_FILE_FILE_BACKED_PROTO_LOG_H_ #define ICING_FILE_FILE_BACKED_PROTO_LOG_H_ -#include <cstddef> #include <cstdint> -#include <cstring> #include <memory> #include <string> #include <string_view> -#include <utility> -#include <vector> -#include "icing/text_classifier/lib3/utils/base/status.h" #include "icing/text_classifier/lib3/utils/base/statusor.h" #include <google/protobuf/io/gzip_stream.h> #include <google/protobuf/io/zero_copy_stream_impl_lite.h> @@ -112,10 +89,6 @@ class FileBackedProtoLog { // Header stored at the beginning of the file before the rest of the log // contents. Stores metadata on the log. - // - // TODO(b/139375388): Migrate the Header struct to a proto. This makes - // migrations easier since we don't need to worry about different size padding - // (which would affect the checksum) and different endians. struct Header { static constexpr int32_t kMagic = 0xf4c6f67a; @@ -195,20 +168,6 @@ class FileBackedProtoLog { FileBackedProtoLog(const FileBackedProtoLog&) = delete; FileBackedProtoLog& operator=(const FileBackedProtoLog&) = delete; - // This will update the checksum of the log as well. - ~FileBackedProtoLog(); - - // Writes the serialized proto to the underlying file. Writes are applied - // directly to the underlying file. Users do not need to sync the file after - // writing. - // - // Returns: - // Offset of the newly appended proto in file on success - // INVALID_ARGUMENT if proto is too large, as decided by - // Options.max_proto_size - // INTERNAL_ERROR on IO error - libtextclassifier3::StatusOr<int64_t> WriteProto(const ProtoT& proto); - // Reads out a proto located at file_offset from the file. // // Returns: @@ -218,31 +177,6 @@ class FileBackedProtoLog { // INTERNAL_ERROR on IO error libtextclassifier3::StatusOr<ProtoT> ReadProto(int64_t file_offset) const; - // Erases the data of a proto located at file_offset from the file. 
- // - // Returns: - // OK on success - // OUT_OF_RANGE_ERROR if file_offset exceeds file size - // INTERNAL_ERROR on IO error - libtextclassifier3::Status EraseProto(int64_t file_offset); - - // Calculates and returns the disk usage in bytes. Rounds up to the nearest - // block size. - // - // Returns: - // Disk usage on success - // INTERNAL_ERROR on IO error - libtextclassifier3::StatusOr<int64_t> GetDiskUsage() const; - - // Returns the file size of all the elements held in the log. File size is in - // bytes. This excludes the size of any internal metadata of the log, e.g. the - // log's header. - // - // Returns: - // File size on success - // INTERNAL_ERROR on IO error - libtextclassifier3::StatusOr<int64_t> GetElementsFileSize() const; - // An iterator helping to find offsets of all the protos in file. // Example usage: // @@ -281,72 +215,6 @@ class FileBackedProtoLog { // behaviors could happen. Iterator GetIterator(); - // Persists all changes since initialization or the last call to - // PersistToDisk(). Any changes that aren't persisted may be lost if the - // system fails to close safely. - // - // Example use case: - // - // Document document; - // document.set_namespace("com.google.android.example"); - // document.set_uri("www.google.com"); - // - // { - // ICING_ASSERT_OK_AND_ASSIGN(auto create_result, - // FileBackedProtoLog<DocumentProto>::Create(filesystem, file_path, - // options)); - // auto proto_log = std::move(create_result.proto_log); - // - // int64_t document_offset = proto_log->WriteProto(document)); - // - // // We lose the document here since it wasn't persisted. 
- // // *SYSTEM CRASH* - // } - // - // { - // // Can still successfully create after a crash since the log can - // // rewind/truncate to recover into a previously good state - // ICING_ASSERT_OK_AND_ASSIGN(auto create_result, - // FileBackedProtoLog<DocumentProto>::Create(filesystem, file_path, - // options)); - // auto proto_log = std::move(create_result.proto_log); - // - // // Lost the proto since we didn't PersistToDisk before the crash - // proto_log->ReadProto(document_offset)); // INVALID_ARGUMENT error - // - // int64_t document_offset = proto_log->WriteProto(document)); - // - // // Persisted this time, so we should be ok. - // ICING_ASSERT_OK(proto_log->PersistToDisk()); - // } - // - // { - // ICING_ASSERT_OK_AND_ASSIGN(auto create_result, - // FileBackedProtoLog<DocumentProto>::Create(filesystem, file_path, - // options)); - // auto proto_log = std::move(create_result.proto_log); - // - // // SUCCESS - // Document same_document = proto_log->ReadProto(document_offset)); - // } - // - // NOTE: Since all protos are already written to the file directly, this - // just updates the checksum and rewind position. Without these updates, - // future initializations will truncate the file and discard unpersisted - // changes. - // - // Returns: - // OK on success - // INTERNAL_ERROR on IO error - libtextclassifier3::Status PersistToDisk(); - - // Calculates the checksum of the log contents. Excludes the header content. - // - // Returns: - // Crc of the log content - // INTERNAL_ERROR on IO error - libtextclassifier3::StatusOr<Crc32> ComputeChecksum(); - private: // Object can only be instantiated via the ::Create factory. 
FileBackedProtoLog(const Filesystem* filesystem, const std::string& file_path, @@ -451,15 +319,6 @@ FileBackedProtoLog<ProtoT>::FileBackedProtoLog(const Filesystem* filesystem, } template <typename ProtoT> -FileBackedProtoLog<ProtoT>::~FileBackedProtoLog() { - if (!PersistToDisk().ok()) { - ICING_LOG(WARNING) - << "Error persisting to disk during destruction of FileBackedProtoLog: " - << file_path_; - } -} - -template <typename ProtoT> libtextclassifier3::StatusOr<typename FileBackedProtoLog<ProtoT>::CreateResult> FileBackedProtoLog<ProtoT>::Create(const Filesystem* filesystem, const std::string& file_path, @@ -688,79 +547,6 @@ libtextclassifier3::StatusOr<Crc32> FileBackedProtoLog<ProtoT>::ComputeChecksum( } template <typename ProtoT> -libtextclassifier3::StatusOr<int64_t> FileBackedProtoLog<ProtoT>::WriteProto( - const ProtoT& proto) { - int64_t proto_size = proto.ByteSizeLong(); - int32_t metadata; - int metadata_size = sizeof(metadata); - int64_t current_position = filesystem_->GetCurrentPosition(fd_.get()); - - if (proto_size > header_->max_proto_size) { - return absl_ports::InvalidArgumentError(IcingStringUtil::StringPrintf( - "proto_size, %lld, was too large to write. Max is %d", - static_cast<long long>(proto_size), header_->max_proto_size)); - } - - // At this point, we've guaranteed that proto_size is under kMaxProtoSize - // (see - // ::Create), so we can safely store it in an int. 
- int final_size = 0; - - std::string proto_str; - google::protobuf::io::StringOutputStream proto_stream(&proto_str); - - if (header_->compress) { - google::protobuf::io::GzipOutputStream::Options options; - options.format = google::protobuf::io::GzipOutputStream::ZLIB; - options.compression_level = kDeflateCompressionLevel; - - google::protobuf::io::GzipOutputStream compressing_stream(&proto_stream, - options); - - bool success = proto.SerializeToZeroCopyStream(&compressing_stream) && - compressing_stream.Close(); - - if (!success) { - return absl_ports::InternalError("Error compressing proto."); - } - - final_size = proto_str.size(); - - // In case the compressed proto is larger than the original proto, we also - // can't write it. - if (final_size > header_->max_proto_size) { - return absl_ports::InvalidArgumentError(IcingStringUtil::StringPrintf( - "Compressed proto size, %d, was greater than " - "max_proto_size, %d", - final_size, header_->max_proto_size)); - } - } else { - // Serialize the proto directly into the write buffer at an offset of the - // metadata. - proto.SerializeToZeroCopyStream(&proto_stream); - final_size = proto_str.size(); - } - - // 1st byte for magic, next 3 bytes for proto size. 
- metadata = (kProtoMagic << 24) | final_size; - - // Actually write metadata, has to be done after we know the possibly - // compressed proto size - if (!filesystem_->Write(fd_.get(), &metadata, metadata_size)) { - return absl_ports::InternalError( - absl_ports::StrCat("Failed to write proto metadata to: ", file_path_)); - } - - // Write the serialized proto - if (!filesystem_->Write(fd_.get(), proto_str.data(), proto_str.size())) { - return absl_ports::InternalError( - absl_ports::StrCat("Failed to write proto to: ", file_path_)); - } - - return current_position; -} - -template <typename ProtoT> libtextclassifier3::StatusOr<ProtoT> FileBackedProtoLog<ProtoT>::ReadProto( int64_t file_offset) const { int64_t file_size = filesystem_->GetFileSize(fd_.get()); @@ -806,83 +592,6 @@ libtextclassifier3::StatusOr<ProtoT> FileBackedProtoLog<ProtoT>::ReadProto( } template <typename ProtoT> -libtextclassifier3::Status FileBackedProtoLog<ProtoT>::EraseProto( - int64_t file_offset) { - int64_t file_size = filesystem_->GetFileSize(fd_.get()); - if (file_offset >= file_size) { - // file_size points to the next byte to write at, so subtract one to get - // the inclusive, actual size of file. - return absl_ports::OutOfRangeError(IcingStringUtil::StringPrintf( - "Trying to erase data at a location, %lld, " - "out of range of the file size, %lld", - static_cast<long long>(file_offset), - static_cast<long long>(file_size - 1))); - } - - MemoryMappedFile mmapped_file( - *filesystem_, file_path_, - MemoryMappedFile::Strategy::READ_WRITE_AUTO_SYNC); - - // Read out the metadata - ICING_ASSIGN_OR_RETURN( - int metadata, ReadProtoMetadata(&mmapped_file, file_offset, file_size)); - - ICING_RETURN_IF_ERROR(mmapped_file.Remap(file_offset + sizeof(metadata), - GetProtoSize(metadata))); - - // We need to update the crc checksum if the erased area is before the - // rewind position. 
- if (file_offset + sizeof(metadata) < header_->rewind_offset) { - // We need to calculate [original string xor 0s]. - // The xored string is the same as the original string because 0 xor 0 = - // 0, 1 xor 0 = 1. - const std::string_view xored_str(mmapped_file.region(), - mmapped_file.region_size()); - - Crc32 crc(header_->log_checksum); - ICING_ASSIGN_OR_RETURN( - uint32_t new_crc, - crc.UpdateWithXor( - xored_str, - /*full_data_size=*/header_->rewind_offset - sizeof(Header), - /*position=*/file_offset + sizeof(metadata) - sizeof(Header))); - - header_->log_checksum = new_crc; - header_->header_checksum = header_->CalculateHeaderChecksum(); - - if (!filesystem_->PWrite(fd_.get(), /*offset=*/0, header_.get(), - sizeof(Header))) { - return absl_ports::InternalError( - absl_ports::StrCat("Failed to update header to: ", file_path_)); - } - } - - memset(mmapped_file.mutable_region(), '\0', mmapped_file.region_size()); - return libtextclassifier3::Status::OK; -} - -template <typename ProtoT> -libtextclassifier3::StatusOr<int64_t> FileBackedProtoLog<ProtoT>::GetDiskUsage() - const { - int64_t size = filesystem_->GetDiskUsage(file_path_.c_str()); - if (size == Filesystem::kBadFileSize) { - return absl_ports::InternalError("Failed to get disk usage of proto log"); - } - return size; -} - -template <typename ProtoT> -libtextclassifier3::StatusOr<int64_t> -FileBackedProtoLog<ProtoT>::GetElementsFileSize() const { - int64_t total_file_size = filesystem_->GetFileSize(file_path_.c_str()); - if (total_file_size == Filesystem::kBadFileSize) { - return absl_ports::InternalError( - "Failed to get file size of elments in the proto log"); - } - return total_file_size - sizeof(Header); -} - -template <typename ProtoT> FileBackedProtoLog<ProtoT>::Iterator::Iterator(const Filesystem& filesystem, const std::string& file_path, int64_t initial_offset) @@ -964,51 +673,6 @@ libtextclassifier3::StatusOr<int> FileBackedProtoLog<ProtoT>::ReadProtoMetadata( return metadata; } -template <typename 
ProtoT> -libtextclassifier3::Status FileBackedProtoLog<ProtoT>::PersistToDisk() { - int64_t file_size = filesystem_->GetFileSize(file_path_.c_str()); - if (file_size == header_->rewind_offset) { - // No new protos appended, don't need to update the checksum. - return libtextclassifier3::Status::OK; - } - - int64_t new_content_size = file_size - header_->rewind_offset; - Crc32 crc; - if (new_content_size < 0) { - // File shrunk, recalculate the entire checksum. - ICING_ASSIGN_OR_RETURN( - crc, ComputeChecksum(filesystem_, file_path_, Crc32(), sizeof(Header), - file_size)); - } else { - // Append new changes to the existing checksum. - ICING_ASSIGN_OR_RETURN( - crc, - ComputeChecksum(filesystem_, file_path_, Crc32(header_->log_checksum), - header_->rewind_offset, file_size)); - } - - header_->log_checksum = crc.Get(); - header_->rewind_offset = file_size; - header_->header_checksum = header_->CalculateHeaderChecksum(); - - if (!filesystem_->PWrite(fd_.get(), /*offset=*/0, header_.get(), - sizeof(Header)) || - !filesystem_->DataSync(fd_.get())) { - return absl_ports::InternalError( - absl_ports::StrCat("Failed to update header to: ", file_path_)); - } - - return libtextclassifier3::Status::OK; -} - -template <typename ProtoT> -libtextclassifier3::StatusOr<Crc32> -FileBackedProtoLog<ProtoT>::ComputeChecksum() { - return FileBackedProtoLog<ProtoT>::ComputeChecksum( - filesystem_, file_path_, Crc32(), /*start=*/sizeof(Header), - /*end=*/filesystem_->GetFileSize(file_path_.c_str())); -} - } // namespace lib } // namespace icing diff --git a/icing/file/file-backed-proto-log_benchmark.cc b/icing/file/file-backed-proto-log_benchmark.cc deleted file mode 100644 index c09fd5a..0000000 --- a/icing/file/file-backed-proto-log_benchmark.cc +++ /dev/null @@ -1,251 +0,0 @@ -// Copyright (C) 2019 Google LLC -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include <cstdint> -#include <random> - -#include "testing/base/public/benchmark.h" -#include "gmock/gmock.h" -#include "icing/document-builder.h" -#include "icing/file/file-backed-proto-log.h" -#include "icing/file/filesystem.h" -#include "icing/legacy/core/icing-string-util.h" -#include "icing/proto/document.pb.h" -#include "icing/testing/common-matchers.h" -#include "icing/testing/random-string.h" -#include "icing/testing/tmp-directory.h" - -// go/microbenchmarks -// -// To build and run on a local machine: -// $ blaze build -c opt --dynamic_mode=off --copt=-gmlt -// icing/file:file-backed-proto-log_benchmark -// -// $ blaze-bin/icing/file/file-backed-proto-log_benchmark -// --benchmarks=all -// -// -// To build and run on an Android device (must be connected and rooted): -// $ blaze build --copt="-DGOOGLE_COMMANDLINEFLAGS_FULL_API=1" -// --config=android_arm64 -c opt --dynamic_mode=off --copt=-gmlt -// icing/file:file-backed-proto-log_benchmark -// -// $ adb root -// -// $ adb push -// blaze-bin/icing/file/file-backed-proto-log_benchmark -// /data/local/tmp/ -// -// $ adb shell /data/local/tmp/file-backed-proto-log-benchmark -// --benchmarks=all - -namespace icing { -namespace lib { - -namespace { - -static void BM_Write(benchmark::State& state) { - const Filesystem filesystem; - int string_length = state.range(0); - const std::string file_path = IcingStringUtil::StringPrintf( - "%s%s%d%s", GetTestTempDir().c_str(), "/proto_", string_length, ".log"); - int max_proto_size = (1 << 24) - 1; // 16 MiB - bool compress = true; - - // Make sure it 
doesn't already exist. - filesystem.DeleteFile(file_path.c_str()); - - auto proto_log = - FileBackedProtoLog<DocumentProto>::Create( - &filesystem, file_path, - FileBackedProtoLog<DocumentProto>::Options(compress, max_proto_size)) - .ValueOrDie() - .proto_log; - - DocumentProto document = DocumentBuilder().SetKey("namespace", "uri").Build(); - - std::default_random_engine random; - const std::string rand_str = - RandomString(kAlNumAlphabet, string_length, &random); - - auto document_properties = document.add_properties(); - document_properties->set_name("string property"); - document_properties->add_string_values(rand_str); - - for (auto _ : state) { - testing::DoNotOptimize(proto_log->WriteProto(document)); - } - state.SetBytesProcessed(static_cast<int64_t>(state.iterations()) * - string_length); - - // Cleanup after ourselves - filesystem.DeleteFile(file_path.c_str()); -} -BENCHMARK(BM_Write) - ->Arg(1) - ->Arg(32) - ->Arg(512) - ->Arg(1024) - ->Arg(4 * 1024) - ->Arg(8 * 1024) - ->Arg(16 * 1024) - ->Arg(32 * 1024) - ->Arg(256 * 1024) - ->Arg(2 * 1024 * 1024) - ->Arg(8 * 1024 * 1024) - ->Arg(15 * 1024 * 1024); // We do 15MiB here since our max proto size is - // 16MiB, and we need some extra space for the - // rest of the document properties - -static void BM_Read(benchmark::State& state) { - const Filesystem filesystem; - int string_length = state.range(0); - const std::string file_path = IcingStringUtil::StringPrintf( - "%s%s%d%s", GetTestTempDir().c_str(), "/proto_", string_length, ".log"); - int max_proto_size = (1 << 24) - 1; // 16 MiB - bool compress = true; - - // Make sure it doesn't already exist. 
- filesystem.DeleteFile(file_path.c_str()); - - auto proto_log = - FileBackedProtoLog<DocumentProto>::Create( - &filesystem, file_path, - FileBackedProtoLog<DocumentProto>::Options(compress, max_proto_size)) - .ValueOrDie() - .proto_log; - - DocumentProto document = DocumentBuilder().SetKey("namespace", "uri").Build(); - - std::default_random_engine random; - const std::string rand_str = - RandomString(kAlNumAlphabet, string_length, &random); - - auto document_properties = document.add_properties(); - document_properties->set_name("string property"); - document_properties->add_string_values(rand_str); - - ICING_ASSERT_OK_AND_ASSIGN(int64_t write_offset, - proto_log->WriteProto(document)); - - for (auto _ : state) { - testing::DoNotOptimize(proto_log->ReadProto(write_offset)); - } - state.SetBytesProcessed(static_cast<int64_t>(state.iterations()) * - string_length); - - // Cleanup after ourselves - filesystem.DeleteFile(file_path.c_str()); -} -BENCHMARK(BM_Read) - ->Arg(1) - ->Arg(32) - ->Arg(512) - ->Arg(1024) - ->Arg(4 * 1024) - ->Arg(8 * 1024) - ->Arg(16 * 1024) - ->Arg(32 * 1024) - ->Arg(256 * 1024) - ->Arg(2 * 1024 * 1024) - ->Arg(8 * 1024 * 1024) - ->Arg(15 * 1024 * 1024); // We do 15MiB here since our max proto size is - // 16MiB, and we need some extra space for the - // rest of the document properties - -static void BM_Erase(benchmark::State& state) { - const Filesystem filesystem; - const std::string file_path = IcingStringUtil::StringPrintf( - "%s%s", GetTestTempDir().c_str(), "/proto.log"); - int max_proto_size = (1 << 24) - 1; // 16 MiB - bool compress = true; - - // Make sure it doesn't already exist. 
- filesystem.DeleteFile(file_path.c_str()); - - auto proto_log = - FileBackedProtoLog<DocumentProto>::Create( - &filesystem, file_path, - FileBackedProtoLog<DocumentProto>::Options(compress, max_proto_size)) - .ValueOrDie() - .proto_log; - - DocumentProto document = DocumentBuilder().SetKey("namespace", "uri").Build(); - - std::default_random_engine random; - const std::string rand_str = RandomString(kAlNumAlphabet, /*len=*/1, &random); - - auto document_properties = document.add_properties(); - document_properties->set_name("string property"); - document_properties->add_string_values(rand_str); - - for (auto _ : state) { - state.PauseTiming(); - ICING_ASSERT_OK_AND_ASSIGN(int64_t write_offset, - proto_log->WriteProto(document)); - state.ResumeTiming(); - - testing::DoNotOptimize(proto_log->EraseProto(write_offset)); - } - - // Cleanup after ourselves - filesystem.DeleteFile(file_path.c_str()); -} -BENCHMARK(BM_Erase); - -static void BM_ComputeChecksum(benchmark::State& state) { - const Filesystem filesystem; - const std::string file_path = GetTestTempDir() + "/proto.log"; - int max_proto_size = (1 << 24) - 1; // 16 MiB - bool compress = true; - - // Make sure it doesn't already exist. 
- filesystem.DeleteFile(file_path.c_str()); - - auto proto_log = - FileBackedProtoLog<DocumentProto>::Create( - &filesystem, file_path, - FileBackedProtoLog<DocumentProto>::Options(compress, max_proto_size)) - .ValueOrDie() - .proto_log; - - DocumentProto document = DocumentBuilder().SetKey("namespace", "uri").Build(); - - // Make each document 1KiB - int string_length = 1024; - std::default_random_engine random; - const std::string rand_str = - RandomString(kAlNumAlphabet, string_length, &random); - - auto document_properties = document.add_properties(); - document_properties->set_name("string property"); - document_properties->add_string_values(rand_str); - - int num_docs = state.range(0); - for (int i = 0; i < num_docs; ++i) { - ICING_ASSERT_OK(proto_log->WriteProto(document)); - } - - for (auto _ : state) { - testing::DoNotOptimize(proto_log->ComputeChecksum()); - } - - // Cleanup after ourselves - filesystem.DeleteFile(file_path.c_str()); -} -BENCHMARK(BM_ComputeChecksum)->Range(1024, 1 << 20); - -} // namespace -} // namespace lib -} // namespace icing diff --git a/icing/file/file-backed-proto-log_test.cc b/icing/file/file-backed-proto-log_test.cc index d429277..eccb0c7 100644 --- a/icing/file/file-backed-proto-log_test.cc +++ b/icing/file/file-backed-proto-log_test.cc @@ -19,10 +19,7 @@ #include "gmock/gmock.h" #include "gtest/gtest.h" -#include "icing/document-builder.h" #include "icing/file/filesystem.h" -#include "icing/file/mock-filesystem.h" -#include "icing/portable/equals-proto.h" #include "icing/proto/document.pb.h" #include "icing/testing/common-matchers.h" #include "icing/testing/tmp-directory.h" @@ -32,14 +29,7 @@ namespace lib { namespace { -using ::icing::lib::portable_equals_proto::EqualsProto; -using ::testing::A; -using ::testing::Eq; -using ::testing::Gt; -using ::testing::Not; using ::testing::NotNull; -using ::testing::Pair; -using ::testing::Return; class FileBackedProtoLogTest : public ::testing::Test { protected: @@ -87,193 +77,6 @@ 
TEST_F(FileBackedProtoLogTest, Initialize) { StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT)); } -TEST_F(FileBackedProtoLogTest, WriteProtoTooLarge) { - int max_proto_size = 1; - ICING_ASSERT_OK_AND_ASSIGN( - FileBackedProtoLog<DocumentProto>::CreateResult create_result, - FileBackedProtoLog<DocumentProto>::Create( - &filesystem_, file_path_, - FileBackedProtoLog<DocumentProto>::Options(compress_, - max_proto_size))); - auto proto_log = std::move(create_result.proto_log); - ASSERT_FALSE(create_result.has_data_loss()); - - DocumentProto document = DocumentBuilder().SetKey("namespace", "uri").Build(); - - // Proto is too large for the max_proto_size_in - ASSERT_THAT(proto_log->WriteProto(document), - StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT)); -} - -TEST_F(FileBackedProtoLogTest, ReadProtoWrongKProtoMagic) { - ICING_ASSERT_OK_AND_ASSIGN( - FileBackedProtoLog<DocumentProto>::CreateResult create_result, - FileBackedProtoLog<DocumentProto>::Create( - &filesystem_, file_path_, - FileBackedProtoLog<DocumentProto>::Options(compress_, - max_proto_size_))); - auto proto_log = std::move(create_result.proto_log); - ASSERT_FALSE(create_result.has_data_loss()); - - // Write a proto - DocumentProto document = DocumentBuilder().SetKey("namespace", "uri").Build(); - - ICING_ASSERT_OK_AND_ASSIGN(int64_t file_offset, - proto_log->WriteProto(document)); - - // The 4 bytes of metadata that just doesn't have the same kProtoMagic - // specified in file-backed-proto-log.h - uint32_t wrong_magic = 0x7E000000; - - // Sanity check that we opened the file correctly - int fd = filesystem_.OpenForWrite(file_path_.c_str()); - ASSERT_GT(fd, 0); - - // Write the wrong kProtoMagic in, kProtoMagics are stored at the beginning of - // a proto entry. 
- filesystem_.PWrite(fd, file_offset, &wrong_magic, sizeof(wrong_magic)); - - ASSERT_THAT(proto_log->ReadProto(file_offset), - StatusIs(libtextclassifier3::StatusCode::INTERNAL)); -} - -TEST_F(FileBackedProtoLogTest, ReadWriteUncompressedProto) { - int last_offset; - { - ICING_ASSERT_OK_AND_ASSIGN( - FileBackedProtoLog<DocumentProto>::CreateResult create_result, - FileBackedProtoLog<DocumentProto>::Create( - &filesystem_, file_path_, - FileBackedProtoLog<DocumentProto>::Options( - /*compress_in=*/false, max_proto_size_))); - auto proto_log = std::move(create_result.proto_log); - ASSERT_FALSE(create_result.has_data_loss()); - - // Write the first proto - DocumentProto document1 = - DocumentBuilder().SetKey("namespace1", "uri1").Build(); - - ICING_ASSERT_OK_AND_ASSIGN(int written_position, - proto_log->WriteProto(document1)); - - int document1_offset = written_position; - - // Check that what we read is what we wrote - ASSERT_THAT(proto_log->ReadProto(written_position), - IsOkAndHolds(EqualsProto(document1))); - - // Write a second proto that's close to the max size. Leave some room for - // the rest of the proto properties. - std::string long_str(max_proto_size_ - 1024, 'a'); - DocumentProto document2 = DocumentBuilder() - .SetKey("namespace2", "uri2") - .AddStringProperty("long_str", long_str) - .Build(); - - ICING_ASSERT_OK_AND_ASSIGN(written_position, - proto_log->WriteProto(document2)); - - int document2_offset = written_position; - last_offset = written_position; - ASSERT_GT(document2_offset, document1_offset); - - // Check the second proto - ASSERT_THAT(proto_log->ReadProto(written_position), - IsOkAndHolds(EqualsProto(document2))); - - ICING_ASSERT_OK(proto_log->PersistToDisk()); - } - - { - // Make a new proto_log with the same file_path, and make sure we - // can still write to the same underlying file. 
- ICING_ASSERT_OK_AND_ASSIGN( - FileBackedProtoLog<DocumentProto>::CreateResult create_result, - FileBackedProtoLog<DocumentProto>::Create( - &filesystem_, file_path_, - FileBackedProtoLog<DocumentProto>::Options( - /*compress_in=*/false, max_proto_size_))); - auto recreated_proto_log = std::move(create_result.proto_log); - ASSERT_FALSE(create_result.has_data_loss()); - - // Write a third proto - DocumentProto document3 = - DocumentBuilder().SetKey("namespace3", "uri3").Build(); - - ASSERT_THAT(recreated_proto_log->WriteProto(document3), - IsOkAndHolds(Gt(last_offset))); - } -} - -TEST_F(FileBackedProtoLogTest, ReadWriteCompressedProto) { - int last_offset; - - { - ICING_ASSERT_OK_AND_ASSIGN( - FileBackedProtoLog<DocumentProto>::CreateResult create_result, - FileBackedProtoLog<DocumentProto>::Create( - &filesystem_, file_path_, - FileBackedProtoLog<DocumentProto>::Options( - /*compress_in=*/true, max_proto_size_))); - auto proto_log = std::move(create_result.proto_log); - ASSERT_FALSE(create_result.has_data_loss()); - - // Write the first proto - DocumentProto document1 = - DocumentBuilder().SetKey("namespace1", "uri1").Build(); - - ICING_ASSERT_OK_AND_ASSIGN(int written_position, - proto_log->WriteProto(document1)); - - int document1_offset = written_position; - - // Check that what we read is what we wrote - ASSERT_THAT(proto_log->ReadProto(written_position), - IsOkAndHolds(EqualsProto(document1))); - - // Write a second proto that's close to the max size. Leave some room for - // the rest of the proto properties. 
- std::string long_str(max_proto_size_ - 1024, 'a'); - DocumentProto document2 = DocumentBuilder() - .SetKey("namespace2", "uri2") - .AddStringProperty("long_str", long_str) - .Build(); - - ICING_ASSERT_OK_AND_ASSIGN(written_position, - proto_log->WriteProto(document2)); - - int document2_offset = written_position; - last_offset = written_position; - ASSERT_GT(document2_offset, document1_offset); - - // Check the second proto - ASSERT_THAT(proto_log->ReadProto(written_position), - IsOkAndHolds(EqualsProto(document2))); - - ICING_ASSERT_OK(proto_log->PersistToDisk()); - } - - { - // Make a new proto_log with the same file_path, and make sure we - // can still write to the same underlying file. - ICING_ASSERT_OK_AND_ASSIGN( - FileBackedProtoLog<DocumentProto>::CreateResult create_result, - FileBackedProtoLog<DocumentProto>::Create( - &filesystem_, file_path_, - FileBackedProtoLog<DocumentProto>::Options( - /*compress_in=*/true, max_proto_size_))); - auto recreated_proto_log = std::move(create_result.proto_log); - ASSERT_FALSE(create_result.has_data_loss()); - - // Write a third proto - DocumentProto document3 = - DocumentBuilder().SetKey("namespace3", "uri3").Build(); - - ASSERT_THAT(recreated_proto_log->WriteProto(document3), - IsOkAndHolds(Gt(last_offset))); - } -} - TEST_F(FileBackedProtoLogTest, CorruptHeader) { { ICING_ASSERT_OK_AND_ASSIGN( @@ -303,382 +106,6 @@ TEST_F(FileBackedProtoLogTest, CorruptHeader) { } } -TEST_F(FileBackedProtoLogTest, CorruptContent) { - { - ICING_ASSERT_OK_AND_ASSIGN( - FileBackedProtoLog<DocumentProto>::CreateResult create_result, - FileBackedProtoLog<DocumentProto>::Create( - &filesystem_, file_path_, - FileBackedProtoLog<DocumentProto>::Options(compress_, - max_proto_size_))); - auto proto_log = std::move(create_result.proto_log); - EXPECT_FALSE(create_result.has_data_loss()); - - DocumentProto document = - DocumentBuilder().SetKey("namespace1", "uri1").Build(); - - // Write and persist an document. 
- ICING_ASSERT_OK_AND_ASSIGN(int document_offset, - proto_log->WriteProto(document)); - ICING_ASSERT_OK(proto_log->PersistToDisk()); - - // "Corrupt" the content written in the log. - document.set_uri("invalid"); - std::string serialized_document = document.SerializeAsString(); - filesystem_.PWrite(file_path_.c_str(), document_offset, - serialized_document.data(), serialized_document.size()); - } - - { - // We can recover, but we have data loss. - ICING_ASSERT_OK_AND_ASSIGN( - FileBackedProtoLog<DocumentProto>::CreateResult create_result, - FileBackedProtoLog<DocumentProto>::Create( - &filesystem_, file_path_, - FileBackedProtoLog<DocumentProto>::Options(compress_, - max_proto_size_))); - auto proto_log = std::move(create_result.proto_log); - ASSERT_TRUE(create_result.has_data_loss()); - ASSERT_THAT(create_result.data_loss, Eq(DataLoss::COMPLETE)); - - // Lost everything in the log since the rewind position doesn't help if - // there's been data corruption within the persisted region - ASSERT_EQ(filesystem_.GetFileSize(file_path_.c_str()), - sizeof(FileBackedProtoLog<DocumentProto>::Header)); - } -} - -TEST_F(FileBackedProtoLogTest, PersistToDisk) { - DocumentProto document1 = - DocumentBuilder().SetKey("namespace1", "uri1").Build(); - DocumentProto document2 = - DocumentBuilder().SetKey("namespace2", "uri2").Build(); - int document1_offset, document2_offset; - int log_size; - - { - ICING_ASSERT_OK_AND_ASSIGN( - FileBackedProtoLog<DocumentProto>::CreateResult create_result, - FileBackedProtoLog<DocumentProto>::Create( - &filesystem_, file_path_, - FileBackedProtoLog<DocumentProto>::Options(compress_, - max_proto_size_))); - auto proto_log = std::move(create_result.proto_log); - ASSERT_FALSE(create_result.has_data_loss()); - - // Write and persist the first proto - ICING_ASSERT_OK_AND_ASSIGN(document1_offset, - proto_log->WriteProto(document1)); - ICING_ASSERT_OK(proto_log->PersistToDisk()); - - // Write, but don't explicitly persist the second proto - 
ICING_ASSERT_OK_AND_ASSIGN(document2_offset, - proto_log->WriteProto(document2)); - - // Check that what we read is what we wrote - ASSERT_THAT(proto_log->ReadProto(document1_offset), - IsOkAndHolds(EqualsProto(document1))); - ASSERT_THAT(proto_log->ReadProto(document2_offset), - IsOkAndHolds(EqualsProto(document2))); - - log_size = filesystem_.GetFileSize(file_path_.c_str()); - ASSERT_GT(log_size, 0); - } - - { - // The header rewind position and checksum aren't updated in this "system - // crash" scenario. - - std::string bad_proto = - "some incomplete proto that we didn't finish writing before the system " - "crashed"; - filesystem_.PWrite(file_path_.c_str(), log_size, bad_proto.data(), - bad_proto.size()); - - // Double check that we actually wrote something to the underlying file - ASSERT_GT(filesystem_.GetFileSize(file_path_.c_str()), log_size); - } - - { - // We can recover, but we have data loss - ICING_ASSERT_OK_AND_ASSIGN( - FileBackedProtoLog<DocumentProto>::CreateResult create_result, - FileBackedProtoLog<DocumentProto>::Create( - &filesystem_, file_path_, - FileBackedProtoLog<DocumentProto>::Options(compress_, - max_proto_size_))); - auto proto_log = std::move(create_result.proto_log); - ASSERT_TRUE(create_result.has_data_loss()); - ASSERT_THAT(create_result.data_loss, Eq(DataLoss::PARTIAL)); - - // Check that everything was persisted across instances - ASSERT_THAT(proto_log->ReadProto(document1_offset), - IsOkAndHolds(EqualsProto(document1))); - ASSERT_THAT(proto_log->ReadProto(document2_offset), - IsOkAndHolds(EqualsProto(document2))); - - // We correctly rewound to the last good state. 
- ASSERT_EQ(log_size, filesystem_.GetFileSize(file_path_.c_str())); - } -} - -TEST_F(FileBackedProtoLogTest, Iterator) { - DocumentProto document1 = - DocumentBuilder().SetKey("namespace", "uri1").Build(); - DocumentProto document2 = - DocumentBuilder().SetKey("namespace", "uri2").Build(); - - ICING_ASSERT_OK_AND_ASSIGN( - FileBackedProtoLog<DocumentProto>::CreateResult create_result, - FileBackedProtoLog<DocumentProto>::Create( - &filesystem_, file_path_, - FileBackedProtoLog<DocumentProto>::Options(compress_, - max_proto_size_))); - auto proto_log = std::move(create_result.proto_log); - ASSERT_FALSE(create_result.has_data_loss()); - - { - // Empty iterator - auto iterator = proto_log->GetIterator(); - ASSERT_THAT(iterator.Advance(), - StatusIs(libtextclassifier3::StatusCode::OUT_OF_RANGE)); - } - - { - // Iterates through some documents - ICING_ASSERT_OK(proto_log->WriteProto(document1)); - ICING_ASSERT_OK(proto_log->WriteProto(document2)); - auto iterator = proto_log->GetIterator(); - // 1st proto - ICING_ASSERT_OK(iterator.Advance()); - ASSERT_THAT(proto_log->ReadProto(iterator.GetOffset()), - IsOkAndHolds(EqualsProto(document1))); - // 2nd proto - ICING_ASSERT_OK(iterator.Advance()); - ASSERT_THAT(proto_log->ReadProto(iterator.GetOffset()), - IsOkAndHolds(EqualsProto(document2))); - // Tries to advance - ASSERT_THAT(iterator.Advance(), - StatusIs(libtextclassifier3::StatusCode::OUT_OF_RANGE)); - } - - { - // Iterator with bad filesystem - MockFilesystem mock_filesystem; - ON_CALL(mock_filesystem, GetFileSize(A<const char *>())) - .WillByDefault(Return(Filesystem::kBadFileSize)); - FileBackedProtoLog<DocumentProto>::Iterator bad_iterator( - mock_filesystem, file_path_, /*initial_offset=*/0); - ASSERT_THAT(bad_iterator.Advance(), - StatusIs(libtextclassifier3::StatusCode::OUT_OF_RANGE)); - } -} - -TEST_F(FileBackedProtoLogTest, ComputeChecksum) { - DocumentProto document = DocumentBuilder().SetKey("namespace", "uri").Build(); - Crc32 checksum; - - { - 
ICING_ASSERT_OK_AND_ASSIGN( - FileBackedProtoLog<DocumentProto>::CreateResult create_result, - FileBackedProtoLog<DocumentProto>::Create( - &filesystem_, file_path_, - FileBackedProtoLog<DocumentProto>::Options(compress_, - max_proto_size_))); - auto proto_log = std::move(create_result.proto_log); - ASSERT_FALSE(create_result.has_data_loss()); - - ICING_EXPECT_OK(proto_log->WriteProto(document)); - - ICING_ASSERT_OK_AND_ASSIGN(checksum, proto_log->ComputeChecksum()); - - // Calling it twice with no changes should get us the same checksum - EXPECT_THAT(proto_log->ComputeChecksum(), IsOkAndHolds(Eq(checksum))); - } - - { - ICING_ASSERT_OK_AND_ASSIGN( - FileBackedProtoLog<DocumentProto>::CreateResult create_result, - FileBackedProtoLog<DocumentProto>::Create( - &filesystem_, file_path_, - FileBackedProtoLog<DocumentProto>::Options(compress_, - max_proto_size_))); - auto proto_log = std::move(create_result.proto_log); - ASSERT_FALSE(create_result.has_data_loss()); - - // Checksum should be consistent across instances - EXPECT_THAT(proto_log->ComputeChecksum(), IsOkAndHolds(Eq(checksum))); - - // PersistToDisk shouldn't affect the checksum value - ICING_EXPECT_OK(proto_log->PersistToDisk()); - EXPECT_THAT(proto_log->ComputeChecksum(), IsOkAndHolds(Eq(checksum))); - - // Check that modifying the log leads to a different checksum - ICING_EXPECT_OK(proto_log->WriteProto(document)); - EXPECT_THAT(proto_log->ComputeChecksum(), IsOkAndHolds(Not(Eq(checksum)))); - } -} - -TEST_F(FileBackedProtoLogTest, EraseProtoShouldSetZero) { - DocumentProto document1 = - DocumentBuilder().SetKey("namespace", "uri1").Build(); - - ICING_ASSERT_OK_AND_ASSIGN( - FileBackedProtoLog<DocumentProto>::CreateResult create_result, - FileBackedProtoLog<DocumentProto>::Create( - &filesystem_, file_path_, - FileBackedProtoLog<DocumentProto>::Options(compress_, - max_proto_size_))); - auto proto_log = std::move(create_result.proto_log); - ASSERT_FALSE(create_result.has_data_loss()); - - // Writes and 
erases proto - ICING_ASSERT_OK_AND_ASSIGN(int64_t document1_offset, - proto_log->WriteProto(document1)); - ICING_ASSERT_OK(proto_log->EraseProto(document1_offset)); - - // Checks if the erased area is set to 0. - int64_t file_size = filesystem_.GetFileSize(file_path_.c_str()); - MemoryMappedFile mmapped_file(filesystem_, file_path_, - MemoryMappedFile::Strategy::READ_ONLY); - - // document1_offset + sizeof(int) is the start byte of the proto where - // sizeof(int) is the size of the proto metadata. - mmapped_file.Remap(document1_offset + sizeof(int), file_size - 1); - for (size_t i = 0; i < mmapped_file.region_size(); ++i) { - ASSERT_THAT(mmapped_file.region()[i], Eq(0)); - } -} - -TEST_F(FileBackedProtoLogTest, EraseProtoShouldReturnNotFound) { - DocumentProto document1 = - DocumentBuilder().SetKey("namespace", "uri1").Build(); - DocumentProto document2 = - DocumentBuilder().SetKey("namespace", "uri2").Build(); - - ICING_ASSERT_OK_AND_ASSIGN( - FileBackedProtoLog<DocumentProto>::CreateResult create_result, - FileBackedProtoLog<DocumentProto>::Create( - &filesystem_, file_path_, - FileBackedProtoLog<DocumentProto>::Options(compress_, - max_proto_size_))); - auto proto_log = std::move(create_result.proto_log); - ASSERT_FALSE(create_result.has_data_loss()); - - // Writes 2 protos - ICING_ASSERT_OK_AND_ASSIGN(int64_t document1_offset, - proto_log->WriteProto(document1)); - ICING_ASSERT_OK_AND_ASSIGN(int64_t document2_offset, - proto_log->WriteProto(document2)); - - // Erases the first proto - ICING_ASSERT_OK(proto_log->EraseProto(document1_offset)); - - // The first proto has been erased. - ASSERT_THAT(proto_log->ReadProto(document1_offset), - StatusIs(libtextclassifier3::StatusCode::NOT_FOUND)); - // The second proto should be returned. 
- ASSERT_THAT(proto_log->ReadProto(document2_offset), - IsOkAndHolds(EqualsProto(document2))); -} - -TEST_F(FileBackedProtoLogTest, ChecksumShouldBeCorrectWithErasedProto) { - DocumentProto document1 = - DocumentBuilder().SetKey("namespace", "uri1").Build(); - DocumentProto document2 = - DocumentBuilder().SetKey("namespace", "uri2").Build(); - DocumentProto document3 = - DocumentBuilder().SetKey("namespace", "uri3").Build(); - DocumentProto document4 = - DocumentBuilder().SetKey("namespace", "uri4").Build(); - - int64_t document2_offset; - int64_t document3_offset; - - { - // Erase data after the rewind position. This won't update the checksum - // immediately. - ICING_ASSERT_OK_AND_ASSIGN( - FileBackedProtoLog<DocumentProto>::CreateResult create_result, - FileBackedProtoLog<DocumentProto>::Create( - &filesystem_, file_path_, - FileBackedProtoLog<DocumentProto>::Options(compress_, - max_proto_size_))); - auto proto_log = std::move(create_result.proto_log); - ASSERT_FALSE(create_result.has_data_loss()); - - // Writes 3 protos - ICING_ASSERT_OK_AND_ASSIGN(int64_t document1_offset, - proto_log->WriteProto(document1)); - ICING_ASSERT_OK_AND_ASSIGN(document2_offset, - proto_log->WriteProto(document2)); - ICING_ASSERT_OK_AND_ASSIGN(document3_offset, - proto_log->WriteProto(document3)); - - // Erases the 1st proto, checksum won't be updated immediately because the - // rewind position is 0. - ICING_ASSERT_OK(proto_log->EraseProto(document1_offset)); - - EXPECT_THAT(proto_log->ComputeChecksum(), - IsOkAndHolds(Eq(Crc32(2293202502)))); - } // New checksum is updated in destructor. - - { - // Erase data before the rewind position. This will update the checksum - // immediately. 
- ICING_ASSERT_OK_AND_ASSIGN( - FileBackedProtoLog<DocumentProto>::CreateResult create_result, - FileBackedProtoLog<DocumentProto>::Create( - &filesystem_, file_path_, - FileBackedProtoLog<DocumentProto>::Options(compress_, - max_proto_size_))); - auto proto_log = std::move(create_result.proto_log); - ASSERT_FALSE(create_result.has_data_loss()); - - // Erases the 2nd proto that is now before the rewind position. Checksum is - // updated. - ICING_ASSERT_OK(proto_log->EraseProto(document2_offset)); - - EXPECT_THAT(proto_log->ComputeChecksum(), - IsOkAndHolds(Eq(Crc32(639634028)))); - } - - { - // Append data and erase data before the rewind position. This will update - // the checksum twice: in EraseProto() and destructor. - ICING_ASSERT_OK_AND_ASSIGN( - FileBackedProtoLog<DocumentProto>::CreateResult create_result, - FileBackedProtoLog<DocumentProto>::Create( - &filesystem_, file_path_, - FileBackedProtoLog<DocumentProto>::Options(compress_, - max_proto_size_))); - auto proto_log = std::move(create_result.proto_log); - ASSERT_FALSE(create_result.has_data_loss()); - - // Append a new document which is after the rewind position. - ICING_ASSERT_OK(proto_log->WriteProto(document4)); - - // Erases the 3rd proto that is now before the rewind position. Checksum is - // updated. - ICING_ASSERT_OK(proto_log->EraseProto(document3_offset)); - - EXPECT_THAT(proto_log->ComputeChecksum(), - IsOkAndHolds(Eq(Crc32(1990198693)))); - } // Checksum is updated with the newly appended document. - - { - // A successful creation means that the checksum matches. 
- ICING_ASSERT_OK_AND_ASSIGN( - FileBackedProtoLog<DocumentProto>::CreateResult create_result, - FileBackedProtoLog<DocumentProto>::Create( - &filesystem_, file_path_, - FileBackedProtoLog<DocumentProto>::Options(compress_, - max_proto_size_))); - auto proto_log = std::move(create_result.proto_log); - EXPECT_FALSE(create_result.has_data_loss()); - } -} - } // namespace } // namespace lib } // namespace icing diff --git a/icing/icing-search-engine_benchmark.cc b/icing/icing-search-engine_benchmark.cc index ba9aed1..5e610d5 100644 --- a/icing/icing-search-engine_benchmark.cc +++ b/icing/icing-search-engine_benchmark.cc @@ -43,7 +43,6 @@ #include "icing/testing/common-matchers.h" #include "icing/testing/document-generator.h" #include "icing/testing/random-string.h" -#include "icing/testing/recorder-test-utils.h" #include "icing/testing/schema-generator.h" #include "icing/testing/tmp-directory.h" @@ -178,12 +177,12 @@ class DestructibleDirectory { }; std::vector<DocumentProto> GenerateRandomDocuments( - EvenDistributionTypeSelector* type_selector, int num_docs) { + EvenDistributionTypeSelector* type_selector, int num_docs, + const std::vector<std::string>& language) { std::vector<std::string> namespaces = CreateNamespaces(kAvgNumNamespaces); EvenDistributionNamespaceSelector namespace_selector(namespaces); std::default_random_engine random; - std::vector<std::string> language = CreateLanguages(kLanguageSize, &random); UniformDistributionLanguageTokenGenerator<std::default_random_engine> token_generator(language, &random); @@ -227,8 +226,9 @@ void BM_IndexLatency(benchmark::State& state) { ASSERT_THAT(icing->SetSchema(schema).status(), ProtoIsOk()); int num_docs = state.range(0); + std::vector<std::string> language = CreateLanguages(kLanguageSize, &random); const std::vector<DocumentProto> random_docs = - GenerateRandomDocuments(&type_selector, num_docs); + GenerateRandomDocuments(&type_selector, num_docs, language); Timer timer; for (const DocumentProto& doc : 
random_docs) { ASSERT_THAT(icing->Put(doc).status(), ProtoIsOk()); @@ -271,6 +271,56 @@ BENCHMARK(BM_IndexLatency) ->ArgPair(1 << 15, 10) ->ArgPair(1 << 17, 10); +void BM_QueryLatency(benchmark::State& state) { + // Initialize the filesystem + std::string test_dir = GetTestTempDir() + "/icing/benchmark"; + Filesystem filesystem; + DestructibleDirectory ddir(filesystem, test_dir); + + // Create the schema. + std::default_random_engine random; + int num_types = kAvgNumNamespaces * kAvgNumTypes; + ExactStringPropertyGenerator property_generator; + SchemaGenerator<ExactStringPropertyGenerator> schema_generator( + /*num_properties=*/state.range(1), &property_generator); + SchemaProto schema = schema_generator.GenerateSchema(num_types); + EvenDistributionTypeSelector type_selector(schema); + + // Create the index. + IcingSearchEngineOptions options; + options.set_base_dir(test_dir); + options.set_index_merge_size(kIcingFullIndexSize); + std::unique_ptr<IcingSearchEngine> icing = + std::make_unique<IcingSearchEngine>(options); + + ASSERT_THAT(icing->Initialize().status(), ProtoIsOk()); + ASSERT_THAT(icing->SetSchema(schema).status(), ProtoIsOk()); + + int num_docs = state.range(0); + std::vector<std::string> language = CreateLanguages(kLanguageSize, &random); + const std::vector<DocumentProto> random_docs = + GenerateRandomDocuments(&type_selector, num_docs, language); + for (const DocumentProto& doc : random_docs) { + ASSERT_THAT(icing->Put(doc).status(), ProtoIsOk()); + } + + SearchSpecProto search_spec = CreateSearchSpec( + language.at(0), std::vector<std::string>(), TermMatchType::PREFIX); + ResultSpecProto result_spec = CreateResultSpec(1000000, 1000000, 1000000); + ScoringSpecProto scoring_spec = + CreateScoringSpec(ScoringSpecProto::RankingStrategy::CREATION_TIMESTAMP); + for (auto _ : state) { + SearchResultProto results = icing->Search( + search_spec, ScoringSpecProto::default_instance(), result_spec); + } +} +BENCHMARK(BM_QueryLatency) + // Arguments: 
num_indexed_documents, num_sections + ->ArgPair(32, 2) + ->ArgPair(128, 2) + ->ArgPair(1 << 10, 2) + ->ArgPair(1 << 13, 2); + void BM_IndexThroughput(benchmark::State& state) { // Initialize the filesystem std::string test_dir = GetTestTempDir() + "/icing/benchmark"; @@ -297,8 +347,9 @@ void BM_IndexThroughput(benchmark::State& state) { ASSERT_THAT(icing->SetSchema(schema).status(), ProtoIsOk()); int num_docs = state.range(0); + std::vector<std::string> language = CreateLanguages(kLanguageSize, &random); const std::vector<DocumentProto> random_docs = - GenerateRandomDocuments(&type_selector, num_docs); + GenerateRandomDocuments(&type_selector, num_docs, language); for (auto s : state) { for (const DocumentProto& doc : random_docs) { ASSERT_THAT(icing->Put(doc).status(), ProtoIsOk()); diff --git a/icing/result/snippet-retriever.cc b/icing/result/snippet-retriever.cc index 33b343e..c46762e 100644 --- a/icing/result/snippet-retriever.cc +++ b/icing/result/snippet-retriever.cc @@ -109,7 +109,7 @@ class TokenMatcherExact : public TokenMatcher { } if (itr != unrestricted_query_terms_.end() && itr != restricted_query_terms_.end()) { - return normalizer_.CalculateNormalizedMatchLength(token.text, *itr); + return normalizer_.FindNormalizedMatchEndPosition(token.text, *itr); } return CharacterIterator(token.text, -1, -1, -1); } @@ -135,14 +135,14 @@ class TokenMatcherPrefix : public TokenMatcher { for (const std::string& query_term : unrestricted_query_terms_) { if (query_term.length() <= s.length() && s.compare(0, query_term.length(), query_term) == 0) { - return normalizer_.CalculateNormalizedMatchLength(token.text, + return normalizer_.FindNormalizedMatchEndPosition(token.text, query_term); } } for (const std::string& query_term : restricted_query_terms_) { if (query_term.length() <= s.length() && s.compare(0, query_term.length(), query_term) == 0) { - return normalizer_.CalculateNormalizedMatchLength(token.text, + return 
normalizer_.FindNormalizedMatchEndPosition(token.text, query_term); } } diff --git a/icing/result/snippet-retriever_test.cc b/icing/result/snippet-retriever_test.cc index ad70038..f811941 100644 --- a/icing/result/snippet-retriever_test.cc +++ b/icing/result/snippet-retriever_test.cc @@ -691,10 +691,7 @@ TEST_F(SnippetRetrieverTest, PrefixSnippeting) { EXPECT_THAT(GetWindows(content, snippet.entries(0)), ElementsAre("subject foo")); EXPECT_THAT(GetMatches(content, snippet.entries(0)), ElementsAre("foo")); - - if (normalizer_factory::GetNormalizerName() == MapNormalizer::kName) { - EXPECT_THAT(GetSubMatches(content, snippet.entries(0)), ElementsAre("f")); - } + EXPECT_THAT(GetSubMatches(content, snippet.entries(0)), ElementsAre("f")); } TEST_F(SnippetRetrieverTest, ExactSnippeting) { @@ -738,9 +735,7 @@ TEST_F(SnippetRetrieverTest, SimpleSnippetingNoWindowing) { GetString(&document, snippet.entries(0).property_name()); EXPECT_THAT(GetWindows(content, snippet.entries(0)), ElementsAre("")); EXPECT_THAT(GetMatches(content, snippet.entries(0)), ElementsAre("foo")); - if (normalizer_factory::GetNormalizerName() == MapNormalizer::kName) { - EXPECT_THAT(GetSubMatches(content, snippet.entries(0)), ElementsAre("foo")); - } + EXPECT_THAT(GetSubMatches(content, snippet.entries(0)), ElementsAre("foo")); } TEST_F(SnippetRetrieverTest, SnippetingMultipleMatches) { @@ -787,19 +782,15 @@ TEST_F(SnippetRetrieverTest, SnippetingMultipleMatches) { "we need to begin considering our options regarding body bar.")); EXPECT_THAT(GetMatches(content, snippet.entries(0)), ElementsAre("foo", "bar")); - if (normalizer_factory::GetNormalizerName() == MapNormalizer::kName) { - EXPECT_THAT(GetSubMatches(content, snippet.entries(0)), - ElementsAre("foo", "bar")); - } + EXPECT_THAT(GetSubMatches(content, snippet.entries(0)), + ElementsAre("foo", "bar")); EXPECT_THAT(snippet.entries(1).property_name(), Eq("subject")); content = GetString(&document, snippet.entries(1).property_name()); 
EXPECT_THAT(GetWindows(content, snippet.entries(1)), ElementsAre("subject foo")); EXPECT_THAT(GetMatches(content, snippet.entries(1)), ElementsAre("foo")); - if (normalizer_factory::GetNormalizerName() == MapNormalizer::kName) { - EXPECT_THAT(GetSubMatches(content, snippet.entries(1)), ElementsAre("foo")); - } + EXPECT_THAT(GetSubMatches(content, snippet.entries(1)), ElementsAre("foo")); } TEST_F(SnippetRetrieverTest, SnippetingMultipleMatchesSectionRestrict) { @@ -849,10 +840,8 @@ TEST_F(SnippetRetrieverTest, SnippetingMultipleMatchesSectionRestrict) { "we need to begin considering our options regarding body bar.")); EXPECT_THAT(GetMatches(content, snippet.entries(0)), ElementsAre("foo", "bar")); - if (normalizer_factory::GetNormalizerName() == MapNormalizer::kName) { - EXPECT_THAT(GetSubMatches(content, snippet.entries(0)), - ElementsAre("foo", "bar")); - } + EXPECT_THAT(GetSubMatches(content, snippet.entries(0)), + ElementsAre("foo", "bar")); } TEST_F(SnippetRetrieverTest, SnippetingMultipleMatchesSectionRestrictedTerm) { @@ -903,20 +892,16 @@ TEST_F(SnippetRetrieverTest, SnippetingMultipleMatchesSectionRestrictedTerm) { "Concerning the subject of foo, we need to begin considering our")); EXPECT_THAT(GetMatches(content, snippet.entries(0)), ElementsAre("subject", "foo")); - if (normalizer_factory::GetNormalizerName() == MapNormalizer::kName) { - EXPECT_THAT(GetSubMatches(content, snippet.entries(0)), - ElementsAre("subject", "foo")); - } + EXPECT_THAT(GetSubMatches(content, snippet.entries(0)), + ElementsAre("subject", "foo")); EXPECT_THAT(snippet.entries(1).property_name(), Eq("subject")); content = GetString(&document, snippet.entries(1).property_name()); EXPECT_THAT(GetWindows(content, snippet.entries(1)), ElementsAre("subject foo")); EXPECT_THAT(GetMatches(content, snippet.entries(1)), ElementsAre("subject")); - if (normalizer_factory::GetNormalizerName() == MapNormalizer::kName) { - EXPECT_THAT(GetSubMatches(content, snippet.entries(1)), - 
ElementsAre("subject")); - } + EXPECT_THAT(GetSubMatches(content, snippet.entries(1)), + ElementsAre("subject")); } TEST_F(SnippetRetrieverTest, SnippetingMultipleMatchesOneMatchPerProperty) { @@ -960,18 +945,14 @@ TEST_F(SnippetRetrieverTest, SnippetingMultipleMatchesOneMatchPerProperty) { ElementsAre( "Concerning the subject of foo, we need to begin considering our")); EXPECT_THAT(GetMatches(content, snippet.entries(0)), ElementsAre("foo")); - if (normalizer_factory::GetNormalizerName() == MapNormalizer::kName) { - EXPECT_THAT(GetSubMatches(content, snippet.entries(0)), ElementsAre("foo")); - } + EXPECT_THAT(GetSubMatches(content, snippet.entries(0)), ElementsAre("foo")); EXPECT_THAT(snippet.entries(1).property_name(), Eq("subject")); content = GetString(&document, snippet.entries(1).property_name()); EXPECT_THAT(GetWindows(content, snippet.entries(1)), ElementsAre("subject foo")); EXPECT_THAT(GetMatches(content, snippet.entries(1)), ElementsAre("foo")); - if (normalizer_factory::GetNormalizerName() == MapNormalizer::kName) { - EXPECT_THAT(GetSubMatches(content, snippet.entries(1)), ElementsAre("foo")); - } + EXPECT_THAT(GetSubMatches(content, snippet.entries(1)), ElementsAre("foo")); } TEST_F(SnippetRetrieverTest, PrefixSnippetingNormalization) { @@ -993,9 +974,7 @@ TEST_F(SnippetRetrieverTest, PrefixSnippetingNormalization) { GetString(&document, snippet.entries(0).property_name()); EXPECT_THAT(GetWindows(content, snippet.entries(0)), ElementsAre("MDI team")); EXPECT_THAT(GetMatches(content, snippet.entries(0)), ElementsAre("MDI")); - if (normalizer_factory::GetNormalizerName() == MapNormalizer::kName) { - EXPECT_THAT(GetSubMatches(content, snippet.entries(0)), ElementsAre("MD")); - } + EXPECT_THAT(GetSubMatches(content, snippet.entries(0)), ElementsAre("MD")); } TEST_F(SnippetRetrieverTest, ExactSnippetingNormalization) { @@ -1020,10 +999,8 @@ TEST_F(SnippetRetrieverTest, ExactSnippetingNormalization) { ElementsAre("Some members are in Zürich.")); 
EXPECT_THAT(GetMatches(content, snippet.entries(0)), ElementsAre("Zürich")); - if (normalizer_factory::GetNormalizerName() == MapNormalizer::kName) { - EXPECT_THAT(GetSubMatches(content, snippet.entries(0)), - ElementsAre("Zürich")); - } + EXPECT_THAT(GetSubMatches(content, snippet.entries(0)), + ElementsAre("Zürich")); } TEST_F(SnippetRetrieverTest, SnippetingTestOneLevel) { @@ -1084,20 +1061,13 @@ TEST_F(SnippetRetrieverTest, SnippetingTestOneLevel) { GetString(&document, snippet.entries(0).property_name()); EXPECT_THAT(GetWindows(content, snippet.entries(0)), ElementsAre("polo")); EXPECT_THAT(GetMatches(content, snippet.entries(0)), ElementsAre("polo")); - if (normalizer_factory::GetNormalizerName() == MapNormalizer::kName) { - EXPECT_THAT(GetSubMatches(content, snippet.entries(0)), - ElementsAre("polo")); - } + EXPECT_THAT(GetSubMatches(content, snippet.entries(0)), ElementsAre("polo")); EXPECT_THAT(snippet.entries(1).property_name(), Eq("X[3]")); content = GetString(&document, snippet.entries(1).property_name()); EXPECT_THAT(GetWindows(content, snippet.entries(1)), ElementsAre("polo")); EXPECT_THAT(GetMatches(content, snippet.entries(1)), ElementsAre("polo")); - - if (normalizer_factory::GetNormalizerName() == MapNormalizer::kName) { - EXPECT_THAT(GetSubMatches(content, snippet.entries(1)), - ElementsAre("polo")); - } + EXPECT_THAT(GetSubMatches(content, snippet.entries(1)), ElementsAre("polo")); EXPECT_THAT(GetPropertyPaths(snippet), ElementsAre("X[1]", "X[3]", "Y[1]", "Y[3]", "Z[1]", "Z[3]")); @@ -1194,19 +1164,13 @@ TEST_F(SnippetRetrieverTest, SnippetingTestMultiLevel) { GetString(&document, snippet.entries(0).property_name()); EXPECT_THAT(GetWindows(content, snippet.entries(0)), ElementsAre("polo")); EXPECT_THAT(GetMatches(content, snippet.entries(0)), ElementsAre("polo")); - if (normalizer_factory::GetNormalizerName() == MapNormalizer::kName) { - EXPECT_THAT(GetSubMatches(content, snippet.entries(0)), - ElementsAre("polo")); - } + 
EXPECT_THAT(GetSubMatches(content, snippet.entries(0)), ElementsAre("polo")); EXPECT_THAT(snippet.entries(1).property_name(), Eq("A.X[3]")); content = GetString(&document, snippet.entries(1).property_name()); EXPECT_THAT(GetWindows(content, snippet.entries(1)), ElementsAre("polo")); EXPECT_THAT(GetMatches(content, snippet.entries(1)), ElementsAre("polo")); - if (normalizer_factory::GetNormalizerName() == MapNormalizer::kName) { - EXPECT_THAT(GetSubMatches(content, snippet.entries(1)), - ElementsAre("polo")); - } + EXPECT_THAT(GetSubMatches(content, snippet.entries(1)), ElementsAre("polo")); EXPECT_THAT( GetPropertyPaths(snippet), @@ -1309,19 +1273,13 @@ TEST_F(SnippetRetrieverTest, SnippetingTestMultiLevelRepeated) { GetString(&document, snippet.entries(0).property_name()); EXPECT_THAT(GetWindows(content, snippet.entries(0)), ElementsAre("polo")); EXPECT_THAT(GetMatches(content, snippet.entries(0)), ElementsAre("polo")); - if (normalizer_factory::GetNormalizerName() == MapNormalizer::kName) { - EXPECT_THAT(GetSubMatches(content, snippet.entries(0)), - ElementsAre("polo")); - } + EXPECT_THAT(GetSubMatches(content, snippet.entries(0)), ElementsAre("polo")); EXPECT_THAT(snippet.entries(1).property_name(), Eq("A[0].X[3]")); content = GetString(&document, snippet.entries(1).property_name()); EXPECT_THAT(GetWindows(content, snippet.entries(1)), ElementsAre("polo")); EXPECT_THAT(GetMatches(content, snippet.entries(1)), ElementsAre("polo")); - if (normalizer_factory::GetNormalizerName() == MapNormalizer::kName) { - EXPECT_THAT(GetSubMatches(content, snippet.entries(1)), - ElementsAre("polo")); - } + EXPECT_THAT(GetSubMatches(content, snippet.entries(1)), ElementsAre("polo")); EXPECT_THAT(GetPropertyPaths(snippet), ElementsAre("A[0].X[1]", "A[0].X[3]", "A[1].X[1]", "A[1].X[3]", @@ -1422,19 +1380,13 @@ TEST_F(SnippetRetrieverTest, SnippetingTestMultiLevelSingleValue) { GetString(&document, snippet.entries(0).property_name()); EXPECT_THAT(GetWindows(content, 
snippet.entries(0)), ElementsAre("polo")); EXPECT_THAT(GetMatches(content, snippet.entries(0)), ElementsAre("polo")); - if (normalizer_factory::GetNormalizerName() == MapNormalizer::kName) { - EXPECT_THAT(GetSubMatches(content, snippet.entries(0)), - ElementsAre("polo")); - } + EXPECT_THAT(GetSubMatches(content, snippet.entries(0)), ElementsAre("polo")); EXPECT_THAT(snippet.entries(1).property_name(), Eq("A[1].X")); content = GetString(&document, snippet.entries(1).property_name()); EXPECT_THAT(GetWindows(content, snippet.entries(1)), ElementsAre("polo")); EXPECT_THAT(GetMatches(content, snippet.entries(1)), ElementsAre("polo")); - if (normalizer_factory::GetNormalizerName() == MapNormalizer::kName) { - EXPECT_THAT(GetSubMatches(content, snippet.entries(1)), - ElementsAre("polo")); - } + EXPECT_THAT(GetSubMatches(content, snippet.entries(1)), ElementsAre("polo")); EXPECT_THAT( GetPropertyPaths(snippet), @@ -1478,16 +1430,12 @@ TEST_F(SnippetRetrieverTest, CJKSnippetMatchTest) { // Ensure that the match is correct. EXPECT_THAT(GetMatches(content, *entry), ElementsAre("走路")); - if (normalizer_factory::GetNormalizerName() == MapNormalizer::kName) { - EXPECT_THAT(GetSubMatches(content, *entry), ElementsAre("走")); - } + EXPECT_THAT(GetSubMatches(content, *entry), ElementsAre("走")); // Ensure that the utf-16 values are also as expected EXPECT_THAT(match_proto.exact_match_utf16_position(), Eq(3)); EXPECT_THAT(match_proto.exact_match_utf16_length(), Eq(2)); - if (normalizer_factory::GetNormalizerName() == MapNormalizer::kName) { - EXPECT_THAT(match_proto.submatch_utf16_length(), Eq(1)); - } + EXPECT_THAT(match_proto.submatch_utf16_length(), Eq(1)); } TEST_F(SnippetRetrieverTest, CJKSnippetWindowTest) { @@ -1587,16 +1535,12 @@ TEST_F(SnippetRetrieverTest, Utf16MultiCodeUnitSnippetMatchTest) { // Ensure that the match is correct. 
EXPECT_THAT(GetMatches(content, *entry), ElementsAre("𐀂𐀃")); - if (normalizer_factory::GetNormalizerName() == MapNormalizer::kName) { - EXPECT_THAT(GetSubMatches(content, *entry), ElementsAre("𐀂")); - } + EXPECT_THAT(GetSubMatches(content, *entry), ElementsAre("𐀂")); // Ensure that the utf-16 values are also as expected EXPECT_THAT(match_proto.exact_match_utf16_position(), Eq(5)); EXPECT_THAT(match_proto.exact_match_utf16_length(), Eq(4)); - if (normalizer_factory::GetNormalizerName() == MapNormalizer::kName) { - EXPECT_THAT(match_proto.submatch_utf16_length(), Eq(2)); - } + EXPECT_THAT(match_proto.submatch_utf16_length(), Eq(2)); } TEST_F(SnippetRetrieverTest, Utf16MultiCodeUnitWindowTest) { diff --git a/icing/tokenization/icu/icu-language-segmenter.cc b/icing/tokenization/icu/icu-language-segmenter.cc index cb31441..598ede7 100644 --- a/icing/tokenization/icu/icu-language-segmenter.cc +++ b/icing/tokenization/icu/icu-language-segmenter.cc @@ -300,9 +300,10 @@ class IcuLanguageSegmenterIterator : public LanguageSegmenter::Iterator { UChar32 uchar32 = i18n_utils::GetUChar32At(text_.data(), text_.length(), term_start_index_); - // Rule 2: for non-ASCII terms, only the alphabetic terms are returned. - // We know it's an alphabetic term by checking the first unicode character. - if (u_isUAlphabetic(uchar32)) { + // Rule 2: for non-ASCII terms, only the alphanumeric terms are returned. + // We know it's an alphanumeric term by checking the first unicode + // character. 
+ if (i18n_utils::IsAlphaNumeric(uchar32)) { return true; } return false; diff --git a/icing/tokenization/icu/icu-language-segmenter_test.cc b/icing/tokenization/icu/icu-language-segmenter_test.cc index 01eb7d8..3090087 100644 --- a/icing/tokenization/icu/icu-language-segmenter_test.cc +++ b/icing/tokenization/icu/icu-language-segmenter_test.cc @@ -372,6 +372,15 @@ TEST_P(IcuLanguageSegmenterAllLocalesTest, Number) { IsOkAndHolds(ElementsAre("-", "123"))); } +TEST_P(IcuLanguageSegmenterAllLocalesTest, FullWidthNumbers) { + ICING_ASSERT_OK_AND_ASSIGN( + auto language_segmenter, + language_segmenter_factory::Create( + GetSegmenterOptions(GetLocale(), jni_cache_.get()))); + EXPECT_THAT(language_segmenter->GetAllTerms("０１２３４５６７８９"), + IsOkAndHolds(ElementsAre("０１２３４５６７８９"))); +} + TEST_P(IcuLanguageSegmenterAllLocalesTest, ContinuousWhitespaces) { ICING_ASSERT_OK_AND_ASSIGN( auto language_segmenter, diff --git a/icing/tokenization/reverse_jni/reverse-jni-language-segmenter.cc b/icing/tokenization/reverse_jni/reverse-jni-language-segmenter.cc index 76219b5..b936f2b 100644 --- a/icing/tokenization/reverse_jni/reverse-jni-language-segmenter.cc +++ b/icing/tokenization/reverse_jni/reverse-jni-language-segmenter.cc @@ -291,9 +291,12 @@ class ReverseJniLanguageSegmenterIterator : public LanguageSegmenter::Iterator { return true; } - // Rule 2: for non-ASCII terms, only the alphabetic terms are returned. - // We know it's an alphabetic term by checking the first unicode character. - if (i18n_utils::IsAlphabeticAt(text_, term_start_.utf8_index())) { + UChar32 uchar32 = i18n_utils::GetUChar32At(text_.data(), text_.length(), + term_start_.utf8_index()); + // Rule 2: for non-ASCII terms, only the alphanumeric terms are returned. + // We know it's an alphanumeric term by checking the first unicode + // character. 
+ if (i18n_utils::IsAlphaNumeric(uchar32)) { return true; } return false; diff --git a/icing/tokenization/reverse_jni/reverse-jni-language-segmenter_test.cc b/icing/tokenization/reverse_jni/reverse-jni-language-segmenter_test.cc index b1a8f72..45d6475 100644 --- a/icing/tokenization/reverse_jni/reverse-jni-language-segmenter_test.cc +++ b/icing/tokenization/reverse_jni/reverse-jni-language-segmenter_test.cc @@ -366,6 +366,17 @@ TEST_P(ReverseJniLanguageSegmenterTest, Number) { IsOkAndHolds(ElementsAre("-", "123"))); } +TEST_P(ReverseJniLanguageSegmenterTest, FullWidthNumbers) { + ICING_ASSERT_OK_AND_ASSIGN( + auto language_segmenter, + language_segmenter_factory::Create( + GetSegmenterOptions(GetLocale(), jni_cache_.get()))); + + EXPECT_THAT(language_segmenter->GetAllTerms("0123456789"), + IsOkAndHolds(ElementsAre("0", "1", "2", "3", "4", "5", "6", + "7", "8", "9"))); +} + TEST_P(ReverseJniLanguageSegmenterTest, ContinuousWhitespaces) { ICING_ASSERT_OK_AND_ASSIGN( auto language_segmenter, diff --git a/icing/transform/icu/icu-normalizer-factory.cc b/icing/transform/icu/icu-normalizer-factory.cc index 9951325..493aeb5 100644 --- a/icing/transform/icu/icu-normalizer-factory.cc +++ b/icing/transform/icu/icu-normalizer-factory.cc @@ -44,8 +44,6 @@ libtextclassifier3::StatusOr<std::unique_ptr<Normalizer>> Create( return IcuNormalizer::Create(max_term_byte_size); } -std::string_view GetNormalizerName() { return IcuNormalizer::kName; } - } // namespace normalizer_factory } // namespace lib diff --git a/icing/transform/icu/icu-normalizer.cc b/icing/transform/icu/icu-normalizer.cc index eb0eead..250d6cf 100644 --- a/icing/transform/icu/icu-normalizer.cc +++ b/icing/transform/icu/icu-normalizer.cc @@ -29,6 +29,7 @@ #include "icing/util/status-macros.h" #include "unicode/umachine.h" #include "unicode/unorm2.h" +#include "unicode/ustring.h" #include "unicode/utrans.h" namespace icing { @@ -157,14 +158,18 @@ std::string IcuNormalizer::NormalizeLatin(const UNormalizer2* 
normalizer2, const std::string_view term) const { std::string result; result.reserve(term.length()); - for (int i = 0; i < term.length(); i++) { - if (i18n_utils::IsAscii(term[i])) { - result.push_back(std::tolower(term[i])); - } else if (i18n_utils::IsLeadUtf8Byte(term[i])) { - UChar32 uchar32 = i18n_utils::GetUChar32At(term.data(), term.length(), i); + int current_pos = 0; + while (current_pos < term.length()) { + if (i18n_utils::IsAscii(term[current_pos])) { + result.push_back(std::tolower(term[current_pos])); + ++current_pos; + } else { + UChar32 uchar32 = + i18n_utils::GetUChar32At(term.data(), term.length(), current_pos); if (uchar32 == i18n_utils::kInvalidUChar32) { ICING_LOG(WARNING) << "Unable to get uchar32 from " << term - << " at position" << i; + << " at position" << current_pos; + current_pos += i18n_utils::GetUtf8Length(uchar32); continue; } char ascii_char; @@ -177,8 +182,9 @@ std::string IcuNormalizer::NormalizeLatin(const UNormalizer2* normalizer2, // tokenized. We handle it here in case there're something wrong with // the tokenizers. 
int utf8_length = i18n_utils::GetUtf8Length(uchar32); - absl_ports::StrAppend(&result, term.substr(i, utf8_length)); + absl_ports::StrAppend(&result, term.substr(current_pos, utf8_length)); } + current_pos += i18n_utils::GetUtf8Length(uchar32); } } @@ -261,5 +267,103 @@ std::string IcuNormalizer::TermTransformer::Transform( return std::move(utf8_term_or).ValueOrDie(); } +CharacterIterator FindNormalizedLatinMatchEndPosition( + const UNormalizer2* normalizer2, std::string_view term, + CharacterIterator char_itr, std::string_view normalized_term) { + CharacterIterator normalized_char_itr(normalized_term); + char ascii_char; + while (char_itr.utf8_index() < term.length() && + normalized_char_itr.utf8_index() < normalized_term.length()) { + UChar32 c = char_itr.GetCurrentChar(); + if (i18n_utils::IsAscii(c)) { + c = std::tolower(c); + } else if (DiacriticCharToAscii(normalizer2, c, &ascii_char)) { + c = ascii_char; + } + UChar32 normalized_c = normalized_char_itr.GetCurrentChar(); + if (c != normalized_c) { + return char_itr; + } + char_itr.AdvanceToUtf32(char_itr.utf32_index() + 1); + normalized_char_itr.AdvanceToUtf32(normalized_char_itr.utf32_index() + 1); + } + return char_itr; +} + +CharacterIterator +IcuNormalizer::TermTransformer::FindNormalizedNonLatinMatchEndPosition( + std::string_view term, CharacterIterator char_itr, + std::string_view normalized_term) const { + CharacterIterator normalized_char_itr(normalized_term); + UErrorCode status = U_ZERO_ERROR; + + constexpr int kUtf16CharBufferLength = 6; + UChar c16[kUtf16CharBufferLength]; + int32_t c16_length; + int32_t limit; + + constexpr int kUtf32CharBufferLength = 3; + UChar32 normalized_buffer[kUtf32CharBufferLength]; + int32_t c32_length; + while (char_itr.utf8_index() < term.length() && + normalized_char_itr.utf8_index() < normalized_term.length()) { + UChar32 c = char_itr.GetCurrentChar(); + u_strFromUTF32(c16, kUtf16CharBufferLength, &c16_length, &c, + /*srcLength=*/1, &status); + if 
(U_FAILURE(status)) { + break; + } + + limit = c16_length; + utrans_transUChars(u_transliterator_, c16, &c16_length, + kUtf16CharBufferLength, + /*start=*/0, &limit, &status); + if (U_FAILURE(status)) { + break; + } + + u_strToUTF32(normalized_buffer, kUtf32CharBufferLength, &c32_length, c16, + c16_length, &status); + if (U_FAILURE(status)) { + break; + } + + for (int i = 0; i < c32_length; ++i) { + UChar32 normalized_c = normalized_char_itr.GetCurrentChar(); + if (normalized_buffer[i] != normalized_c) { + return char_itr; + } + normalized_char_itr.AdvanceToUtf32(normalized_char_itr.utf32_index() + 1); + } + char_itr.AdvanceToUtf32(char_itr.utf32_index() + 1); + } + if (U_FAILURE(status)) { + // Failed to transform, return its original form. + ICING_LOG(WARNING) << "Failed to normalize UTF8 term: " << term; + } + return char_itr; +} + +CharacterIterator IcuNormalizer::FindNormalizedMatchEndPosition( + std::string_view term, std::string_view normalized_term) const { + UErrorCode status = U_ZERO_ERROR; + // ICU manages the singleton instance + const UNormalizer2* normalizer2 = unorm2_getNFCInstance(&status); + if (U_FAILURE(status)) { + ICING_LOG(WARNING) << "Failed to create a UNormalizer2 instance"; + } + + CharacterIterator char_itr(term); + UChar32 first_uchar32 = char_itr.GetCurrentChar(); + if (normalizer2 != nullptr && first_uchar32 != i18n_utils::kInvalidUChar32 && + DiacriticCharToAscii(normalizer2, first_uchar32, /*char_out=*/nullptr)) { + return FindNormalizedLatinMatchEndPosition(normalizer2, term, char_itr, + normalized_term); + } else { + return term_transformer_->FindNormalizedNonLatinMatchEndPosition( + term, char_itr, normalized_term); + } +} + } // namespace lib } // namespace icing diff --git a/icing/transform/icu/icu-normalizer.h b/icing/transform/icu/icu-normalizer.h index 4442f3b..d4f1ebd 100644 --- a/icing/transform/icu/icu-normalizer.h +++ b/icing/transform/icu/icu-normalizer.h @@ -21,6 +21,7 @@ #include 
"icing/text_classifier/lib3/utils/base/statusor.h" #include "icing/transform/normalizer.h" +#include "icing/util/character-iterator.h" #include "unicode/unorm2.h" #include "unicode/utrans.h" @@ -39,8 +40,6 @@ namespace lib { // details. class IcuNormalizer : public Normalizer { public: - static constexpr std::string_view kName = "IcuNormalizer"; - // Creates a normalizer with the subcomponents it needs. max_term_byte_size // enforces the max size of text after normalization, text will be truncated // if exceeds the max size. @@ -58,6 +57,17 @@ class IcuNormalizer : public Normalizer { // result in the non-Latin characters not properly being normalized std::string NormalizeTerm(std::string_view term) const override; + // Returns a CharacterIterator pointing to one past the end of the segment of + // term that (once normalized) matches with normalized_term. + // + // Ex. FindNormalizedMatchEndPosition("YELLOW", "yell") will return + // CharacterIterator(u8:4, u16:4, u32:4). + // + // Ex. FindNormalizedMatchEndPosition("YELLOW", "red") will return + // CharacterIterator(u8:0, u16:0, u32:0). + CharacterIterator FindNormalizedMatchEndPosition( + std::string_view term, std::string_view normalized_term) const override; + private: // A handler class that helps manage the lifecycle of UTransliterator. It's // used in IcuNormalizer to transform terms into the formats we need. @@ -77,6 +87,12 @@ class IcuNormalizer : public Normalizer { // Transforms the text based on our rules described at top of this file std::string Transform(std::string_view term) const; + // Returns a CharacterIterator pointing to one past the end of the segment + // of a non-latin term that (once normalized) matches with normalized_term. 
+ CharacterIterator FindNormalizedNonLatinMatchEndPosition( + std::string_view term, CharacterIterator char_itr, + std::string_view normalized_term) const; + private: explicit TermTransformer(UTransliterator* u_transliterator); diff --git a/icing/transform/icu/icu-normalizer_benchmark.cc b/icing/transform/icu/icu-normalizer_benchmark.cc index b037538..8d09be2 100644 --- a/icing/transform/icu/icu-normalizer_benchmark.cc +++ b/icing/transform/icu/icu-normalizer_benchmark.cc @@ -161,6 +161,124 @@ BENCHMARK(BM_NormalizeHiragana) ->Arg(2048000) ->Arg(4096000); +void BM_UppercaseSubTokenLength(benchmark::State& state) { + bool run_via_adb = absl::GetFlag(FLAGS_adb); + if (!run_via_adb) { + ICING_ASSERT_OK(icu_data_file_helper::SetUpICUDataFile( + GetTestFilePath("icing/icu.dat"))); + } + + ICING_ASSERT_OK_AND_ASSIGN( + std::unique_ptr<Normalizer> normalizer, + normalizer_factory::Create( + + /*max_term_byte_size=*/std::numeric_limits<int>::max())); + + std::string input_string(state.range(0), 'A'); + std::string normalized_input_string(state.range(0), 'a'); + for (auto _ : state) { + normalizer->FindNormalizedMatchEndPosition(input_string, + normalized_input_string); + } +} +BENCHMARK(BM_UppercaseSubTokenLength) + ->Arg(1000) + ->Arg(2000) + ->Arg(4000) + ->Arg(8000) + ->Arg(16000) + ->Arg(32000) + ->Arg(64000) + ->Arg(128000) + ->Arg(256000) + ->Arg(384000) + ->Arg(512000) + ->Arg(1024000) + ->Arg(2048000) + ->Arg(4096000); + +void BM_AccentSubTokenLength(benchmark::State& state) { + bool run_via_adb = absl::GetFlag(FLAGS_adb); + if (!run_via_adb) { + ICING_ASSERT_OK(icu_data_file_helper::SetUpICUDataFile( + GetTestFilePath("icing/icu.dat"))); + } + + ICING_ASSERT_OK_AND_ASSIGN( + std::unique_ptr<Normalizer> normalizer, + normalizer_factory::Create( + + /*max_term_byte_size=*/std::numeric_limits<int>::max())); + + std::string input_string; + std::string normalized_input_string; + while (input_string.length() < state.range(0)) { + input_string.append("àáâãā"); + 
normalized_input_string.append("aaaaa"); + } + + for (auto _ : state) { + normalizer->FindNormalizedMatchEndPosition(input_string, + normalized_input_string); + } +} +BENCHMARK(BM_AccentSubTokenLength) + ->Arg(1000) + ->Arg(2000) + ->Arg(4000) + ->Arg(8000) + ->Arg(16000) + ->Arg(32000) + ->Arg(64000) + ->Arg(128000) + ->Arg(256000) + ->Arg(384000) + ->Arg(512000) + ->Arg(1024000) + ->Arg(2048000) + ->Arg(4096000); + +void BM_HiraganaSubTokenLength(benchmark::State& state) { + bool run_via_adb = absl::GetFlag(FLAGS_adb); + if (!run_via_adb) { + ICING_ASSERT_OK(icu_data_file_helper::SetUpICUDataFile( + GetTestFilePath("icing/icu.dat"))); + } + + ICING_ASSERT_OK_AND_ASSIGN( + std::unique_ptr<Normalizer> normalizer, + normalizer_factory::Create( + + /*max_term_byte_size=*/std::numeric_limits<int>::max())); + + std::string input_string; + std::string normalized_input_string; + while (input_string.length() < state.range(0)) { + input_string.append("あいうえお"); + normalized_input_string.append("アイウエオ"); + } + + for (auto _ : state) { + normalizer->FindNormalizedMatchEndPosition(input_string, + normalized_input_string); + } +} +BENCHMARK(BM_HiraganaSubTokenLength) + ->Arg(1000) + ->Arg(2000) + ->Arg(4000) + ->Arg(8000) + ->Arg(16000) + ->Arg(32000) + ->Arg(64000) + ->Arg(128000) + ->Arg(256000) + ->Arg(384000) + ->Arg(512000) + ->Arg(1024000) + ->Arg(2048000) + ->Arg(4096000); + } // namespace } // namespace lib diff --git a/icing/transform/icu/icu-normalizer_test.cc b/icing/transform/icu/icu-normalizer_test.cc index f5d20ff..a46fcc7 100644 --- a/icing/transform/icu/icu-normalizer_test.cc +++ b/icing/transform/icu/icu-normalizer_test.cc @@ -231,6 +231,104 @@ TEST_F(IcuNormalizerTest, Truncate) { } } +TEST_F(IcuNormalizerTest, PrefixMatchLength) { + // Verify that FindNormalizedMatchEndPosition will properly find the length of + // the prefix match when given a non-normalized term and a normalized term + // is a prefix of the non-normalized one. 
+ ICING_ASSERT_OK_AND_ASSIGN(auto normalizer, normalizer_factory::Create( + /*max_term_byte_size=*/1000)); + + // Upper to lower + std::string term = "MDI"; + CharacterIterator match_end = + normalizer->FindNormalizedMatchEndPosition(term, "md"); + EXPECT_THAT(term.substr(0, match_end.utf8_index()), Eq("MD")); + + term = "Icing"; + match_end = normalizer->FindNormalizedMatchEndPosition(term, "icin"); + EXPECT_THAT(term.substr(0, match_end.utf8_index()), Eq("Icin")); + + // Full-width + term = "525600"; + match_end = normalizer->FindNormalizedMatchEndPosition(term, "525"); + EXPECT_THAT(term.substr(0, match_end.utf8_index()), Eq("525")); + + term = "FULLWIDTH"; + match_end = normalizer->FindNormalizedMatchEndPosition(term, "full"); + EXPECT_THAT(term.substr(0, match_end.utf8_index()), Eq("FULL")); + + // Hiragana to Katakana + term = "あいうえお"; + match_end = normalizer->FindNormalizedMatchEndPosition(term, "アイ"); + EXPECT_THAT(term.substr(0, match_end.utf8_index()), Eq("あい")); + + term = "かきくけこ"; + match_end = normalizer->FindNormalizedMatchEndPosition(term, "カ"); + EXPECT_THAT(term.substr(0, match_end.utf8_index()), Eq("か")); + + // Latin accents + term = "Zürich"; + match_end = normalizer->FindNormalizedMatchEndPosition(term, "zur"); + EXPECT_THAT(term.substr(0, match_end.utf8_index()), Eq("Zür")); + + term = "après-midi"; + match_end = normalizer->FindNormalizedMatchEndPosition(term, "apre"); + EXPECT_THAT(term.substr(0, match_end.utf8_index()), Eq("aprè")); + + term = "Buenos días"; + match_end = normalizer->FindNormalizedMatchEndPosition(term, "buenos di"); + EXPECT_THAT(term.substr(0, match_end.utf8_index()), Eq("Buenos dí")); +} + +TEST_F(IcuNormalizerTest, SharedPrefixMatchLength) { + // Verify that FindNormalizedMatchEndPosition will properly find the length of + // the prefix match when given a non-normalized term and a normalized term + // that share a common prefix. 
+ ICING_ASSERT_OK_AND_ASSIGN(auto normalizer, normalizer_factory::Create( + /*max_term_byte_size=*/1000)); + + // Upper to lower + std::string term = "MDI"; + CharacterIterator match_end = + normalizer->FindNormalizedMatchEndPosition(term, "mgm"); + EXPECT_THAT(term.substr(0, match_end.utf8_index()), Eq("M")); + + term = "Icing"; + match_end = normalizer->FindNormalizedMatchEndPosition(term, "icky"); + EXPECT_THAT(term.substr(0, match_end.utf8_index()), Eq("Ic")); + + // Full-width + term = "525600"; + match_end = normalizer->FindNormalizedMatchEndPosition(term, "525788"); + EXPECT_THAT(term.substr(0, match_end.utf8_index()), Eq("525")); + + term = "FULLWIDTH"; + match_end = normalizer->FindNormalizedMatchEndPosition(term, "fully"); + EXPECT_THAT(term.substr(0, match_end.utf8_index()), Eq("FULL")); + + // Hiragana to Katakana + term = "あいうえお"; + match_end = normalizer->FindNormalizedMatchEndPosition(term, "アイエオ"); + EXPECT_THAT(term.substr(0, match_end.utf8_index()), Eq("あい")); + + term = "かきくけこ"; + match_end = normalizer->FindNormalizedMatchEndPosition(term, "カケコ"); + EXPECT_THAT(term.substr(0, match_end.utf8_index()), Eq("か")); + + // Latin accents + term = "Zürich"; + match_end = normalizer->FindNormalizedMatchEndPosition(term, "zurg"); + EXPECT_THAT(term.substr(0, match_end.utf8_index()), Eq("Zür")); + + term = "après-midi"; + match_end = normalizer->FindNormalizedMatchEndPosition(term, "apreciate"); + EXPECT_THAT(term.substr(0, match_end.utf8_index()), Eq("aprè")); + + term = "días"; + match_end = normalizer->FindNormalizedMatchEndPosition(term, "diamond"); + EXPECT_THAT(term.substr(0, match_end.utf8_index()), Eq("día")); +} + } // namespace } // namespace lib } // namespace icing diff --git a/icing/transform/map/map-normalizer-factory.cc b/icing/transform/map/map-normalizer-factory.cc index 286b8f6..3bf84b3 100644 --- a/icing/transform/map/map-normalizer-factory.cc +++ b/icing/transform/map/map-normalizer-factory.cc @@ -42,8 +42,6 @@ 
libtextclassifier3::StatusOr<std::unique_ptr<Normalizer>> Create( return std::make_unique<MapNormalizer>(max_term_byte_size); } -std::string_view GetNormalizerName() { return MapNormalizer::kName; } - } // namespace normalizer_factory } // namespace lib diff --git a/icing/transform/map/map-normalizer.cc b/icing/transform/map/map-normalizer.cc index 4ad5dec..95aa633 100644 --- a/icing/transform/map/map-normalizer.cc +++ b/icing/transform/map/map-normalizer.cc @@ -42,10 +42,16 @@ UChar32 NormalizeChar(UChar32 c) { } // The original character can be encoded into a single char16_t. - const std::unordered_map<char16_t, char16_t>& normalization_map = + const std::unordered_map<char16_t, char16_t>* normalization_map = GetNormalizationMap(); - auto iterator = normalization_map.find(static_cast<char16_t>(c)); - if (iterator == normalization_map.end()) { + if (normalization_map == nullptr) { + // Normalization map couldn't be properly initialized, append the original + // character. + ICING_LOG(WARNING) << "Unable to get a valid pointer to normalization map!"; + return c; + } + auto iterator = normalization_map->find(static_cast<char16_t>(c)); + if (iterator == normalization_map->end()) { // Normalization mapping not found, append the original character. 
return c; } @@ -99,7 +105,7 @@ std::string MapNormalizer::NormalizeTerm(std::string_view term) const { return normalized_text; } -CharacterIterator MapNormalizer::CalculateNormalizedMatchLength( +CharacterIterator MapNormalizer::FindNormalizedMatchEndPosition( std::string_view term, std::string_view normalized_term) const { CharacterIterator char_itr(term); CharacterIterator normalized_char_itr(normalized_term); diff --git a/icing/transform/map/map-normalizer.h b/icing/transform/map/map-normalizer.h index 8fbe83b..ed996ae 100644 --- a/icing/transform/map/map-normalizer.h +++ b/icing/transform/map/map-normalizer.h @@ -26,8 +26,6 @@ namespace lib { class MapNormalizer : public Normalizer { public: - static constexpr std::string_view kName = "MapNormalizer"; - explicit MapNormalizer(int max_term_byte_size) : max_term_byte_size_(max_term_byte_size){}; @@ -45,12 +43,12 @@ class MapNormalizer : public Normalizer { // Returns a CharacterIterator pointing to one past the end of the segment of // term that (once normalized) matches with normalized_term. // - // Ex. CalculateNormalizedMatchLength("YELLOW", "yell") will return + // Ex. FindNormalizedMatchEndPosition("YELLOW", "yell") will return // CharacterIterator(u8:4, u16:4, u32:4). // - // Ex. CalculateNormalizedMatchLength("YELLOW", "red") will return + // Ex. FindNormalizedMatchEndPosition("YELLOW", "red") will return // CharacterIterator(u8:0, u16:0, u32:0). 
- CharacterIterator CalculateNormalizedMatchLength( + CharacterIterator FindNormalizedMatchEndPosition( std::string_view term, std::string_view normalized_term) const override; private: diff --git a/icing/transform/map/map-normalizer_benchmark.cc b/icing/transform/map/map-normalizer_benchmark.cc index 691afc6..8268541 100644 --- a/icing/transform/map/map-normalizer_benchmark.cc +++ b/icing/transform/map/map-normalizer_benchmark.cc @@ -143,6 +143,104 @@ BENCHMARK(BM_NormalizeHiragana) ->Arg(2048000) ->Arg(4096000); +void BM_UppercaseSubTokenLength(benchmark::State& state) { + ICING_ASSERT_OK_AND_ASSIGN( + std::unique_ptr<Normalizer> normalizer, + normalizer_factory::Create( + + /*max_term_byte_size=*/std::numeric_limits<int>::max())); + + std::string input_string(state.range(0), 'A'); + std::string normalized_input_string(state.range(0), 'a'); + for (auto _ : state) { + normalizer->FindNormalizedMatchEndPosition(input_string, + normalized_input_string); + } +} +BENCHMARK(BM_UppercaseSubTokenLength) + ->Arg(1000) + ->Arg(2000) + ->Arg(4000) + ->Arg(8000) + ->Arg(16000) + ->Arg(32000) + ->Arg(64000) + ->Arg(128000) + ->Arg(256000) + ->Arg(384000) + ->Arg(512000) + ->Arg(1024000) + ->Arg(2048000) + ->Arg(4096000); + +void BM_AccentSubTokenLength(benchmark::State& state) { + ICING_ASSERT_OK_AND_ASSIGN( + std::unique_ptr<Normalizer> normalizer, + normalizer_factory::Create( + /*max_term_byte_size=*/std::numeric_limits<int>::max())); + + std::string input_string; + std::string normalized_input_string; + while (input_string.length() < state.range(0)) { + input_string.append("àáâãā"); + normalized_input_string.append("aaaaa"); + } + + for (auto _ : state) { + normalizer->FindNormalizedMatchEndPosition(input_string, + normalized_input_string); + } +} +BENCHMARK(BM_AccentSubTokenLength) + ->Arg(1000) + ->Arg(2000) + ->Arg(4000) + ->Arg(8000) + ->Arg(16000) + ->Arg(32000) + ->Arg(64000) + ->Arg(128000) + ->Arg(256000) + ->Arg(384000) + ->Arg(512000) + ->Arg(1024000) + 
->Arg(2048000) + ->Arg(4096000); + +void BM_HiraganaSubTokenLength(benchmark::State& state) { + ICING_ASSERT_OK_AND_ASSIGN( + std::unique_ptr<Normalizer> normalizer, + normalizer_factory::Create( + /*max_term_byte_size=*/std::numeric_limits<int>::max())); + + std::string input_string; + std::string normalized_input_string; + while (input_string.length() < state.range(0)) { + input_string.append("あいうえお"); + normalized_input_string.append("アイウエオ"); + } + + for (auto _ : state) { + normalizer->FindNormalizedMatchEndPosition(input_string, + normalized_input_string); + } +} +BENCHMARK(BM_HiraganaSubTokenLength) + ->Arg(1000) + ->Arg(2000) + ->Arg(4000) + ->Arg(8000) + ->Arg(16000) + ->Arg(32000) + ->Arg(64000) + ->Arg(128000) + ->Arg(256000) + ->Arg(384000) + ->Arg(512000) + ->Arg(1024000) + ->Arg(2048000) + ->Arg(4096000); + } // namespace } // namespace lib diff --git a/icing/transform/map/map-normalizer_test.cc b/icing/transform/map/map-normalizer_test.cc index 26fdd4a..adc5623 100644 --- a/icing/transform/map/map-normalizer_test.cc +++ b/icing/transform/map/map-normalizer_test.cc @@ -201,51 +201,103 @@ TEST(MapNormalizerTest, Truncate) { } TEST(MapNormalizerTest, PrefixMatchLength) { + // Verify that FindNormalizedMatchEndPosition will properly find the length of + // the prefix match when given a non-normalized term and a normalized term + // is a prefix of the non-normalized one. 
ICING_ASSERT_OK_AND_ASSIGN(auto normalizer, normalizer_factory::Create( /*max_term_byte_size=*/1000)); // Upper to lower std::string term = "MDI"; CharacterIterator match_end = - normalizer->CalculateNormalizedMatchLength(term, "md"); + normalizer->FindNormalizedMatchEndPosition(term, "md"); EXPECT_THAT(term.substr(0, match_end.utf8_index()), Eq("MD")); term = "Icing"; - match_end = normalizer->CalculateNormalizedMatchLength(term, "icin"); + match_end = normalizer->FindNormalizedMatchEndPosition(term, "icin"); EXPECT_THAT(term.substr(0, match_end.utf8_index()), Eq("Icin")); // Full-width term = "525600"; - match_end = normalizer->CalculateNormalizedMatchLength(term, "525"); + match_end = normalizer->FindNormalizedMatchEndPosition(term, "525"); EXPECT_THAT(term.substr(0, match_end.utf8_index()), Eq("525")); term = "FULLWIDTH"; - match_end = normalizer->CalculateNormalizedMatchLength(term, "full"); + match_end = normalizer->FindNormalizedMatchEndPosition(term, "full"); EXPECT_THAT(term.substr(0, match_end.utf8_index()), Eq("FULL")); // Hiragana to Katakana term = "あいうえお"; - match_end = normalizer->CalculateNormalizedMatchLength(term, "アイ"); + match_end = normalizer->FindNormalizedMatchEndPosition(term, "アイ"); EXPECT_THAT(term.substr(0, match_end.utf8_index()), Eq("あい")); term = "かきくけこ"; - match_end = normalizer->CalculateNormalizedMatchLength(term, "カ"); + match_end = normalizer->FindNormalizedMatchEndPosition(term, "カ"); EXPECT_THAT(term.substr(0, match_end.utf8_index()), Eq("か")); // Latin accents term = "Zürich"; - match_end = normalizer->CalculateNormalizedMatchLength(term, "zur"); + match_end = normalizer->FindNormalizedMatchEndPosition(term, "zur"); EXPECT_THAT(term.substr(0, match_end.utf8_index()), Eq("Zür")); term = "après-midi"; - match_end = normalizer->CalculateNormalizedMatchLength(term, "apre"); + match_end = normalizer->FindNormalizedMatchEndPosition(term, "apre"); EXPECT_THAT(term.substr(0, match_end.utf8_index()), Eq("aprè")); term = "Buenos días"; - 
match_end = normalizer->CalculateNormalizedMatchLength(term, "buenos di"); + match_end = normalizer->FindNormalizedMatchEndPosition(term, "buenos di"); EXPECT_THAT(term.substr(0, match_end.utf8_index()), Eq("Buenos dí")); } +TEST(MapNormalizerTest, SharedPrefixMatchLength) { + // Verify that FindNormalizedMatchEndPosition will properly find the length of + // the prefix match when given a non-normalized term and a normalized term + // that share a common prefix. + ICING_ASSERT_OK_AND_ASSIGN(auto normalizer, normalizer_factory::Create( + /*max_term_byte_size=*/1000)); + + // Upper to lower + std::string term = "MDI"; + CharacterIterator match_end = + normalizer->FindNormalizedMatchEndPosition(term, "mgm"); + EXPECT_THAT(term.substr(0, match_end.utf8_index()), Eq("M")); + + term = "Icing"; + match_end = normalizer->FindNormalizedMatchEndPosition(term, "icky"); + EXPECT_THAT(term.substr(0, match_end.utf8_index()), Eq("Ic")); + + // Full-width + term = "525600"; + match_end = normalizer->FindNormalizedMatchEndPosition(term, "525788"); + EXPECT_THAT(term.substr(0, match_end.utf8_index()), Eq("525")); + + term = "FULLWIDTH"; + match_end = normalizer->FindNormalizedMatchEndPosition(term, "fully"); + EXPECT_THAT(term.substr(0, match_end.utf8_index()), Eq("FULL")); + + // Hiragana to Katakana + term = "あいうえお"; + match_end = normalizer->FindNormalizedMatchEndPosition(term, "アイエオ"); + EXPECT_THAT(term.substr(0, match_end.utf8_index()), Eq("あい")); + + term = "かきくけこ"; + match_end = normalizer->FindNormalizedMatchEndPosition(term, "カケコ"); + EXPECT_THAT(term.substr(0, match_end.utf8_index()), Eq("か")); + + // Latin accents + term = "Zürich"; + match_end = normalizer->FindNormalizedMatchEndPosition(term, "zurg"); + EXPECT_THAT(term.substr(0, match_end.utf8_index()), Eq("Zür")); + + term = "après-midi"; + match_end = normalizer->FindNormalizedMatchEndPosition(term, "apreciate"); + EXPECT_THAT(term.substr(0, match_end.utf8_index()), Eq("aprè")); + + term = "días"; + match_end = 
normalizer->FindNormalizedMatchEndPosition(term, "diamond"); + EXPECT_THAT(term.substr(0, match_end.utf8_index()), Eq("día")); +} + } // namespace } // namespace lib diff --git a/icing/transform/map/normalization-map.cc b/icing/transform/map/normalization-map.cc index c318036..0994ab8 100644 --- a/icing/transform/map/normalization-map.cc +++ b/icing/transform/map/normalization-map.cc @@ -691,19 +691,21 @@ constexpr NormalizationPair kNormalizationMappings[] = { } // namespace -const std::unordered_map<char16_t, char16_t>& GetNormalizationMap() { +const std::unordered_map<char16_t, char16_t> *GetNormalizationMap() { // The map is allocated dynamically the first time this function is executed. - static const std::unordered_map<char16_t, char16_t> normalization_map = [] { - std::unordered_map<char16_t, char16_t> map; - // Size of all the mappings is about 2.5 KiB. - constexpr int numMappings = - sizeof(kNormalizationMappings) / sizeof(NormalizationPair); - map.reserve(numMappings); - for (size_t i = 0; i < numMappings; ++i) { - map.emplace(kNormalizationMappings[i].from, kNormalizationMappings[i].to); - } - return map; - }(); + static const std::unordered_map<char16_t, char16_t> *const normalization_map = + [] { + auto *map = new std::unordered_map<char16_t, char16_t>(); + // Size of all the mappings is about 2.5 KiB. + constexpr int numMappings = + sizeof(kNormalizationMappings) / sizeof(NormalizationPair); + map->reserve(numMappings); + for (size_t i = 0; i < numMappings; ++i) { + map->emplace(kNormalizationMappings[i].from, + kNormalizationMappings[i].to); + } + return map; + }(); return normalization_map; } diff --git a/icing/transform/map/normalization-map.h b/icing/transform/map/normalization-map.h index aea85bd..ac7872b 100644 --- a/icing/transform/map/normalization-map.h +++ b/icing/transform/map/normalization-map.h @@ -23,7 +23,7 @@ namespace lib { // Returns a map containing normalization mappings. 
A mapping (A -> B) means // that we'll transform every character 'A' into 'B'. See normalization-map.cc // for mapping details. -const std::unordered_map<char16_t, char16_t>& GetNormalizationMap(); +const std::unordered_map<char16_t, char16_t>* GetNormalizationMap(); } // namespace lib } // namespace icing diff --git a/icing/transform/normalizer-factory.h b/icing/transform/normalizer-factory.h index 1db9915..f1f3f62 100644 --- a/icing/transform/normalizer-factory.h +++ b/icing/transform/normalizer-factory.h @@ -36,9 +36,6 @@ namespace normalizer_factory { libtextclassifier3::StatusOr<std::unique_ptr<Normalizer>> Create( int max_term_byte_size); -// Returns the name of the normalizer being used. -std::string_view GetNormalizerName(); - } // namespace normalizer_factory } // namespace lib diff --git a/icing/transform/normalizer.h b/icing/transform/normalizer.h index 7305c46..2110f0f 100644 --- a/icing/transform/normalizer.h +++ b/icing/transform/normalizer.h @@ -44,17 +44,13 @@ class Normalizer { // Returns a CharacterIterator pointing to one past the end of the segment of // term that (once normalized) matches with normalized_term. // - // Ex. CalculateNormalizedMatchLength("YELLOW", "yell") will return + // Ex. FindNormalizedMatchEndPosition("YELLOW", "yell") will return // CharacterIterator(u8:4, u16:4, u32:4). // - // Ex. CalculateNormalizedMatchLength("YELLOW", "red") will return + // Ex. FindNormalizedMatchEndPosition("YELLOW", "red") will return // CharacterIterator(u8:0, u16:0, u32:0). - virtual CharacterIterator CalculateNormalizedMatchLength( - std::string_view term, std::string_view normalized_term) const { - // TODO(b/195720764) Remove this default impl and implement in all - // subclasses. 
- return CharacterIterator(term, 0, 0, 0); - } + virtual CharacterIterator FindNormalizedMatchEndPosition( + std::string_view term, std::string_view normalized_term) const = 0; }; } // namespace lib diff --git a/icing/util/i18n-utils.cc b/icing/util/i18n-utils.cc index cd0a227..ec327ad 100644 --- a/icing/util/i18n-utils.cc +++ b/icing/util/i18n-utils.cc @@ -116,6 +116,8 @@ bool IsAscii(char c) { return U8_IS_SINGLE((uint8_t)c); } bool IsAscii(UChar32 c) { return U8_LENGTH(c) == 1; } +bool IsAlphaNumeric(UChar32 c) { return u_isalnum(c); } + int GetUtf8Length(UChar32 c) { return U8_LENGTH(c); } int GetUtf16Length(UChar32 c) { return U16_LENGTH(c); } diff --git a/icing/util/i18n-utils.h b/icing/util/i18n-utils.h index 82ae828..491df6b 100644 --- a/icing/util/i18n-utils.h +++ b/icing/util/i18n-utils.h @@ -67,6 +67,9 @@ bool IsAscii(char c); // Checks if the Unicode char is within ASCII range. bool IsAscii(UChar32 c); +// Checks if the Unicode char is alphanumeric. +bool IsAlphaNumeric(UChar32 c); + // Returns how many code units (char) are used for the UTF-8 encoding of this // Unicode character. Returns 0 if not valid. int GetUtf8Length(UChar32 c); diff --git a/synced_AOSP_CL_number.txt b/synced_AOSP_CL_number.txt index f0c066f..d57de81 100644 --- a/synced_AOSP_CL_number.txt +++ b/synced_AOSP_CL_number.txt @@ -1 +1 @@ -set(synced_AOSP_CL_number=390638574) +set(synced_AOSP_CL_number=395331611) |