about summary refs log tree commit diff
diff options
context:
space:
mode:
author My Name <dsaadati@google.com> 2021-09-09 11:11:03 -0700
committer Dan Saadati <dsaadati@google.com> 2021-09-09 11:27:04 -0700
commit 34fc8c85b9f690ffd0a095a4bbcac9aaacfa387b (patch)
tree 0b9bbc9ca107551327b396e7c355b65c4baa43ef
parent 14ee9a8eb8f3ed47f68117208626045878c943ac (diff)
parent 39f59853b980d94a55e9b0f76185b0d3fff88455 (diff)
download icing-34fc8c85b9f690ffd0a095a4bbcac9aaacfa387b.tar.gz
Merge remote-tracking branch 'aosp/upstream-master' into androidx-main
* aosp/upstream-master:
  Sync from upstream.

Descriptions:
================
Remove no-longer-used write paths for file-backed-proto-log.
================
Modify segmentation rules to consider any segment that begins with a non-Ascii alphanumeric character as valid
=================
Implement CalculateNormalizedMatchLength for IcuNormalizer.
================
Add additional benchmark cases that were useful in developing submatching and CalculateNormalizedMatchLength for IcuNormalizer
=================
Switch NormalizationMap from static const std::unordered_map<char16_t, char16_t>& to static const std::unordered_map<char16_t, char16_t> *const.
==================

Bug: 147509515
Bug: 149610413
Bug: 195720764
Bug: 196257995
Change-Id: I8e8d7a7fcceb8eaae1fdcb45a92ea4399d47f343
-rw-r--r-- icing/file/file-backed-proto-log.h 348
-rw-r--r-- icing/file/file-backed-proto-log_benchmark.cc 251
-rw-r--r-- icing/file/file-backed-proto-log_test.cc 573
-rw-r--r-- icing/icing-search-engine_benchmark.cc 61
-rw-r--r-- icing/result/snippet-retriever.cc 6
-rw-r--r-- icing/result/snippet-retriever_test.cc 112
-rw-r--r-- icing/tokenization/icu/icu-language-segmenter.cc 7
-rw-r--r-- icing/tokenization/icu/icu-language-segmenter_test.cc 9
-rw-r--r-- icing/tokenization/reverse_jni/reverse-jni-language-segmenter.cc 9
-rw-r--r-- icing/tokenization/reverse_jni/reverse-jni-language-segmenter_test.cc 11
-rw-r--r-- icing/transform/icu/icu-normalizer-factory.cc 2
-rw-r--r-- icing/transform/icu/icu-normalizer.cc 118
-rw-r--r-- icing/transform/icu/icu-normalizer.h 20
-rw-r--r-- icing/transform/icu/icu-normalizer_benchmark.cc 118
-rw-r--r-- icing/transform/icu/icu-normalizer_test.cc 98
-rw-r--r-- icing/transform/map/map-normalizer-factory.cc 2
-rw-r--r-- icing/transform/map/map-normalizer.cc 14
-rw-r--r-- icing/transform/map/map-normalizer.h 8
-rw-r--r-- icing/transform/map/map-normalizer_benchmark.cc 98
-rw-r--r-- icing/transform/map/map-normalizer_test.cc 70
-rw-r--r-- icing/transform/map/normalization-map.cc 26
-rw-r--r-- icing/transform/map/normalization-map.h 2
-rw-r--r-- icing/transform/normalizer-factory.h 3
-rw-r--r-- icing/transform/normalizer.h 12
-rw-r--r-- icing/util/i18n-utils.cc 2
-rw-r--r-- icing/util/i18n-utils.h 3
-rw-r--r-- synced_AOSP_CL_number.txt 2
27 files changed, 665 insertions, 1320 deletions
diff --git a/icing/file/file-backed-proto-log.h b/icing/file/file-backed-proto-log.h
index b2b37e8..cf16b4f 100644
--- a/icing/file/file-backed-proto-log.h
+++ b/icing/file/file-backed-proto-log.h
@@ -14,16 +14,14 @@
// File-backed log of protos with append-only writes and position based reads.
//
-// There should only be one instance of a FileBackedProtoLog of the same file at
-// a time; using multiple instances at the same time may lead to undefined
-// behavior.
+// The implementation in this file is deprecated and replaced by
+// portable-file-backed-proto-log.h.
//
-// The entire checksum is computed on initialization to verify the contents are
-// valid. On failure, the log will be truncated to the last verified state when
-// PersistToDisk() was called. If the log cannot successfully restore the last
-// state due to disk corruption or some other inconsistency, then the entire log
-// will be lost.
+// This deprecated implementation has been made read-only for the purposes of
+// migration; writing and erasing this format of log is no longer supported and
+// the methods to accomplish this have been removed.
//
+// The details of this format follow below:
// Each proto written to the file will have a metadata written just before it.
// The metadata consists of
// {
@@ -31,37 +29,16 @@
// 3 bytes of the proto size
// n bytes of the proto itself
// }
-//
-// Example usage:
-// ICING_ASSERT_OK_AND_ASSIGN(auto create_result,
-// FileBackedProtoLog<DocumentProto>::Create(filesystem, file_path_,
-// options));
-// auto proto_log = create_result.proto_log;
-//
-// Document document;
-// document.set_namespace("com.google.android.example");
-// document.set_uri("www.google.com");
-//
-// int64_t document_offset = proto_log->WriteProto(document));
-// Document same_document = proto_log->ReadProto(document_offset));
-// proto_log->PersistToDisk();
-//
// TODO(b/136514769): Add versioning to the header and a UpgradeToVersion
// migration method.
-
#ifndef ICING_FILE_FILE_BACKED_PROTO_LOG_H_
#define ICING_FILE_FILE_BACKED_PROTO_LOG_H_
-#include <cstddef>
#include <cstdint>
-#include <cstring>
#include <memory>
#include <string>
#include <string_view>
-#include <utility>
-#include <vector>
-#include "icing/text_classifier/lib3/utils/base/status.h"
#include "icing/text_classifier/lib3/utils/base/statusor.h"
#include <google/protobuf/io/gzip_stream.h>
#include <google/protobuf/io/zero_copy_stream_impl_lite.h>
@@ -112,10 +89,6 @@ class FileBackedProtoLog {
// Header stored at the beginning of the file before the rest of the log
// contents. Stores metadata on the log.
- //
- // TODO(b/139375388): Migrate the Header struct to a proto. This makes
- // migrations easier since we don't need to worry about different size padding
- // (which would affect the checksum) and different endians.
struct Header {
static constexpr int32_t kMagic = 0xf4c6f67a;
@@ -195,20 +168,6 @@ class FileBackedProtoLog {
FileBackedProtoLog(const FileBackedProtoLog&) = delete;
FileBackedProtoLog& operator=(const FileBackedProtoLog&) = delete;
- // This will update the checksum of the log as well.
- ~FileBackedProtoLog();
-
- // Writes the serialized proto to the underlying file. Writes are applied
- // directly to the underlying file. Users do not need to sync the file after
- // writing.
- //
- // Returns:
- // Offset of the newly appended proto in file on success
- // INVALID_ARGUMENT if proto is too large, as decided by
- // Options.max_proto_size
- // INTERNAL_ERROR on IO error
- libtextclassifier3::StatusOr<int64_t> WriteProto(const ProtoT& proto);
-
// Reads out a proto located at file_offset from the file.
//
// Returns:
@@ -218,31 +177,6 @@ class FileBackedProtoLog {
// INTERNAL_ERROR on IO error
libtextclassifier3::StatusOr<ProtoT> ReadProto(int64_t file_offset) const;
- // Erases the data of a proto located at file_offset from the file.
- //
- // Returns:
- // OK on success
- // OUT_OF_RANGE_ERROR if file_offset exceeds file size
- // INTERNAL_ERROR on IO error
- libtextclassifier3::Status EraseProto(int64_t file_offset);
-
- // Calculates and returns the disk usage in bytes. Rounds up to the nearest
- // block size.
- //
- // Returns:
- // Disk usage on success
- // INTERNAL_ERROR on IO error
- libtextclassifier3::StatusOr<int64_t> GetDiskUsage() const;
-
- // Returns the file size of all the elements held in the log. File size is in
- // bytes. This excludes the size of any internal metadata of the log, e.g. the
- // log's header.
- //
- // Returns:
- // File size on success
- // INTERNAL_ERROR on IO error
- libtextclassifier3::StatusOr<int64_t> GetElementsFileSize() const;
-
// An iterator helping to find offsets of all the protos in file.
// Example usage:
//
@@ -281,72 +215,6 @@ class FileBackedProtoLog {
// behaviors could happen.
Iterator GetIterator();
- // Persists all changes since initialization or the last call to
- // PersistToDisk(). Any changes that aren't persisted may be lost if the
- // system fails to close safely.
- //
- // Example use case:
- //
- // Document document;
- // document.set_namespace("com.google.android.example");
- // document.set_uri("www.google.com");
- //
- // {
- // ICING_ASSERT_OK_AND_ASSIGN(auto create_result,
- // FileBackedProtoLog<DocumentProto>::Create(filesystem, file_path,
- // options));
- // auto proto_log = std::move(create_result.proto_log);
- //
- // int64_t document_offset = proto_log->WriteProto(document));
- //
- // // We lose the document here since it wasn't persisted.
- // // *SYSTEM CRASH*
- // }
- //
- // {
- // // Can still successfully create after a crash since the log can
- // // rewind/truncate to recover into a previously good state
- // ICING_ASSERT_OK_AND_ASSIGN(auto create_result,
- // FileBackedProtoLog<DocumentProto>::Create(filesystem, file_path,
- // options));
- // auto proto_log = std::move(create_result.proto_log);
- //
- // // Lost the proto since we didn't PersistToDisk before the crash
- // proto_log->ReadProto(document_offset)); // INVALID_ARGUMENT error
- //
- // int64_t document_offset = proto_log->WriteProto(document));
- //
- // // Persisted this time, so we should be ok.
- // ICING_ASSERT_OK(proto_log->PersistToDisk());
- // }
- //
- // {
- // ICING_ASSERT_OK_AND_ASSIGN(auto create_result,
- // FileBackedProtoLog<DocumentProto>::Create(filesystem, file_path,
- // options));
- // auto proto_log = std::move(create_result.proto_log);
- //
- // // SUCCESS
- // Document same_document = proto_log->ReadProto(document_offset));
- // }
- //
- // NOTE: Since all protos are already written to the file directly, this
- // just updates the checksum and rewind position. Without these updates,
- // future initializations will truncate the file and discard unpersisted
- // changes.
- //
- // Returns:
- // OK on success
- // INTERNAL_ERROR on IO error
- libtextclassifier3::Status PersistToDisk();
-
- // Calculates the checksum of the log contents. Excludes the header content.
- //
- // Returns:
- // Crc of the log content
- // INTERNAL_ERROR on IO error
- libtextclassifier3::StatusOr<Crc32> ComputeChecksum();
-
private:
// Object can only be instantiated via the ::Create factory.
FileBackedProtoLog(const Filesystem* filesystem, const std::string& file_path,
@@ -451,15 +319,6 @@ FileBackedProtoLog<ProtoT>::FileBackedProtoLog(const Filesystem* filesystem,
}
template <typename ProtoT>
-FileBackedProtoLog<ProtoT>::~FileBackedProtoLog() {
- if (!PersistToDisk().ok()) {
- ICING_LOG(WARNING)
- << "Error persisting to disk during destruction of FileBackedProtoLog: "
- << file_path_;
- }
-}
-
-template <typename ProtoT>
libtextclassifier3::StatusOr<typename FileBackedProtoLog<ProtoT>::CreateResult>
FileBackedProtoLog<ProtoT>::Create(const Filesystem* filesystem,
const std::string& file_path,
@@ -688,79 +547,6 @@ libtextclassifier3::StatusOr<Crc32> FileBackedProtoLog<ProtoT>::ComputeChecksum(
}
template <typename ProtoT>
-libtextclassifier3::StatusOr<int64_t> FileBackedProtoLog<ProtoT>::WriteProto(
- const ProtoT& proto) {
- int64_t proto_size = proto.ByteSizeLong();
- int32_t metadata;
- int metadata_size = sizeof(metadata);
- int64_t current_position = filesystem_->GetCurrentPosition(fd_.get());
-
- if (proto_size > header_->max_proto_size) {
- return absl_ports::InvalidArgumentError(IcingStringUtil::StringPrintf(
- "proto_size, %lld, was too large to write. Max is %d",
- static_cast<long long>(proto_size), header_->max_proto_size));
- }
-
- // At this point, we've guaranteed that proto_size is under kMaxProtoSize
- // (see
- // ::Create), so we can safely store it in an int.
- int final_size = 0;
-
- std::string proto_str;
- google::protobuf::io::StringOutputStream proto_stream(&proto_str);
-
- if (header_->compress) {
- google::protobuf::io::GzipOutputStream::Options options;
- options.format = google::protobuf::io::GzipOutputStream::ZLIB;
- options.compression_level = kDeflateCompressionLevel;
-
- google::protobuf::io::GzipOutputStream compressing_stream(&proto_stream,
- options);
-
- bool success = proto.SerializeToZeroCopyStream(&compressing_stream) &&
- compressing_stream.Close();
-
- if (!success) {
- return absl_ports::InternalError("Error compressing proto.");
- }
-
- final_size = proto_str.size();
-
- // In case the compressed proto is larger than the original proto, we also
- // can't write it.
- if (final_size > header_->max_proto_size) {
- return absl_ports::InvalidArgumentError(IcingStringUtil::StringPrintf(
- "Compressed proto size, %d, was greater than "
- "max_proto_size, %d",
- final_size, header_->max_proto_size));
- }
- } else {
- // Serialize the proto directly into the write buffer at an offset of the
- // metadata.
- proto.SerializeToZeroCopyStream(&proto_stream);
- final_size = proto_str.size();
- }
-
- // 1st byte for magic, next 3 bytes for proto size.
- metadata = (kProtoMagic << 24) | final_size;
-
- // Actually write metadata, has to be done after we know the possibly
- // compressed proto size
- if (!filesystem_->Write(fd_.get(), &metadata, metadata_size)) {
- return absl_ports::InternalError(
- absl_ports::StrCat("Failed to write proto metadata to: ", file_path_));
- }
-
- // Write the serialized proto
- if (!filesystem_->Write(fd_.get(), proto_str.data(), proto_str.size())) {
- return absl_ports::InternalError(
- absl_ports::StrCat("Failed to write proto to: ", file_path_));
- }
-
- return current_position;
-}
-
-template <typename ProtoT>
libtextclassifier3::StatusOr<ProtoT> FileBackedProtoLog<ProtoT>::ReadProto(
int64_t file_offset) const {
int64_t file_size = filesystem_->GetFileSize(fd_.get());
@@ -806,83 +592,6 @@ libtextclassifier3::StatusOr<ProtoT> FileBackedProtoLog<ProtoT>::ReadProto(
}
template <typename ProtoT>
-libtextclassifier3::Status FileBackedProtoLog<ProtoT>::EraseProto(
- int64_t file_offset) {
- int64_t file_size = filesystem_->GetFileSize(fd_.get());
- if (file_offset >= file_size) {
- // file_size points to the next byte to write at, so subtract one to get
- // the inclusive, actual size of file.
- return absl_ports::OutOfRangeError(IcingStringUtil::StringPrintf(
- "Trying to erase data at a location, %lld, "
- "out of range of the file size, %lld",
- static_cast<long long>(file_offset),
- static_cast<long long>(file_size - 1)));
- }
-
- MemoryMappedFile mmapped_file(
- *filesystem_, file_path_,
- MemoryMappedFile::Strategy::READ_WRITE_AUTO_SYNC);
-
- // Read out the metadata
- ICING_ASSIGN_OR_RETURN(
- int metadata, ReadProtoMetadata(&mmapped_file, file_offset, file_size));
-
- ICING_RETURN_IF_ERROR(mmapped_file.Remap(file_offset + sizeof(metadata),
- GetProtoSize(metadata)));
-
- // We need to update the crc checksum if the erased area is before the
- // rewind position.
- if (file_offset + sizeof(metadata) < header_->rewind_offset) {
- // We need to calculate [original string xor 0s].
- // The xored string is the same as the original string because 0 xor 0 =
- // 0, 1 xor 0 = 1.
- const std::string_view xored_str(mmapped_file.region(),
- mmapped_file.region_size());
-
- Crc32 crc(header_->log_checksum);
- ICING_ASSIGN_OR_RETURN(
- uint32_t new_crc,
- crc.UpdateWithXor(
- xored_str,
- /*full_data_size=*/header_->rewind_offset - sizeof(Header),
- /*position=*/file_offset + sizeof(metadata) - sizeof(Header)));
-
- header_->log_checksum = new_crc;
- header_->header_checksum = header_->CalculateHeaderChecksum();
-
- if (!filesystem_->PWrite(fd_.get(), /*offset=*/0, header_.get(),
- sizeof(Header))) {
- return absl_ports::InternalError(
- absl_ports::StrCat("Failed to update header to: ", file_path_));
- }
- }
-
- memset(mmapped_file.mutable_region(), '\0', mmapped_file.region_size());
- return libtextclassifier3::Status::OK;
-}
-
-template <typename ProtoT>
-libtextclassifier3::StatusOr<int64_t> FileBackedProtoLog<ProtoT>::GetDiskUsage()
- const {
- int64_t size = filesystem_->GetDiskUsage(file_path_.c_str());
- if (size == Filesystem::kBadFileSize) {
- return absl_ports::InternalError("Failed to get disk usage of proto log");
- }
- return size;
-}
-
-template <typename ProtoT>
-libtextclassifier3::StatusOr<int64_t>
-FileBackedProtoLog<ProtoT>::GetElementsFileSize() const {
- int64_t total_file_size = filesystem_->GetFileSize(file_path_.c_str());
- if (total_file_size == Filesystem::kBadFileSize) {
- return absl_ports::InternalError(
- "Failed to get file size of elments in the proto log");
- }
- return total_file_size - sizeof(Header);
-}
-
-template <typename ProtoT>
FileBackedProtoLog<ProtoT>::Iterator::Iterator(const Filesystem& filesystem,
const std::string& file_path,
int64_t initial_offset)
@@ -964,51 +673,6 @@ libtextclassifier3::StatusOr<int> FileBackedProtoLog<ProtoT>::ReadProtoMetadata(
return metadata;
}
-template <typename ProtoT>
-libtextclassifier3::Status FileBackedProtoLog<ProtoT>::PersistToDisk() {
- int64_t file_size = filesystem_->GetFileSize(file_path_.c_str());
- if (file_size == header_->rewind_offset) {
- // No new protos appended, don't need to update the checksum.
- return libtextclassifier3::Status::OK;
- }
-
- int64_t new_content_size = file_size - header_->rewind_offset;
- Crc32 crc;
- if (new_content_size < 0) {
- // File shrunk, recalculate the entire checksum.
- ICING_ASSIGN_OR_RETURN(
- crc, ComputeChecksum(filesystem_, file_path_, Crc32(), sizeof(Header),
- file_size));
- } else {
- // Append new changes to the existing checksum.
- ICING_ASSIGN_OR_RETURN(
- crc,
- ComputeChecksum(filesystem_, file_path_, Crc32(header_->log_checksum),
- header_->rewind_offset, file_size));
- }
-
- header_->log_checksum = crc.Get();
- header_->rewind_offset = file_size;
- header_->header_checksum = header_->CalculateHeaderChecksum();
-
- if (!filesystem_->PWrite(fd_.get(), /*offset=*/0, header_.get(),
- sizeof(Header)) ||
- !filesystem_->DataSync(fd_.get())) {
- return absl_ports::InternalError(
- absl_ports::StrCat("Failed to update header to: ", file_path_));
- }
-
- return libtextclassifier3::Status::OK;
-}
-
-template <typename ProtoT>
-libtextclassifier3::StatusOr<Crc32>
-FileBackedProtoLog<ProtoT>::ComputeChecksum() {
- return FileBackedProtoLog<ProtoT>::ComputeChecksum(
- filesystem_, file_path_, Crc32(), /*start=*/sizeof(Header),
- /*end=*/filesystem_->GetFileSize(file_path_.c_str()));
-}
-
} // namespace lib
} // namespace icing
diff --git a/icing/file/file-backed-proto-log_benchmark.cc b/icing/file/file-backed-proto-log_benchmark.cc
deleted file mode 100644
index c09fd5a..0000000
--- a/icing/file/file-backed-proto-log_benchmark.cc
+++ /dev/null
@@ -1,251 +0,0 @@
-// Copyright (C) 2019 Google LLC
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include <cstdint>
-#include <random>
-
-#include "testing/base/public/benchmark.h"
-#include "gmock/gmock.h"
-#include "icing/document-builder.h"
-#include "icing/file/file-backed-proto-log.h"
-#include "icing/file/filesystem.h"
-#include "icing/legacy/core/icing-string-util.h"
-#include "icing/proto/document.pb.h"
-#include "icing/testing/common-matchers.h"
-#include "icing/testing/random-string.h"
-#include "icing/testing/tmp-directory.h"
-
-// go/microbenchmarks
-//
-// To build and run on a local machine:
-// $ blaze build -c opt --dynamic_mode=off --copt=-gmlt
-// icing/file:file-backed-proto-log_benchmark
-//
-// $ blaze-bin/icing/file/file-backed-proto-log_benchmark
-// --benchmarks=all
-//
-//
-// To build and run on an Android device (must be connected and rooted):
-// $ blaze build --copt="-DGOOGLE_COMMANDLINEFLAGS_FULL_API=1"
-// --config=android_arm64 -c opt --dynamic_mode=off --copt=-gmlt
-// icing/file:file-backed-proto-log_benchmark
-//
-// $ adb root
-//
-// $ adb push
-// blaze-bin/icing/file/file-backed-proto-log_benchmark
-// /data/local/tmp/
-//
-// $ adb shell /data/local/tmp/file-backed-proto-log-benchmark
-// --benchmarks=all
-
-namespace icing {
-namespace lib {
-
-namespace {
-
-static void BM_Write(benchmark::State& state) {
- const Filesystem filesystem;
- int string_length = state.range(0);
- const std::string file_path = IcingStringUtil::StringPrintf(
- "%s%s%d%s", GetTestTempDir().c_str(), "/proto_", string_length, ".log");
- int max_proto_size = (1 << 24) - 1; // 16 MiB
- bool compress = true;
-
- // Make sure it doesn't already exist.
- filesystem.DeleteFile(file_path.c_str());
-
- auto proto_log =
- FileBackedProtoLog<DocumentProto>::Create(
- &filesystem, file_path,
- FileBackedProtoLog<DocumentProto>::Options(compress, max_proto_size))
- .ValueOrDie()
- .proto_log;
-
- DocumentProto document = DocumentBuilder().SetKey("namespace", "uri").Build();
-
- std::default_random_engine random;
- const std::string rand_str =
- RandomString(kAlNumAlphabet, string_length, &random);
-
- auto document_properties = document.add_properties();
- document_properties->set_name("string property");
- document_properties->add_string_values(rand_str);
-
- for (auto _ : state) {
- testing::DoNotOptimize(proto_log->WriteProto(document));
- }
- state.SetBytesProcessed(static_cast<int64_t>(state.iterations()) *
- string_length);
-
- // Cleanup after ourselves
- filesystem.DeleteFile(file_path.c_str());
-}
-BENCHMARK(BM_Write)
- ->Arg(1)
- ->Arg(32)
- ->Arg(512)
- ->Arg(1024)
- ->Arg(4 * 1024)
- ->Arg(8 * 1024)
- ->Arg(16 * 1024)
- ->Arg(32 * 1024)
- ->Arg(256 * 1024)
- ->Arg(2 * 1024 * 1024)
- ->Arg(8 * 1024 * 1024)
- ->Arg(15 * 1024 * 1024); // We do 15MiB here since our max proto size is
- // 16MiB, and we need some extra space for the
- // rest of the document properties
-
-static void BM_Read(benchmark::State& state) {
- const Filesystem filesystem;
- int string_length = state.range(0);
- const std::string file_path = IcingStringUtil::StringPrintf(
- "%s%s%d%s", GetTestTempDir().c_str(), "/proto_", string_length, ".log");
- int max_proto_size = (1 << 24) - 1; // 16 MiB
- bool compress = true;
-
- // Make sure it doesn't already exist.
- filesystem.DeleteFile(file_path.c_str());
-
- auto proto_log =
- FileBackedProtoLog<DocumentProto>::Create(
- &filesystem, file_path,
- FileBackedProtoLog<DocumentProto>::Options(compress, max_proto_size))
- .ValueOrDie()
- .proto_log;
-
- DocumentProto document = DocumentBuilder().SetKey("namespace", "uri").Build();
-
- std::default_random_engine random;
- const std::string rand_str =
- RandomString(kAlNumAlphabet, string_length, &random);
-
- auto document_properties = document.add_properties();
- document_properties->set_name("string property");
- document_properties->add_string_values(rand_str);
-
- ICING_ASSERT_OK_AND_ASSIGN(int64_t write_offset,
- proto_log->WriteProto(document));
-
- for (auto _ : state) {
- testing::DoNotOptimize(proto_log->ReadProto(write_offset));
- }
- state.SetBytesProcessed(static_cast<int64_t>(state.iterations()) *
- string_length);
-
- // Cleanup after ourselves
- filesystem.DeleteFile(file_path.c_str());
-}
-BENCHMARK(BM_Read)
- ->Arg(1)
- ->Arg(32)
- ->Arg(512)
- ->Arg(1024)
- ->Arg(4 * 1024)
- ->Arg(8 * 1024)
- ->Arg(16 * 1024)
- ->Arg(32 * 1024)
- ->Arg(256 * 1024)
- ->Arg(2 * 1024 * 1024)
- ->Arg(8 * 1024 * 1024)
- ->Arg(15 * 1024 * 1024); // We do 15MiB here since our max proto size is
- // 16MiB, and we need some extra space for the
- // rest of the document properties
-
-static void BM_Erase(benchmark::State& state) {
- const Filesystem filesystem;
- const std::string file_path = IcingStringUtil::StringPrintf(
- "%s%s", GetTestTempDir().c_str(), "/proto.log");
- int max_proto_size = (1 << 24) - 1; // 16 MiB
- bool compress = true;
-
- // Make sure it doesn't already exist.
- filesystem.DeleteFile(file_path.c_str());
-
- auto proto_log =
- FileBackedProtoLog<DocumentProto>::Create(
- &filesystem, file_path,
- FileBackedProtoLog<DocumentProto>::Options(compress, max_proto_size))
- .ValueOrDie()
- .proto_log;
-
- DocumentProto document = DocumentBuilder().SetKey("namespace", "uri").Build();
-
- std::default_random_engine random;
- const std::string rand_str = RandomString(kAlNumAlphabet, /*len=*/1, &random);
-
- auto document_properties = document.add_properties();
- document_properties->set_name("string property");
- document_properties->add_string_values(rand_str);
-
- for (auto _ : state) {
- state.PauseTiming();
- ICING_ASSERT_OK_AND_ASSIGN(int64_t write_offset,
- proto_log->WriteProto(document));
- state.ResumeTiming();
-
- testing::DoNotOptimize(proto_log->EraseProto(write_offset));
- }
-
- // Cleanup after ourselves
- filesystem.DeleteFile(file_path.c_str());
-}
-BENCHMARK(BM_Erase);
-
-static void BM_ComputeChecksum(benchmark::State& state) {
- const Filesystem filesystem;
- const std::string file_path = GetTestTempDir() + "/proto.log";
- int max_proto_size = (1 << 24) - 1; // 16 MiB
- bool compress = true;
-
- // Make sure it doesn't already exist.
- filesystem.DeleteFile(file_path.c_str());
-
- auto proto_log =
- FileBackedProtoLog<DocumentProto>::Create(
- &filesystem, file_path,
- FileBackedProtoLog<DocumentProto>::Options(compress, max_proto_size))
- .ValueOrDie()
- .proto_log;
-
- DocumentProto document = DocumentBuilder().SetKey("namespace", "uri").Build();
-
- // Make each document 1KiB
- int string_length = 1024;
- std::default_random_engine random;
- const std::string rand_str =
- RandomString(kAlNumAlphabet, string_length, &random);
-
- auto document_properties = document.add_properties();
- document_properties->set_name("string property");
- document_properties->add_string_values(rand_str);
-
- int num_docs = state.range(0);
- for (int i = 0; i < num_docs; ++i) {
- ICING_ASSERT_OK(proto_log->WriteProto(document));
- }
-
- for (auto _ : state) {
- testing::DoNotOptimize(proto_log->ComputeChecksum());
- }
-
- // Cleanup after ourselves
- filesystem.DeleteFile(file_path.c_str());
-}
-BENCHMARK(BM_ComputeChecksum)->Range(1024, 1 << 20);
-
-} // namespace
-} // namespace lib
-} // namespace icing
diff --git a/icing/file/file-backed-proto-log_test.cc b/icing/file/file-backed-proto-log_test.cc
index d429277..eccb0c7 100644
--- a/icing/file/file-backed-proto-log_test.cc
+++ b/icing/file/file-backed-proto-log_test.cc
@@ -19,10 +19,7 @@
#include "gmock/gmock.h"
#include "gtest/gtest.h"
-#include "icing/document-builder.h"
#include "icing/file/filesystem.h"
-#include "icing/file/mock-filesystem.h"
-#include "icing/portable/equals-proto.h"
#include "icing/proto/document.pb.h"
#include "icing/testing/common-matchers.h"
#include "icing/testing/tmp-directory.h"
@@ -32,14 +29,7 @@ namespace lib {
namespace {
-using ::icing::lib::portable_equals_proto::EqualsProto;
-using ::testing::A;
-using ::testing::Eq;
-using ::testing::Gt;
-using ::testing::Not;
using ::testing::NotNull;
-using ::testing::Pair;
-using ::testing::Return;
class FileBackedProtoLogTest : public ::testing::Test {
protected:
@@ -87,193 +77,6 @@ TEST_F(FileBackedProtoLogTest, Initialize) {
StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
}
-TEST_F(FileBackedProtoLogTest, WriteProtoTooLarge) {
- int max_proto_size = 1;
- ICING_ASSERT_OK_AND_ASSIGN(
- FileBackedProtoLog<DocumentProto>::CreateResult create_result,
- FileBackedProtoLog<DocumentProto>::Create(
- &filesystem_, file_path_,
- FileBackedProtoLog<DocumentProto>::Options(compress_,
- max_proto_size)));
- auto proto_log = std::move(create_result.proto_log);
- ASSERT_FALSE(create_result.has_data_loss());
-
- DocumentProto document = DocumentBuilder().SetKey("namespace", "uri").Build();
-
- // Proto is too large for the max_proto_size_in
- ASSERT_THAT(proto_log->WriteProto(document),
- StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
-}
-
-TEST_F(FileBackedProtoLogTest, ReadProtoWrongKProtoMagic) {
- ICING_ASSERT_OK_AND_ASSIGN(
- FileBackedProtoLog<DocumentProto>::CreateResult create_result,
- FileBackedProtoLog<DocumentProto>::Create(
- &filesystem_, file_path_,
- FileBackedProtoLog<DocumentProto>::Options(compress_,
- max_proto_size_)));
- auto proto_log = std::move(create_result.proto_log);
- ASSERT_FALSE(create_result.has_data_loss());
-
- // Write a proto
- DocumentProto document = DocumentBuilder().SetKey("namespace", "uri").Build();
-
- ICING_ASSERT_OK_AND_ASSIGN(int64_t file_offset,
- proto_log->WriteProto(document));
-
- // The 4 bytes of metadata that just doesn't have the same kProtoMagic
- // specified in file-backed-proto-log.h
- uint32_t wrong_magic = 0x7E000000;
-
- // Sanity check that we opened the file correctly
- int fd = filesystem_.OpenForWrite(file_path_.c_str());
- ASSERT_GT(fd, 0);
-
- // Write the wrong kProtoMagic in, kProtoMagics are stored at the beginning of
- // a proto entry.
- filesystem_.PWrite(fd, file_offset, &wrong_magic, sizeof(wrong_magic));
-
- ASSERT_THAT(proto_log->ReadProto(file_offset),
- StatusIs(libtextclassifier3::StatusCode::INTERNAL));
-}
-
-TEST_F(FileBackedProtoLogTest, ReadWriteUncompressedProto) {
- int last_offset;
- {
- ICING_ASSERT_OK_AND_ASSIGN(
- FileBackedProtoLog<DocumentProto>::CreateResult create_result,
- FileBackedProtoLog<DocumentProto>::Create(
- &filesystem_, file_path_,
- FileBackedProtoLog<DocumentProto>::Options(
- /*compress_in=*/false, max_proto_size_)));
- auto proto_log = std::move(create_result.proto_log);
- ASSERT_FALSE(create_result.has_data_loss());
-
- // Write the first proto
- DocumentProto document1 =
- DocumentBuilder().SetKey("namespace1", "uri1").Build();
-
- ICING_ASSERT_OK_AND_ASSIGN(int written_position,
- proto_log->WriteProto(document1));
-
- int document1_offset = written_position;
-
- // Check that what we read is what we wrote
- ASSERT_THAT(proto_log->ReadProto(written_position),
- IsOkAndHolds(EqualsProto(document1)));
-
- // Write a second proto that's close to the max size. Leave some room for
- // the rest of the proto properties.
- std::string long_str(max_proto_size_ - 1024, 'a');
- DocumentProto document2 = DocumentBuilder()
- .SetKey("namespace2", "uri2")
- .AddStringProperty("long_str", long_str)
- .Build();
-
- ICING_ASSERT_OK_AND_ASSIGN(written_position,
- proto_log->WriteProto(document2));
-
- int document2_offset = written_position;
- last_offset = written_position;
- ASSERT_GT(document2_offset, document1_offset);
-
- // Check the second proto
- ASSERT_THAT(proto_log->ReadProto(written_position),
- IsOkAndHolds(EqualsProto(document2)));
-
- ICING_ASSERT_OK(proto_log->PersistToDisk());
- }
-
- {
- // Make a new proto_log with the same file_path, and make sure we
- // can still write to the same underlying file.
- ICING_ASSERT_OK_AND_ASSIGN(
- FileBackedProtoLog<DocumentProto>::CreateResult create_result,
- FileBackedProtoLog<DocumentProto>::Create(
- &filesystem_, file_path_,
- FileBackedProtoLog<DocumentProto>::Options(
- /*compress_in=*/false, max_proto_size_)));
- auto recreated_proto_log = std::move(create_result.proto_log);
- ASSERT_FALSE(create_result.has_data_loss());
-
- // Write a third proto
- DocumentProto document3 =
- DocumentBuilder().SetKey("namespace3", "uri3").Build();
-
- ASSERT_THAT(recreated_proto_log->WriteProto(document3),
- IsOkAndHolds(Gt(last_offset)));
- }
-}
-
-TEST_F(FileBackedProtoLogTest, ReadWriteCompressedProto) {
- int last_offset;
-
- {
- ICING_ASSERT_OK_AND_ASSIGN(
- FileBackedProtoLog<DocumentProto>::CreateResult create_result,
- FileBackedProtoLog<DocumentProto>::Create(
- &filesystem_, file_path_,
- FileBackedProtoLog<DocumentProto>::Options(
- /*compress_in=*/true, max_proto_size_)));
- auto proto_log = std::move(create_result.proto_log);
- ASSERT_FALSE(create_result.has_data_loss());
-
- // Write the first proto
- DocumentProto document1 =
- DocumentBuilder().SetKey("namespace1", "uri1").Build();
-
- ICING_ASSERT_OK_AND_ASSIGN(int written_position,
- proto_log->WriteProto(document1));
-
- int document1_offset = written_position;
-
- // Check that what we read is what we wrote
- ASSERT_THAT(proto_log->ReadProto(written_position),
- IsOkAndHolds(EqualsProto(document1)));
-
- // Write a second proto that's close to the max size. Leave some room for
- // the rest of the proto properties.
- std::string long_str(max_proto_size_ - 1024, 'a');
- DocumentProto document2 = DocumentBuilder()
- .SetKey("namespace2", "uri2")
- .AddStringProperty("long_str", long_str)
- .Build();
-
- ICING_ASSERT_OK_AND_ASSIGN(written_position,
- proto_log->WriteProto(document2));
-
- int document2_offset = written_position;
- last_offset = written_position;
- ASSERT_GT(document2_offset, document1_offset);
-
- // Check the second proto
- ASSERT_THAT(proto_log->ReadProto(written_position),
- IsOkAndHolds(EqualsProto(document2)));
-
- ICING_ASSERT_OK(proto_log->PersistToDisk());
- }
-
- {
- // Make a new proto_log with the same file_path, and make sure we
- // can still write to the same underlying file.
- ICING_ASSERT_OK_AND_ASSIGN(
- FileBackedProtoLog<DocumentProto>::CreateResult create_result,
- FileBackedProtoLog<DocumentProto>::Create(
- &filesystem_, file_path_,
- FileBackedProtoLog<DocumentProto>::Options(
- /*compress_in=*/true, max_proto_size_)));
- auto recreated_proto_log = std::move(create_result.proto_log);
- ASSERT_FALSE(create_result.has_data_loss());
-
- // Write a third proto
- DocumentProto document3 =
- DocumentBuilder().SetKey("namespace3", "uri3").Build();
-
- ASSERT_THAT(recreated_proto_log->WriteProto(document3),
- IsOkAndHolds(Gt(last_offset)));
- }
-}
-
TEST_F(FileBackedProtoLogTest, CorruptHeader) {
{
ICING_ASSERT_OK_AND_ASSIGN(
@@ -303,382 +106,6 @@ TEST_F(FileBackedProtoLogTest, CorruptHeader) {
}
}
-TEST_F(FileBackedProtoLogTest, CorruptContent) {
- {
- ICING_ASSERT_OK_AND_ASSIGN(
- FileBackedProtoLog<DocumentProto>::CreateResult create_result,
- FileBackedProtoLog<DocumentProto>::Create(
- &filesystem_, file_path_,
- FileBackedProtoLog<DocumentProto>::Options(compress_,
- max_proto_size_)));
- auto proto_log = std::move(create_result.proto_log);
- EXPECT_FALSE(create_result.has_data_loss());
-
- DocumentProto document =
- DocumentBuilder().SetKey("namespace1", "uri1").Build();
-
- // Write and persist an document.
- ICING_ASSERT_OK_AND_ASSIGN(int document_offset,
- proto_log->WriteProto(document));
- ICING_ASSERT_OK(proto_log->PersistToDisk());
-
- // "Corrupt" the content written in the log.
- document.set_uri("invalid");
- std::string serialized_document = document.SerializeAsString();
- filesystem_.PWrite(file_path_.c_str(), document_offset,
- serialized_document.data(), serialized_document.size());
- }
-
- {
- // We can recover, but we have data loss.
- ICING_ASSERT_OK_AND_ASSIGN(
- FileBackedProtoLog<DocumentProto>::CreateResult create_result,
- FileBackedProtoLog<DocumentProto>::Create(
- &filesystem_, file_path_,
- FileBackedProtoLog<DocumentProto>::Options(compress_,
- max_proto_size_)));
- auto proto_log = std::move(create_result.proto_log);
- ASSERT_TRUE(create_result.has_data_loss());
- ASSERT_THAT(create_result.data_loss, Eq(DataLoss::COMPLETE));
-
- // Lost everything in the log since the rewind position doesn't help if
- // there's been data corruption within the persisted region
- ASSERT_EQ(filesystem_.GetFileSize(file_path_.c_str()),
- sizeof(FileBackedProtoLog<DocumentProto>::Header));
- }
-}
-
-TEST_F(FileBackedProtoLogTest, PersistToDisk) {
- DocumentProto document1 =
- DocumentBuilder().SetKey("namespace1", "uri1").Build();
- DocumentProto document2 =
- DocumentBuilder().SetKey("namespace2", "uri2").Build();
- int document1_offset, document2_offset;
- int log_size;
-
- {
- ICING_ASSERT_OK_AND_ASSIGN(
- FileBackedProtoLog<DocumentProto>::CreateResult create_result,
- FileBackedProtoLog<DocumentProto>::Create(
- &filesystem_, file_path_,
- FileBackedProtoLog<DocumentProto>::Options(compress_,
- max_proto_size_)));
- auto proto_log = std::move(create_result.proto_log);
- ASSERT_FALSE(create_result.has_data_loss());
-
- // Write and persist the first proto
- ICING_ASSERT_OK_AND_ASSIGN(document1_offset,
- proto_log->WriteProto(document1));
- ICING_ASSERT_OK(proto_log->PersistToDisk());
-
- // Write, but don't explicitly persist the second proto
- ICING_ASSERT_OK_AND_ASSIGN(document2_offset,
- proto_log->WriteProto(document2));
-
- // Check that what we read is what we wrote
- ASSERT_THAT(proto_log->ReadProto(document1_offset),
- IsOkAndHolds(EqualsProto(document1)));
- ASSERT_THAT(proto_log->ReadProto(document2_offset),
- IsOkAndHolds(EqualsProto(document2)));
-
- log_size = filesystem_.GetFileSize(file_path_.c_str());
- ASSERT_GT(log_size, 0);
- }
-
- {
- // The header rewind position and checksum aren't updated in this "system
- // crash" scenario.
-
- std::string bad_proto =
- "some incomplete proto that we didn't finish writing before the system "
- "crashed";
- filesystem_.PWrite(file_path_.c_str(), log_size, bad_proto.data(),
- bad_proto.size());
-
- // Double check that we actually wrote something to the underlying file
- ASSERT_GT(filesystem_.GetFileSize(file_path_.c_str()), log_size);
- }
-
- {
- // We can recover, but we have data loss
- ICING_ASSERT_OK_AND_ASSIGN(
- FileBackedProtoLog<DocumentProto>::CreateResult create_result,
- FileBackedProtoLog<DocumentProto>::Create(
- &filesystem_, file_path_,
- FileBackedProtoLog<DocumentProto>::Options(compress_,
- max_proto_size_)));
- auto proto_log = std::move(create_result.proto_log);
- ASSERT_TRUE(create_result.has_data_loss());
- ASSERT_THAT(create_result.data_loss, Eq(DataLoss::PARTIAL));
-
- // Check that everything was persisted across instances
- ASSERT_THAT(proto_log->ReadProto(document1_offset),
- IsOkAndHolds(EqualsProto(document1)));
- ASSERT_THAT(proto_log->ReadProto(document2_offset),
- IsOkAndHolds(EqualsProto(document2)));
-
- // We correctly rewound to the last good state.
- ASSERT_EQ(log_size, filesystem_.GetFileSize(file_path_.c_str()));
- }
-}
-
-TEST_F(FileBackedProtoLogTest, Iterator) {
- DocumentProto document1 =
- DocumentBuilder().SetKey("namespace", "uri1").Build();
- DocumentProto document2 =
- DocumentBuilder().SetKey("namespace", "uri2").Build();
-
- ICING_ASSERT_OK_AND_ASSIGN(
- FileBackedProtoLog<DocumentProto>::CreateResult create_result,
- FileBackedProtoLog<DocumentProto>::Create(
- &filesystem_, file_path_,
- FileBackedProtoLog<DocumentProto>::Options(compress_,
- max_proto_size_)));
- auto proto_log = std::move(create_result.proto_log);
- ASSERT_FALSE(create_result.has_data_loss());
-
- {
- // Empty iterator
- auto iterator = proto_log->GetIterator();
- ASSERT_THAT(iterator.Advance(),
- StatusIs(libtextclassifier3::StatusCode::OUT_OF_RANGE));
- }
-
- {
- // Iterates through some documents
- ICING_ASSERT_OK(proto_log->WriteProto(document1));
- ICING_ASSERT_OK(proto_log->WriteProto(document2));
- auto iterator = proto_log->GetIterator();
- // 1st proto
- ICING_ASSERT_OK(iterator.Advance());
- ASSERT_THAT(proto_log->ReadProto(iterator.GetOffset()),
- IsOkAndHolds(EqualsProto(document1)));
- // 2nd proto
- ICING_ASSERT_OK(iterator.Advance());
- ASSERT_THAT(proto_log->ReadProto(iterator.GetOffset()),
- IsOkAndHolds(EqualsProto(document2)));
- // Tries to advance
- ASSERT_THAT(iterator.Advance(),
- StatusIs(libtextclassifier3::StatusCode::OUT_OF_RANGE));
- }
-
- {
- // Iterator with bad filesystem
- MockFilesystem mock_filesystem;
- ON_CALL(mock_filesystem, GetFileSize(A<const char *>()))
- .WillByDefault(Return(Filesystem::kBadFileSize));
- FileBackedProtoLog<DocumentProto>::Iterator bad_iterator(
- mock_filesystem, file_path_, /*initial_offset=*/0);
- ASSERT_THAT(bad_iterator.Advance(),
- StatusIs(libtextclassifier3::StatusCode::OUT_OF_RANGE));
- }
-}
-
-TEST_F(FileBackedProtoLogTest, ComputeChecksum) {
- DocumentProto document = DocumentBuilder().SetKey("namespace", "uri").Build();
- Crc32 checksum;
-
- {
- ICING_ASSERT_OK_AND_ASSIGN(
- FileBackedProtoLog<DocumentProto>::CreateResult create_result,
- FileBackedProtoLog<DocumentProto>::Create(
- &filesystem_, file_path_,
- FileBackedProtoLog<DocumentProto>::Options(compress_,
- max_proto_size_)));
- auto proto_log = std::move(create_result.proto_log);
- ASSERT_FALSE(create_result.has_data_loss());
-
- ICING_EXPECT_OK(proto_log->WriteProto(document));
-
- ICING_ASSERT_OK_AND_ASSIGN(checksum, proto_log->ComputeChecksum());
-
- // Calling it twice with no changes should get us the same checksum
- EXPECT_THAT(proto_log->ComputeChecksum(), IsOkAndHolds(Eq(checksum)));
- }
-
- {
- ICING_ASSERT_OK_AND_ASSIGN(
- FileBackedProtoLog<DocumentProto>::CreateResult create_result,
- FileBackedProtoLog<DocumentProto>::Create(
- &filesystem_, file_path_,
- FileBackedProtoLog<DocumentProto>::Options(compress_,
- max_proto_size_)));
- auto proto_log = std::move(create_result.proto_log);
- ASSERT_FALSE(create_result.has_data_loss());
-
- // Checksum should be consistent across instances
- EXPECT_THAT(proto_log->ComputeChecksum(), IsOkAndHolds(Eq(checksum)));
-
- // PersistToDisk shouldn't affect the checksum value
- ICING_EXPECT_OK(proto_log->PersistToDisk());
- EXPECT_THAT(proto_log->ComputeChecksum(), IsOkAndHolds(Eq(checksum)));
-
- // Check that modifying the log leads to a different checksum
- ICING_EXPECT_OK(proto_log->WriteProto(document));
- EXPECT_THAT(proto_log->ComputeChecksum(), IsOkAndHolds(Not(Eq(checksum))));
- }
-}
-
-TEST_F(FileBackedProtoLogTest, EraseProtoShouldSetZero) {
- DocumentProto document1 =
- DocumentBuilder().SetKey("namespace", "uri1").Build();
-
- ICING_ASSERT_OK_AND_ASSIGN(
- FileBackedProtoLog<DocumentProto>::CreateResult create_result,
- FileBackedProtoLog<DocumentProto>::Create(
- &filesystem_, file_path_,
- FileBackedProtoLog<DocumentProto>::Options(compress_,
- max_proto_size_)));
- auto proto_log = std::move(create_result.proto_log);
- ASSERT_FALSE(create_result.has_data_loss());
-
- // Writes and erases proto
- ICING_ASSERT_OK_AND_ASSIGN(int64_t document1_offset,
- proto_log->WriteProto(document1));
- ICING_ASSERT_OK(proto_log->EraseProto(document1_offset));
-
- // Checks if the erased area is set to 0.
- int64_t file_size = filesystem_.GetFileSize(file_path_.c_str());
- MemoryMappedFile mmapped_file(filesystem_, file_path_,
- MemoryMappedFile::Strategy::READ_ONLY);
-
- // document1_offset + sizeof(int) is the start byte of the proto where
- // sizeof(int) is the size of the proto metadata.
- mmapped_file.Remap(document1_offset + sizeof(int), file_size - 1);
- for (size_t i = 0; i < mmapped_file.region_size(); ++i) {
- ASSERT_THAT(mmapped_file.region()[i], Eq(0));
- }
-}
-
-TEST_F(FileBackedProtoLogTest, EraseProtoShouldReturnNotFound) {
- DocumentProto document1 =
- DocumentBuilder().SetKey("namespace", "uri1").Build();
- DocumentProto document2 =
- DocumentBuilder().SetKey("namespace", "uri2").Build();
-
- ICING_ASSERT_OK_AND_ASSIGN(
- FileBackedProtoLog<DocumentProto>::CreateResult create_result,
- FileBackedProtoLog<DocumentProto>::Create(
- &filesystem_, file_path_,
- FileBackedProtoLog<DocumentProto>::Options(compress_,
- max_proto_size_)));
- auto proto_log = std::move(create_result.proto_log);
- ASSERT_FALSE(create_result.has_data_loss());
-
- // Writes 2 protos
- ICING_ASSERT_OK_AND_ASSIGN(int64_t document1_offset,
- proto_log->WriteProto(document1));
- ICING_ASSERT_OK_AND_ASSIGN(int64_t document2_offset,
- proto_log->WriteProto(document2));
-
- // Erases the first proto
- ICING_ASSERT_OK(proto_log->EraseProto(document1_offset));
-
- // The first proto has been erased.
- ASSERT_THAT(proto_log->ReadProto(document1_offset),
- StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
- // The second proto should be returned.
- ASSERT_THAT(proto_log->ReadProto(document2_offset),
- IsOkAndHolds(EqualsProto(document2)));
-}
-
-TEST_F(FileBackedProtoLogTest, ChecksumShouldBeCorrectWithErasedProto) {
- DocumentProto document1 =
- DocumentBuilder().SetKey("namespace", "uri1").Build();
- DocumentProto document2 =
- DocumentBuilder().SetKey("namespace", "uri2").Build();
- DocumentProto document3 =
- DocumentBuilder().SetKey("namespace", "uri3").Build();
- DocumentProto document4 =
- DocumentBuilder().SetKey("namespace", "uri4").Build();
-
- int64_t document2_offset;
- int64_t document3_offset;
-
- {
- // Erase data after the rewind position. This won't update the checksum
- // immediately.
- ICING_ASSERT_OK_AND_ASSIGN(
- FileBackedProtoLog<DocumentProto>::CreateResult create_result,
- FileBackedProtoLog<DocumentProto>::Create(
- &filesystem_, file_path_,
- FileBackedProtoLog<DocumentProto>::Options(compress_,
- max_proto_size_)));
- auto proto_log = std::move(create_result.proto_log);
- ASSERT_FALSE(create_result.has_data_loss());
-
- // Writes 3 protos
- ICING_ASSERT_OK_AND_ASSIGN(int64_t document1_offset,
- proto_log->WriteProto(document1));
- ICING_ASSERT_OK_AND_ASSIGN(document2_offset,
- proto_log->WriteProto(document2));
- ICING_ASSERT_OK_AND_ASSIGN(document3_offset,
- proto_log->WriteProto(document3));
-
- // Erases the 1st proto, checksum won't be updated immediately because the
- // rewind position is 0.
- ICING_ASSERT_OK(proto_log->EraseProto(document1_offset));
-
- EXPECT_THAT(proto_log->ComputeChecksum(),
- IsOkAndHolds(Eq(Crc32(2293202502))));
- } // New checksum is updated in destructor.
-
- {
- // Erase data before the rewind position. This will update the checksum
- // immediately.
- ICING_ASSERT_OK_AND_ASSIGN(
- FileBackedProtoLog<DocumentProto>::CreateResult create_result,
- FileBackedProtoLog<DocumentProto>::Create(
- &filesystem_, file_path_,
- FileBackedProtoLog<DocumentProto>::Options(compress_,
- max_proto_size_)));
- auto proto_log = std::move(create_result.proto_log);
- ASSERT_FALSE(create_result.has_data_loss());
-
- // Erases the 2nd proto that is now before the rewind position. Checksum is
- // updated.
- ICING_ASSERT_OK(proto_log->EraseProto(document2_offset));
-
- EXPECT_THAT(proto_log->ComputeChecksum(),
- IsOkAndHolds(Eq(Crc32(639634028))));
- }
-
- {
- // Append data and erase data before the rewind position. This will update
- // the checksum twice: in EraseProto() and destructor.
- ICING_ASSERT_OK_AND_ASSIGN(
- FileBackedProtoLog<DocumentProto>::CreateResult create_result,
- FileBackedProtoLog<DocumentProto>::Create(
- &filesystem_, file_path_,
- FileBackedProtoLog<DocumentProto>::Options(compress_,
- max_proto_size_)));
- auto proto_log = std::move(create_result.proto_log);
- ASSERT_FALSE(create_result.has_data_loss());
-
- // Append a new document which is after the rewind position.
- ICING_ASSERT_OK(proto_log->WriteProto(document4));
-
- // Erases the 3rd proto that is now before the rewind position. Checksum is
- // updated.
- ICING_ASSERT_OK(proto_log->EraseProto(document3_offset));
-
- EXPECT_THAT(proto_log->ComputeChecksum(),
- IsOkAndHolds(Eq(Crc32(1990198693))));
- } // Checksum is updated with the newly appended document.
-
- {
- // A successful creation means that the checksum matches.
- ICING_ASSERT_OK_AND_ASSIGN(
- FileBackedProtoLog<DocumentProto>::CreateResult create_result,
- FileBackedProtoLog<DocumentProto>::Create(
- &filesystem_, file_path_,
- FileBackedProtoLog<DocumentProto>::Options(compress_,
- max_proto_size_)));
- auto proto_log = std::move(create_result.proto_log);
- EXPECT_FALSE(create_result.has_data_loss());
- }
-}
-
} // namespace
} // namespace lib
} // namespace icing
diff --git a/icing/icing-search-engine_benchmark.cc b/icing/icing-search-engine_benchmark.cc
index ba9aed1..5e610d5 100644
--- a/icing/icing-search-engine_benchmark.cc
+++ b/icing/icing-search-engine_benchmark.cc
@@ -43,7 +43,6 @@
#include "icing/testing/common-matchers.h"
#include "icing/testing/document-generator.h"
#include "icing/testing/random-string.h"
-#include "icing/testing/recorder-test-utils.h"
#include "icing/testing/schema-generator.h"
#include "icing/testing/tmp-directory.h"
@@ -178,12 +177,12 @@ class DestructibleDirectory {
};
std::vector<DocumentProto> GenerateRandomDocuments(
- EvenDistributionTypeSelector* type_selector, int num_docs) {
+ EvenDistributionTypeSelector* type_selector, int num_docs,
+ const std::vector<std::string>& language) {
std::vector<std::string> namespaces = CreateNamespaces(kAvgNumNamespaces);
EvenDistributionNamespaceSelector namespace_selector(namespaces);
std::default_random_engine random;
- std::vector<std::string> language = CreateLanguages(kLanguageSize, &random);
UniformDistributionLanguageTokenGenerator<std::default_random_engine>
token_generator(language, &random);
@@ -227,8 +226,9 @@ void BM_IndexLatency(benchmark::State& state) {
ASSERT_THAT(icing->SetSchema(schema).status(), ProtoIsOk());
int num_docs = state.range(0);
+ std::vector<std::string> language = CreateLanguages(kLanguageSize, &random);
const std::vector<DocumentProto> random_docs =
- GenerateRandomDocuments(&type_selector, num_docs);
+ GenerateRandomDocuments(&type_selector, num_docs, language);
Timer timer;
for (const DocumentProto& doc : random_docs) {
ASSERT_THAT(icing->Put(doc).status(), ProtoIsOk());
@@ -271,6 +271,56 @@ BENCHMARK(BM_IndexLatency)
->ArgPair(1 << 15, 10)
->ArgPair(1 << 17, 10);
+void BM_QueryLatency(benchmark::State& state) {
+ // Initialize the filesystem
+ std::string test_dir = GetTestTempDir() + "/icing/benchmark";
+ Filesystem filesystem;
+ DestructibleDirectory ddir(filesystem, test_dir);
+
+ // Create the schema.
+ std::default_random_engine random;
+ int num_types = kAvgNumNamespaces * kAvgNumTypes;
+ ExactStringPropertyGenerator property_generator;
+ SchemaGenerator<ExactStringPropertyGenerator> schema_generator(
+ /*num_properties=*/state.range(1), &property_generator);
+ SchemaProto schema = schema_generator.GenerateSchema(num_types);
+ EvenDistributionTypeSelector type_selector(schema);
+
+ // Create the index.
+ IcingSearchEngineOptions options;
+ options.set_base_dir(test_dir);
+ options.set_index_merge_size(kIcingFullIndexSize);
+ std::unique_ptr<IcingSearchEngine> icing =
+ std::make_unique<IcingSearchEngine>(options);
+
+ ASSERT_THAT(icing->Initialize().status(), ProtoIsOk());
+ ASSERT_THAT(icing->SetSchema(schema).status(), ProtoIsOk());
+
+ int num_docs = state.range(0);
+ std::vector<std::string> language = CreateLanguages(kLanguageSize, &random);
+ const std::vector<DocumentProto> random_docs =
+ GenerateRandomDocuments(&type_selector, num_docs, language);
+ for (const DocumentProto& doc : random_docs) {
+ ASSERT_THAT(icing->Put(doc).status(), ProtoIsOk());
+ }
+
+ SearchSpecProto search_spec = CreateSearchSpec(
+ language.at(0), std::vector<std::string>(), TermMatchType::PREFIX);
+ ResultSpecProto result_spec = CreateResultSpec(1000000, 1000000, 1000000);
+ ScoringSpecProto scoring_spec =
+ CreateScoringSpec(ScoringSpecProto::RankingStrategy::CREATION_TIMESTAMP);
+ for (auto _ : state) {
+ SearchResultProto results = icing->Search(
+ search_spec, ScoringSpecProto::default_instance(), result_spec);
+ }
+}
+BENCHMARK(BM_QueryLatency)
+ // Arguments: num_indexed_documents, num_sections
+ ->ArgPair(32, 2)
+ ->ArgPair(128, 2)
+ ->ArgPair(1 << 10, 2)
+ ->ArgPair(1 << 13, 2);
+
void BM_IndexThroughput(benchmark::State& state) {
// Initialize the filesystem
std::string test_dir = GetTestTempDir() + "/icing/benchmark";
@@ -297,8 +347,9 @@ void BM_IndexThroughput(benchmark::State& state) {
ASSERT_THAT(icing->SetSchema(schema).status(), ProtoIsOk());
int num_docs = state.range(0);
+ std::vector<std::string> language = CreateLanguages(kLanguageSize, &random);
const std::vector<DocumentProto> random_docs =
- GenerateRandomDocuments(&type_selector, num_docs);
+ GenerateRandomDocuments(&type_selector, num_docs, language);
for (auto s : state) {
for (const DocumentProto& doc : random_docs) {
ASSERT_THAT(icing->Put(doc).status(), ProtoIsOk());
diff --git a/icing/result/snippet-retriever.cc b/icing/result/snippet-retriever.cc
index 33b343e..c46762e 100644
--- a/icing/result/snippet-retriever.cc
+++ b/icing/result/snippet-retriever.cc
@@ -109,7 +109,7 @@ class TokenMatcherExact : public TokenMatcher {
}
if (itr != unrestricted_query_terms_.end() &&
itr != restricted_query_terms_.end()) {
- return normalizer_.CalculateNormalizedMatchLength(token.text, *itr);
+ return normalizer_.FindNormalizedMatchEndPosition(token.text, *itr);
}
return CharacterIterator(token.text, -1, -1, -1);
}
@@ -135,14 +135,14 @@ class TokenMatcherPrefix : public TokenMatcher {
for (const std::string& query_term : unrestricted_query_terms_) {
if (query_term.length() <= s.length() &&
s.compare(0, query_term.length(), query_term) == 0) {
- return normalizer_.CalculateNormalizedMatchLength(token.text,
+ return normalizer_.FindNormalizedMatchEndPosition(token.text,
query_term);
}
}
for (const std::string& query_term : restricted_query_terms_) {
if (query_term.length() <= s.length() &&
s.compare(0, query_term.length(), query_term) == 0) {
- return normalizer_.CalculateNormalizedMatchLength(token.text,
+ return normalizer_.FindNormalizedMatchEndPosition(token.text,
query_term);
}
}
diff --git a/icing/result/snippet-retriever_test.cc b/icing/result/snippet-retriever_test.cc
index ad70038..f811941 100644
--- a/icing/result/snippet-retriever_test.cc
+++ b/icing/result/snippet-retriever_test.cc
@@ -691,10 +691,7 @@ TEST_F(SnippetRetrieverTest, PrefixSnippeting) {
EXPECT_THAT(GetWindows(content, snippet.entries(0)),
ElementsAre("subject foo"));
EXPECT_THAT(GetMatches(content, snippet.entries(0)), ElementsAre("foo"));
-
- if (normalizer_factory::GetNormalizerName() == MapNormalizer::kName) {
- EXPECT_THAT(GetSubMatches(content, snippet.entries(0)), ElementsAre("f"));
- }
+ EXPECT_THAT(GetSubMatches(content, snippet.entries(0)), ElementsAre("f"));
}
TEST_F(SnippetRetrieverTest, ExactSnippeting) {
@@ -738,9 +735,7 @@ TEST_F(SnippetRetrieverTest, SimpleSnippetingNoWindowing) {
GetString(&document, snippet.entries(0).property_name());
EXPECT_THAT(GetWindows(content, snippet.entries(0)), ElementsAre(""));
EXPECT_THAT(GetMatches(content, snippet.entries(0)), ElementsAre("foo"));
- if (normalizer_factory::GetNormalizerName() == MapNormalizer::kName) {
- EXPECT_THAT(GetSubMatches(content, snippet.entries(0)), ElementsAre("foo"));
- }
+ EXPECT_THAT(GetSubMatches(content, snippet.entries(0)), ElementsAre("foo"));
}
TEST_F(SnippetRetrieverTest, SnippetingMultipleMatches) {
@@ -787,19 +782,15 @@ TEST_F(SnippetRetrieverTest, SnippetingMultipleMatches) {
"we need to begin considering our options regarding body bar."));
EXPECT_THAT(GetMatches(content, snippet.entries(0)),
ElementsAre("foo", "bar"));
- if (normalizer_factory::GetNormalizerName() == MapNormalizer::kName) {
- EXPECT_THAT(GetSubMatches(content, snippet.entries(0)),
- ElementsAre("foo", "bar"));
- }
+ EXPECT_THAT(GetSubMatches(content, snippet.entries(0)),
+ ElementsAre("foo", "bar"));
EXPECT_THAT(snippet.entries(1).property_name(), Eq("subject"));
content = GetString(&document, snippet.entries(1).property_name());
EXPECT_THAT(GetWindows(content, snippet.entries(1)),
ElementsAre("subject foo"));
EXPECT_THAT(GetMatches(content, snippet.entries(1)), ElementsAre("foo"));
- if (normalizer_factory::GetNormalizerName() == MapNormalizer::kName) {
- EXPECT_THAT(GetSubMatches(content, snippet.entries(1)), ElementsAre("foo"));
- }
+ EXPECT_THAT(GetSubMatches(content, snippet.entries(1)), ElementsAre("foo"));
}
TEST_F(SnippetRetrieverTest, SnippetingMultipleMatchesSectionRestrict) {
@@ -849,10 +840,8 @@ TEST_F(SnippetRetrieverTest, SnippetingMultipleMatchesSectionRestrict) {
"we need to begin considering our options regarding body bar."));
EXPECT_THAT(GetMatches(content, snippet.entries(0)),
ElementsAre("foo", "bar"));
- if (normalizer_factory::GetNormalizerName() == MapNormalizer::kName) {
- EXPECT_THAT(GetSubMatches(content, snippet.entries(0)),
- ElementsAre("foo", "bar"));
- }
+ EXPECT_THAT(GetSubMatches(content, snippet.entries(0)),
+ ElementsAre("foo", "bar"));
}
TEST_F(SnippetRetrieverTest, SnippetingMultipleMatchesSectionRestrictedTerm) {
@@ -903,20 +892,16 @@ TEST_F(SnippetRetrieverTest, SnippetingMultipleMatchesSectionRestrictedTerm) {
"Concerning the subject of foo, we need to begin considering our"));
EXPECT_THAT(GetMatches(content, snippet.entries(0)),
ElementsAre("subject", "foo"));
- if (normalizer_factory::GetNormalizerName() == MapNormalizer::kName) {
- EXPECT_THAT(GetSubMatches(content, snippet.entries(0)),
- ElementsAre("subject", "foo"));
- }
+ EXPECT_THAT(GetSubMatches(content, snippet.entries(0)),
+ ElementsAre("subject", "foo"));
EXPECT_THAT(snippet.entries(1).property_name(), Eq("subject"));
content = GetString(&document, snippet.entries(1).property_name());
EXPECT_THAT(GetWindows(content, snippet.entries(1)),
ElementsAre("subject foo"));
EXPECT_THAT(GetMatches(content, snippet.entries(1)), ElementsAre("subject"));
- if (normalizer_factory::GetNormalizerName() == MapNormalizer::kName) {
- EXPECT_THAT(GetSubMatches(content, snippet.entries(1)),
- ElementsAre("subject"));
- }
+ EXPECT_THAT(GetSubMatches(content, snippet.entries(1)),
+ ElementsAre("subject"));
}
TEST_F(SnippetRetrieverTest, SnippetingMultipleMatchesOneMatchPerProperty) {
@@ -960,18 +945,14 @@ TEST_F(SnippetRetrieverTest, SnippetingMultipleMatchesOneMatchPerProperty) {
ElementsAre(
"Concerning the subject of foo, we need to begin considering our"));
EXPECT_THAT(GetMatches(content, snippet.entries(0)), ElementsAre("foo"));
- if (normalizer_factory::GetNormalizerName() == MapNormalizer::kName) {
- EXPECT_THAT(GetSubMatches(content, snippet.entries(0)), ElementsAre("foo"));
- }
+ EXPECT_THAT(GetSubMatches(content, snippet.entries(0)), ElementsAre("foo"));
EXPECT_THAT(snippet.entries(1).property_name(), Eq("subject"));
content = GetString(&document, snippet.entries(1).property_name());
EXPECT_THAT(GetWindows(content, snippet.entries(1)),
ElementsAre("subject foo"));
EXPECT_THAT(GetMatches(content, snippet.entries(1)), ElementsAre("foo"));
- if (normalizer_factory::GetNormalizerName() == MapNormalizer::kName) {
- EXPECT_THAT(GetSubMatches(content, snippet.entries(1)), ElementsAre("foo"));
- }
+ EXPECT_THAT(GetSubMatches(content, snippet.entries(1)), ElementsAre("foo"));
}
TEST_F(SnippetRetrieverTest, PrefixSnippetingNormalization) {
@@ -993,9 +974,7 @@ TEST_F(SnippetRetrieverTest, PrefixSnippetingNormalization) {
GetString(&document, snippet.entries(0).property_name());
EXPECT_THAT(GetWindows(content, snippet.entries(0)), ElementsAre("MDI team"));
EXPECT_THAT(GetMatches(content, snippet.entries(0)), ElementsAre("MDI"));
- if (normalizer_factory::GetNormalizerName() == MapNormalizer::kName) {
- EXPECT_THAT(GetSubMatches(content, snippet.entries(0)), ElementsAre("MD"));
- }
+ EXPECT_THAT(GetSubMatches(content, snippet.entries(0)), ElementsAre("MD"));
}
TEST_F(SnippetRetrieverTest, ExactSnippetingNormalization) {
@@ -1020,10 +999,8 @@ TEST_F(SnippetRetrieverTest, ExactSnippetingNormalization) {
ElementsAre("Some members are in Zürich."));
EXPECT_THAT(GetMatches(content, snippet.entries(0)), ElementsAre("Zürich"));
- if (normalizer_factory::GetNormalizerName() == MapNormalizer::kName) {
- EXPECT_THAT(GetSubMatches(content, snippet.entries(0)),
- ElementsAre("Zürich"));
- }
+ EXPECT_THAT(GetSubMatches(content, snippet.entries(0)),
+ ElementsAre("Zürich"));
}
TEST_F(SnippetRetrieverTest, SnippetingTestOneLevel) {
@@ -1084,20 +1061,13 @@ TEST_F(SnippetRetrieverTest, SnippetingTestOneLevel) {
GetString(&document, snippet.entries(0).property_name());
EXPECT_THAT(GetWindows(content, snippet.entries(0)), ElementsAre("polo"));
EXPECT_THAT(GetMatches(content, snippet.entries(0)), ElementsAre("polo"));
- if (normalizer_factory::GetNormalizerName() == MapNormalizer::kName) {
- EXPECT_THAT(GetSubMatches(content, snippet.entries(0)),
- ElementsAre("polo"));
- }
+ EXPECT_THAT(GetSubMatches(content, snippet.entries(0)), ElementsAre("polo"));
EXPECT_THAT(snippet.entries(1).property_name(), Eq("X[3]"));
content = GetString(&document, snippet.entries(1).property_name());
EXPECT_THAT(GetWindows(content, snippet.entries(1)), ElementsAre("polo"));
EXPECT_THAT(GetMatches(content, snippet.entries(1)), ElementsAre("polo"));
-
- if (normalizer_factory::GetNormalizerName() == MapNormalizer::kName) {
- EXPECT_THAT(GetSubMatches(content, snippet.entries(1)),
- ElementsAre("polo"));
- }
+ EXPECT_THAT(GetSubMatches(content, snippet.entries(1)), ElementsAre("polo"));
EXPECT_THAT(GetPropertyPaths(snippet),
ElementsAre("X[1]", "X[3]", "Y[1]", "Y[3]", "Z[1]", "Z[3]"));
@@ -1194,19 +1164,13 @@ TEST_F(SnippetRetrieverTest, SnippetingTestMultiLevel) {
GetString(&document, snippet.entries(0).property_name());
EXPECT_THAT(GetWindows(content, snippet.entries(0)), ElementsAre("polo"));
EXPECT_THAT(GetMatches(content, snippet.entries(0)), ElementsAre("polo"));
- if (normalizer_factory::GetNormalizerName() == MapNormalizer::kName) {
- EXPECT_THAT(GetSubMatches(content, snippet.entries(0)),
- ElementsAre("polo"));
- }
+ EXPECT_THAT(GetSubMatches(content, snippet.entries(0)), ElementsAre("polo"));
EXPECT_THAT(snippet.entries(1).property_name(), Eq("A.X[3]"));
content = GetString(&document, snippet.entries(1).property_name());
EXPECT_THAT(GetWindows(content, snippet.entries(1)), ElementsAre("polo"));
EXPECT_THAT(GetMatches(content, snippet.entries(1)), ElementsAre("polo"));
- if (normalizer_factory::GetNormalizerName() == MapNormalizer::kName) {
- EXPECT_THAT(GetSubMatches(content, snippet.entries(1)),
- ElementsAre("polo"));
- }
+ EXPECT_THAT(GetSubMatches(content, snippet.entries(1)), ElementsAre("polo"));
EXPECT_THAT(
GetPropertyPaths(snippet),
@@ -1309,19 +1273,13 @@ TEST_F(SnippetRetrieverTest, SnippetingTestMultiLevelRepeated) {
GetString(&document, snippet.entries(0).property_name());
EXPECT_THAT(GetWindows(content, snippet.entries(0)), ElementsAre("polo"));
EXPECT_THAT(GetMatches(content, snippet.entries(0)), ElementsAre("polo"));
- if (normalizer_factory::GetNormalizerName() == MapNormalizer::kName) {
- EXPECT_THAT(GetSubMatches(content, snippet.entries(0)),
- ElementsAre("polo"));
- }
+ EXPECT_THAT(GetSubMatches(content, snippet.entries(0)), ElementsAre("polo"));
EXPECT_THAT(snippet.entries(1).property_name(), Eq("A[0].X[3]"));
content = GetString(&document, snippet.entries(1).property_name());
EXPECT_THAT(GetWindows(content, snippet.entries(1)), ElementsAre("polo"));
EXPECT_THAT(GetMatches(content, snippet.entries(1)), ElementsAre("polo"));
- if (normalizer_factory::GetNormalizerName() == MapNormalizer::kName) {
- EXPECT_THAT(GetSubMatches(content, snippet.entries(1)),
- ElementsAre("polo"));
- }
+ EXPECT_THAT(GetSubMatches(content, snippet.entries(1)), ElementsAre("polo"));
EXPECT_THAT(GetPropertyPaths(snippet),
ElementsAre("A[0].X[1]", "A[0].X[3]", "A[1].X[1]", "A[1].X[3]",
@@ -1422,19 +1380,13 @@ TEST_F(SnippetRetrieverTest, SnippetingTestMultiLevelSingleValue) {
GetString(&document, snippet.entries(0).property_name());
EXPECT_THAT(GetWindows(content, snippet.entries(0)), ElementsAre("polo"));
EXPECT_THAT(GetMatches(content, snippet.entries(0)), ElementsAre("polo"));
- if (normalizer_factory::GetNormalizerName() == MapNormalizer::kName) {
- EXPECT_THAT(GetSubMatches(content, snippet.entries(0)),
- ElementsAre("polo"));
- }
+ EXPECT_THAT(GetSubMatches(content, snippet.entries(0)), ElementsAre("polo"));
EXPECT_THAT(snippet.entries(1).property_name(), Eq("A[1].X"));
content = GetString(&document, snippet.entries(1).property_name());
EXPECT_THAT(GetWindows(content, snippet.entries(1)), ElementsAre("polo"));
EXPECT_THAT(GetMatches(content, snippet.entries(1)), ElementsAre("polo"));
- if (normalizer_factory::GetNormalizerName() == MapNormalizer::kName) {
- EXPECT_THAT(GetSubMatches(content, snippet.entries(1)),
- ElementsAre("polo"));
- }
+ EXPECT_THAT(GetSubMatches(content, snippet.entries(1)), ElementsAre("polo"));
EXPECT_THAT(
GetPropertyPaths(snippet),
@@ -1478,16 +1430,12 @@ TEST_F(SnippetRetrieverTest, CJKSnippetMatchTest) {
// Ensure that the match is correct.
EXPECT_THAT(GetMatches(content, *entry), ElementsAre("走路"));
- if (normalizer_factory::GetNormalizerName() == MapNormalizer::kName) {
- EXPECT_THAT(GetSubMatches(content, *entry), ElementsAre("走"));
- }
+ EXPECT_THAT(GetSubMatches(content, *entry), ElementsAre("走"));
// Ensure that the utf-16 values are also as expected
EXPECT_THAT(match_proto.exact_match_utf16_position(), Eq(3));
EXPECT_THAT(match_proto.exact_match_utf16_length(), Eq(2));
- if (normalizer_factory::GetNormalizerName() == MapNormalizer::kName) {
- EXPECT_THAT(match_proto.submatch_utf16_length(), Eq(1));
- }
+ EXPECT_THAT(match_proto.submatch_utf16_length(), Eq(1));
}
TEST_F(SnippetRetrieverTest, CJKSnippetWindowTest) {
@@ -1587,16 +1535,12 @@ TEST_F(SnippetRetrieverTest, Utf16MultiCodeUnitSnippetMatchTest) {
// Ensure that the match is correct.
EXPECT_THAT(GetMatches(content, *entry), ElementsAre("𐀂𐀃"));
- if (normalizer_factory::GetNormalizerName() == MapNormalizer::kName) {
- EXPECT_THAT(GetSubMatches(content, *entry), ElementsAre("𐀂"));
- }
+ EXPECT_THAT(GetSubMatches(content, *entry), ElementsAre("𐀂"));
// Ensure that the utf-16 values are also as expected
EXPECT_THAT(match_proto.exact_match_utf16_position(), Eq(5));
EXPECT_THAT(match_proto.exact_match_utf16_length(), Eq(4));
- if (normalizer_factory::GetNormalizerName() == MapNormalizer::kName) {
- EXPECT_THAT(match_proto.submatch_utf16_length(), Eq(2));
- }
+ EXPECT_THAT(match_proto.submatch_utf16_length(), Eq(2));
}
TEST_F(SnippetRetrieverTest, Utf16MultiCodeUnitWindowTest) {
diff --git a/icing/tokenization/icu/icu-language-segmenter.cc b/icing/tokenization/icu/icu-language-segmenter.cc
index cb31441..598ede7 100644
--- a/icing/tokenization/icu/icu-language-segmenter.cc
+++ b/icing/tokenization/icu/icu-language-segmenter.cc
@@ -300,9 +300,10 @@ class IcuLanguageSegmenterIterator : public LanguageSegmenter::Iterator {
UChar32 uchar32 = i18n_utils::GetUChar32At(text_.data(), text_.length(),
term_start_index_);
- // Rule 2: for non-ASCII terms, only the alphabetic terms are returned.
- // We know it's an alphabetic term by checking the first unicode character.
- if (u_isUAlphabetic(uchar32)) {
+ // Rule 2: for non-ASCII terms, only the alphanumeric terms are returned.
+ // We know it's an alphanumeric term by checking the first unicode
+ // character.
+ if (i18n_utils::IsAlphaNumeric(uchar32)) {
return true;
}
return false;
diff --git a/icing/tokenization/icu/icu-language-segmenter_test.cc b/icing/tokenization/icu/icu-language-segmenter_test.cc
index 01eb7d8..3090087 100644
--- a/icing/tokenization/icu/icu-language-segmenter_test.cc
+++ b/icing/tokenization/icu/icu-language-segmenter_test.cc
@@ -372,6 +372,15 @@ TEST_P(IcuLanguageSegmenterAllLocalesTest, Number) {
IsOkAndHolds(ElementsAre("-", "123")));
}
+TEST_P(IcuLanguageSegmenterAllLocalesTest, FullWidthNumbers) {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ auto language_segmenter,
+ language_segmenter_factory::Create(
+ GetSegmenterOptions(GetLocale(), jni_cache_.get())));
+ EXPECT_THAT(language_segmenter->GetAllTerms("０１２３４５６７８９"),
+ IsOkAndHolds(ElementsAre("０１２３４５６７８９")));
+}
+
TEST_P(IcuLanguageSegmenterAllLocalesTest, ContinuousWhitespaces) {
ICING_ASSERT_OK_AND_ASSIGN(
auto language_segmenter,
diff --git a/icing/tokenization/reverse_jni/reverse-jni-language-segmenter.cc b/icing/tokenization/reverse_jni/reverse-jni-language-segmenter.cc
index 76219b5..b936f2b 100644
--- a/icing/tokenization/reverse_jni/reverse-jni-language-segmenter.cc
+++ b/icing/tokenization/reverse_jni/reverse-jni-language-segmenter.cc
@@ -291,9 +291,12 @@ class ReverseJniLanguageSegmenterIterator : public LanguageSegmenter::Iterator {
return true;
}
- // Rule 2: for non-ASCII terms, only the alphabetic terms are returned.
- // We know it's an alphabetic term by checking the first unicode character.
- if (i18n_utils::IsAlphabeticAt(text_, term_start_.utf8_index())) {
+ UChar32 uchar32 = i18n_utils::GetUChar32At(text_.data(), text_.length(),
+ term_start_.utf8_index());
+ // Rule 2: for non-ASCII terms, only the alphanumeric terms are returned.
+ // We know it's an alphanumeric term by checking the first unicode
+ // character.
+ if (i18n_utils::IsAlphaNumeric(uchar32)) {
return true;
}
return false;
diff --git a/icing/tokenization/reverse_jni/reverse-jni-language-segmenter_test.cc b/icing/tokenization/reverse_jni/reverse-jni-language-segmenter_test.cc
index b1a8f72..45d6475 100644
--- a/icing/tokenization/reverse_jni/reverse-jni-language-segmenter_test.cc
+++ b/icing/tokenization/reverse_jni/reverse-jni-language-segmenter_test.cc
@@ -366,6 +366,17 @@ TEST_P(ReverseJniLanguageSegmenterTest, Number) {
IsOkAndHolds(ElementsAre("-", "123")));
}
+TEST_P(ReverseJniLanguageSegmenterTest, FullWidthNumbers) {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ auto language_segmenter,
+ language_segmenter_factory::Create(
+ GetSegmenterOptions(GetLocale(), jni_cache_.get())));
+
+ EXPECT_THAT(language_segmenter->GetAllTerms("０１２３４５６７８９"),
+ IsOkAndHolds(ElementsAre("０", "１", "２", "３", "４", "５", "６",
+ "７", "８", "９")));
+}
+
TEST_P(ReverseJniLanguageSegmenterTest, ContinuousWhitespaces) {
ICING_ASSERT_OK_AND_ASSIGN(
auto language_segmenter,
diff --git a/icing/transform/icu/icu-normalizer-factory.cc b/icing/transform/icu/icu-normalizer-factory.cc
index 9951325..493aeb5 100644
--- a/icing/transform/icu/icu-normalizer-factory.cc
+++ b/icing/transform/icu/icu-normalizer-factory.cc
@@ -44,8 +44,6 @@ libtextclassifier3::StatusOr<std::unique_ptr<Normalizer>> Create(
return IcuNormalizer::Create(max_term_byte_size);
}
-std::string_view GetNormalizerName() { return IcuNormalizer::kName; }
-
} // namespace normalizer_factory
} // namespace lib
diff --git a/icing/transform/icu/icu-normalizer.cc b/icing/transform/icu/icu-normalizer.cc
index eb0eead..250d6cf 100644
--- a/icing/transform/icu/icu-normalizer.cc
+++ b/icing/transform/icu/icu-normalizer.cc
@@ -29,6 +29,7 @@
#include "icing/util/status-macros.h"
#include "unicode/umachine.h"
#include "unicode/unorm2.h"
+#include "unicode/ustring.h"
#include "unicode/utrans.h"
namespace icing {
@@ -157,14 +158,18 @@ std::string IcuNormalizer::NormalizeLatin(const UNormalizer2* normalizer2,
const std::string_view term) const {
std::string result;
result.reserve(term.length());
- for (int i = 0; i < term.length(); i++) {
- if (i18n_utils::IsAscii(term[i])) {
- result.push_back(std::tolower(term[i]));
- } else if (i18n_utils::IsLeadUtf8Byte(term[i])) {
- UChar32 uchar32 = i18n_utils::GetUChar32At(term.data(), term.length(), i);
+ int current_pos = 0;
+ while (current_pos < term.length()) {
+ if (i18n_utils::IsAscii(term[current_pos])) {
+ result.push_back(std::tolower(term[current_pos]));
+ ++current_pos;
+ } else {
+ UChar32 uchar32 =
+ i18n_utils::GetUChar32At(term.data(), term.length(), current_pos);
if (uchar32 == i18n_utils::kInvalidUChar32) {
ICING_LOG(WARNING) << "Unable to get uchar32 from " << term
- << " at position" << i;
+ << " at position" << current_pos;
+ // i18n_utils::GetUtf8Length() returns 0 for an invalid code point, so
+ // advancing by it would leave current_pos unchanged and this loop would
+ // never terminate on malformed UTF-8. Skip exactly one byte instead, which
+ // is what the previous for-loop did via i++.
+ ++current_pos;
continue;
}
char ascii_char;
@@ -177,8 +182,9 @@ std::string IcuNormalizer::NormalizeLatin(const UNormalizer2* normalizer2,
// tokenized. We handle it here in case there're something wrong with
// the tokenizers.
int utf8_length = i18n_utils::GetUtf8Length(uchar32);
- absl_ports::StrAppend(&result, term.substr(i, utf8_length));
+ absl_ports::StrAppend(&result, term.substr(current_pos, utf8_length));
}
+ current_pos += i18n_utils::GetUtf8Length(uchar32);
}
}
@@ -261,5 +267,103 @@ std::string IcuNormalizer::TermTransformer::Transform(
return std::move(utf8_term_or).ValueOrDie();
}
+CharacterIterator FindNormalizedLatinMatchEndPosition(
+ const UNormalizer2* normalizer2, std::string_view term,
+ CharacterIterator char_itr, std::string_view normalized_term) {
+ CharacterIterator normalized_char_itr(normalized_term);
+ char ascii_char;
+ while (char_itr.utf8_index() < term.length() &&
+ normalized_char_itr.utf8_index() < normalized_term.length()) {
+ UChar32 c = char_itr.GetCurrentChar();
+ if (i18n_utils::IsAscii(c)) {
+ c = std::tolower(c);
+ } else if (DiacriticCharToAscii(normalizer2, c, &ascii_char)) {
+ c = ascii_char;
+ }
+ UChar32 normalized_c = normalized_char_itr.GetCurrentChar();
+ if (c != normalized_c) {
+ return char_itr;
+ }
+ char_itr.AdvanceToUtf32(char_itr.utf32_index() + 1);
+ normalized_char_itr.AdvanceToUtf32(normalized_char_itr.utf32_index() + 1);
+ }
+ return char_itr;
+}
+
+CharacterIterator
+IcuNormalizer::TermTransformer::FindNormalizedNonLatinMatchEndPosition(
+ std::string_view term, CharacterIterator char_itr,
+ std::string_view normalized_term) const {
+ CharacterIterator normalized_char_itr(normalized_term);
+ UErrorCode status = U_ZERO_ERROR;
+
+ constexpr int kUtf16CharBufferLength = 6;
+ UChar c16[kUtf16CharBufferLength];
+ int32_t c16_length;
+ int32_t limit;
+
+ constexpr int kUtf32CharBufferLength = 3;
+ UChar32 normalized_buffer[kUtf32CharBufferLength];
+ int32_t c32_length;
+ while (char_itr.utf8_index() < term.length() &&
+ normalized_char_itr.utf8_index() < normalized_term.length()) {
+ UChar32 c = char_itr.GetCurrentChar();
+ u_strFromUTF32(c16, kUtf16CharBufferLength, &c16_length, &c,
+ /*srcLength=*/1, &status);
+ if (U_FAILURE(status)) {
+ break;
+ }
+
+ limit = c16_length;
+ utrans_transUChars(u_transliterator_, c16, &c16_length,
+ kUtf16CharBufferLength,
+ /*start=*/0, &limit, &status);
+ if (U_FAILURE(status)) {
+ break;
+ }
+
+ u_strToUTF32(normalized_buffer, kUtf32CharBufferLength, &c32_length, c16,
+ c16_length, &status);
+ if (U_FAILURE(status)) {
+ break;
+ }
+
+ for (int i = 0; i < c32_length; ++i) {
+ UChar32 normalized_c = normalized_char_itr.GetCurrentChar();
+ if (normalized_buffer[i] != normalized_c) {
+ return char_itr;
+ }
+ normalized_char_itr.AdvanceToUtf32(normalized_char_itr.utf32_index() + 1);
+ }
+ char_itr.AdvanceToUtf32(char_itr.utf32_index() + 1);
+ }
+ if (U_FAILURE(status)) {
+ // A character failed to transform; stop here and return the position
+ // reached so far.
+ ICING_LOG(WARNING) << "Failed to normalize UTF8 term: " << term;
+ }
+ return char_itr;
+}
+
+CharacterIterator IcuNormalizer::FindNormalizedMatchEndPosition(
+ std::string_view term, std::string_view normalized_term) const {
+ UErrorCode status = U_ZERO_ERROR;
+ // ICU manages the singleton instance
+ const UNormalizer2* normalizer2 = unorm2_getNFCInstance(&status);
+ if (U_FAILURE(status)) {
+ ICING_LOG(WARNING) << "Failed to create a UNormalizer2 instance";
+ }
+
+ CharacterIterator char_itr(term);
+ UChar32 first_uchar32 = char_itr.GetCurrentChar();
+ if (normalizer2 != nullptr && first_uchar32 != i18n_utils::kInvalidUChar32 &&
+ DiacriticCharToAscii(normalizer2, first_uchar32, /*char_out=*/nullptr)) {
+ return FindNormalizedLatinMatchEndPosition(normalizer2, term, char_itr,
+ normalized_term);
+ } else {
+ return term_transformer_->FindNormalizedNonLatinMatchEndPosition(
+ term, char_itr, normalized_term);
+ }
+}
+
} // namespace lib
} // namespace icing
diff --git a/icing/transform/icu/icu-normalizer.h b/icing/transform/icu/icu-normalizer.h
index 4442f3b..d4f1ebd 100644
--- a/icing/transform/icu/icu-normalizer.h
+++ b/icing/transform/icu/icu-normalizer.h
@@ -21,6 +21,7 @@
#include "icing/text_classifier/lib3/utils/base/statusor.h"
#include "icing/transform/normalizer.h"
+#include "icing/util/character-iterator.h"
#include "unicode/unorm2.h"
#include "unicode/utrans.h"
@@ -39,8 +40,6 @@ namespace lib {
// details.
class IcuNormalizer : public Normalizer {
public:
- static constexpr std::string_view kName = "IcuNormalizer";
-
// Creates a normalizer with the subcomponents it needs. max_term_byte_size
// enforces the max size of text after normalization, text will be truncated
// if exceeds the max size.
@@ -58,6 +57,17 @@ class IcuNormalizer : public Normalizer {
// result in the non-Latin characters not properly being normalized
std::string NormalizeTerm(std::string_view term) const override;
+ // Returns a CharacterIterator pointing to one past the end of the segment of
+ // term that (once normalized) matches with normalized_term.
+ //
+ // Ex. FindNormalizedMatchEndPosition("YELLOW", "yell") will return
+ // CharacterIterator(u8:4, u16:4, u32:4).
+ //
+ // Ex. FindNormalizedMatchEndPosition("YELLOW", "red") will return
+ // CharacterIterator(u8:0, u16:0, u32:0).
+ CharacterIterator FindNormalizedMatchEndPosition(
+ std::string_view term, std::string_view normalized_term) const override;
+
private:
// A handler class that helps manage the lifecycle of UTransliterator. It's
// used in IcuNormalizer to transform terms into the formats we need.
@@ -77,6 +87,12 @@ class IcuNormalizer : public Normalizer {
// Transforms the text based on our rules described at top of this file
std::string Transform(std::string_view term) const;
+ // Returns a CharacterIterator pointing to one past the end of the segment
+ // of a non-latin term that (once normalized) matches with normalized_term.
+ CharacterIterator FindNormalizedNonLatinMatchEndPosition(
+ std::string_view term, CharacterIterator char_itr,
+ std::string_view normalized_term) const;
+
private:
explicit TermTransformer(UTransliterator* u_transliterator);
diff --git a/icing/transform/icu/icu-normalizer_benchmark.cc b/icing/transform/icu/icu-normalizer_benchmark.cc
index b037538..8d09be2 100644
--- a/icing/transform/icu/icu-normalizer_benchmark.cc
+++ b/icing/transform/icu/icu-normalizer_benchmark.cc
@@ -161,6 +161,124 @@ BENCHMARK(BM_NormalizeHiragana)
->Arg(2048000)
->Arg(4096000);
+void BM_UppercaseSubTokenLength(benchmark::State& state) {
+ bool run_via_adb = absl::GetFlag(FLAGS_adb);
+ if (!run_via_adb) {
+ ICING_ASSERT_OK(icu_data_file_helper::SetUpICUDataFile(
+ GetTestFilePath("icing/icu.dat")));
+ }
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<Normalizer> normalizer,
+ normalizer_factory::Create(
+
+ /*max_term_byte_size=*/std::numeric_limits<int>::max()));
+
+ std::string input_string(state.range(0), 'A');
+ std::string normalized_input_string(state.range(0), 'a');
+ for (auto _ : state) {
+ normalizer->FindNormalizedMatchEndPosition(input_string,
+ normalized_input_string);
+ }
+}
+BENCHMARK(BM_UppercaseSubTokenLength)
+ ->Arg(1000)
+ ->Arg(2000)
+ ->Arg(4000)
+ ->Arg(8000)
+ ->Arg(16000)
+ ->Arg(32000)
+ ->Arg(64000)
+ ->Arg(128000)
+ ->Arg(256000)
+ ->Arg(384000)
+ ->Arg(512000)
+ ->Arg(1024000)
+ ->Arg(2048000)
+ ->Arg(4096000);
+
+void BM_AccentSubTokenLength(benchmark::State& state) {
+ bool run_via_adb = absl::GetFlag(FLAGS_adb);
+ if (!run_via_adb) {
+ ICING_ASSERT_OK(icu_data_file_helper::SetUpICUDataFile(
+ GetTestFilePath("icing/icu.dat")));
+ }
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<Normalizer> normalizer,
+ normalizer_factory::Create(
+
+ /*max_term_byte_size=*/std::numeric_limits<int>::max()));
+
+ std::string input_string;
+ std::string normalized_input_string;
+ while (input_string.length() < state.range(0)) {
+ input_string.append("àáâãā");
+ normalized_input_string.append("aaaaa");
+ }
+
+ for (auto _ : state) {
+ normalizer->FindNormalizedMatchEndPosition(input_string,
+ normalized_input_string);
+ }
+}
+BENCHMARK(BM_AccentSubTokenLength)
+ ->Arg(1000)
+ ->Arg(2000)
+ ->Arg(4000)
+ ->Arg(8000)
+ ->Arg(16000)
+ ->Arg(32000)
+ ->Arg(64000)
+ ->Arg(128000)
+ ->Arg(256000)
+ ->Arg(384000)
+ ->Arg(512000)
+ ->Arg(1024000)
+ ->Arg(2048000)
+ ->Arg(4096000);
+
+void BM_HiraganaSubTokenLength(benchmark::State& state) {
+ bool run_via_adb = absl::GetFlag(FLAGS_adb);
+ if (!run_via_adb) {
+ ICING_ASSERT_OK(icu_data_file_helper::SetUpICUDataFile(
+ GetTestFilePath("icing/icu.dat")));
+ }
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<Normalizer> normalizer,
+ normalizer_factory::Create(
+
+ /*max_term_byte_size=*/std::numeric_limits<int>::max()));
+
+ std::string input_string;
+ std::string normalized_input_string;
+ while (input_string.length() < state.range(0)) {
+ input_string.append("あいうえお");
+ normalized_input_string.append("アイウエオ");
+ }
+
+ for (auto _ : state) {
+ normalizer->FindNormalizedMatchEndPosition(input_string,
+ normalized_input_string);
+ }
+}
+BENCHMARK(BM_HiraganaSubTokenLength)
+ ->Arg(1000)
+ ->Arg(2000)
+ ->Arg(4000)
+ ->Arg(8000)
+ ->Arg(16000)
+ ->Arg(32000)
+ ->Arg(64000)
+ ->Arg(128000)
+ ->Arg(256000)
+ ->Arg(384000)
+ ->Arg(512000)
+ ->Arg(1024000)
+ ->Arg(2048000)
+ ->Arg(4096000);
+
} // namespace
} // namespace lib
diff --git a/icing/transform/icu/icu-normalizer_test.cc b/icing/transform/icu/icu-normalizer_test.cc
index f5d20ff..a46fcc7 100644
--- a/icing/transform/icu/icu-normalizer_test.cc
+++ b/icing/transform/icu/icu-normalizer_test.cc
@@ -231,6 +231,104 @@ TEST_F(IcuNormalizerTest, Truncate) {
}
}
+TEST_F(IcuNormalizerTest, PrefixMatchLength) {
+ // Verify that FindNormalizedMatchEndPosition properly finds the end of the
+ // prefix match when given a non-normalized term and a normalized term that
+ // is a prefix of the non-normalized term's normalized form.
+ ICING_ASSERT_OK_AND_ASSIGN(auto normalizer, normalizer_factory::Create(
+ /*max_term_byte_size=*/1000));
+
+ // Upper to lower
+ std::string term = "MDI";
+ CharacterIterator match_end =
+ normalizer->FindNormalizedMatchEndPosition(term, "md");
+ EXPECT_THAT(term.substr(0, match_end.utf8_index()), Eq("MD"));
+
+ term = "Icing";
+ match_end = normalizer->FindNormalizedMatchEndPosition(term, "icin");
+ EXPECT_THAT(term.substr(0, match_end.utf8_index()), Eq("Icin"));
+
+ // Full-width
+ term = "５２５６００";
+ match_end = normalizer->FindNormalizedMatchEndPosition(term, "525");
+ EXPECT_THAT(term.substr(0, match_end.utf8_index()), Eq("５２５"));
+
+ term = "ＦＵＬＬＷＩＤＴＨ";
+ match_end = normalizer->FindNormalizedMatchEndPosition(term, "full");
+ EXPECT_THAT(term.substr(0, match_end.utf8_index()), Eq("ＦＵＬＬ"));
+
+ // Hiragana to Katakana
+ term = "あいうえお";
+ match_end = normalizer->FindNormalizedMatchEndPosition(term, "アイ");
+ EXPECT_THAT(term.substr(0, match_end.utf8_index()), Eq("あい"));
+
+ term = "かきくけこ";
+ match_end = normalizer->FindNormalizedMatchEndPosition(term, "カ");
+ EXPECT_THAT(term.substr(0, match_end.utf8_index()), Eq("か"));
+
+ // Latin accents
+ term = "Zürich";
+ match_end = normalizer->FindNormalizedMatchEndPosition(term, "zur");
+ EXPECT_THAT(term.substr(0, match_end.utf8_index()), Eq("Zür"));
+
+ term = "après-midi";
+ match_end = normalizer->FindNormalizedMatchEndPosition(term, "apre");
+ EXPECT_THAT(term.substr(0, match_end.utf8_index()), Eq("aprè"));
+
+ term = "Buenos días";
+ match_end = normalizer->FindNormalizedMatchEndPosition(term, "buenos di");
+ EXPECT_THAT(term.substr(0, match_end.utf8_index()), Eq("Buenos dí"));
+}
+
+TEST_F(IcuNormalizerTest, SharedPrefixMatchLength) {
+ // Verify that FindNormalizedMatchEndPosition will properly find the length of
+ // the prefix match when given a non-normalized term and a normalized term
+ // that share a common prefix.
+ ICING_ASSERT_OK_AND_ASSIGN(auto normalizer, normalizer_factory::Create(
+ /*max_term_byte_size=*/1000));
+
+ // Upper to lower
+ std::string term = "MDI";
+ CharacterIterator match_end =
+ normalizer->FindNormalizedMatchEndPosition(term, "mgm");
+ EXPECT_THAT(term.substr(0, match_end.utf8_index()), Eq("M"));
+
+ term = "Icing";
+ match_end = normalizer->FindNormalizedMatchEndPosition(term, "icky");
+ EXPECT_THAT(term.substr(0, match_end.utf8_index()), Eq("Ic"));
+
+ // Full-width
+ term = "５２５６００";
+ match_end = normalizer->FindNormalizedMatchEndPosition(term, "525788");
+ EXPECT_THAT(term.substr(0, match_end.utf8_index()), Eq("５２５"));
+
+ term = "ＦＵＬＬＷＩＤＴＨ";
+ match_end = normalizer->FindNormalizedMatchEndPosition(term, "fully");
+ EXPECT_THAT(term.substr(0, match_end.utf8_index()), Eq("ＦＵＬＬ"));
+
+ // Hiragana to Katakana
+ term = "あいうえお";
+ match_end = normalizer->FindNormalizedMatchEndPosition(term, "アイエオ");
+ EXPECT_THAT(term.substr(0, match_end.utf8_index()), Eq("あい"));
+
+ term = "かきくけこ";
+ match_end = normalizer->FindNormalizedMatchEndPosition(term, "カケコ");
+ EXPECT_THAT(term.substr(0, match_end.utf8_index()), Eq("か"));
+
+ // Latin accents
+ term = "Zürich";
+ match_end = normalizer->FindNormalizedMatchEndPosition(term, "zurg");
+ EXPECT_THAT(term.substr(0, match_end.utf8_index()), Eq("Zür"));
+
+ term = "après-midi";
+ match_end = normalizer->FindNormalizedMatchEndPosition(term, "apreciate");
+ EXPECT_THAT(term.substr(0, match_end.utf8_index()), Eq("aprè"));
+
+ term = "días";
+ match_end = normalizer->FindNormalizedMatchEndPosition(term, "diamond");
+ EXPECT_THAT(term.substr(0, match_end.utf8_index()), Eq("día"));
+}
+
} // namespace
} // namespace lib
} // namespace icing
diff --git a/icing/transform/map/map-normalizer-factory.cc b/icing/transform/map/map-normalizer-factory.cc
index 286b8f6..3bf84b3 100644
--- a/icing/transform/map/map-normalizer-factory.cc
+++ b/icing/transform/map/map-normalizer-factory.cc
@@ -42,8 +42,6 @@ libtextclassifier3::StatusOr<std::unique_ptr<Normalizer>> Create(
return std::make_unique<MapNormalizer>(max_term_byte_size);
}
-std::string_view GetNormalizerName() { return MapNormalizer::kName; }
-
} // namespace normalizer_factory
} // namespace lib
diff --git a/icing/transform/map/map-normalizer.cc b/icing/transform/map/map-normalizer.cc
index 4ad5dec..95aa633 100644
--- a/icing/transform/map/map-normalizer.cc
+++ b/icing/transform/map/map-normalizer.cc
@@ -42,10 +42,16 @@ UChar32 NormalizeChar(UChar32 c) {
}
// The original character can be encoded into a single char16_t.
- const std::unordered_map<char16_t, char16_t>& normalization_map =
+ const std::unordered_map<char16_t, char16_t>* normalization_map =
GetNormalizationMap();
- auto iterator = normalization_map.find(static_cast<char16_t>(c));
- if (iterator == normalization_map.end()) {
+ if (normalization_map == nullptr) {
+ // Normalization map couldn't be properly initialized, append the original
+ // character.
+ ICING_LOG(WARNING) << "Unable to get a valid pointer to normalization map!";
+ return c;
+ }
+ auto iterator = normalization_map->find(static_cast<char16_t>(c));
+ if (iterator == normalization_map->end()) {
// Normalization mapping not found, append the original character.
return c;
}
@@ -99,7 +105,7 @@ std::string MapNormalizer::NormalizeTerm(std::string_view term) const {
return normalized_text;
}
-CharacterIterator MapNormalizer::CalculateNormalizedMatchLength(
+CharacterIterator MapNormalizer::FindNormalizedMatchEndPosition(
std::string_view term, std::string_view normalized_term) const {
CharacterIterator char_itr(term);
CharacterIterator normalized_char_itr(normalized_term);
diff --git a/icing/transform/map/map-normalizer.h b/icing/transform/map/map-normalizer.h
index 8fbe83b..ed996ae 100644
--- a/icing/transform/map/map-normalizer.h
+++ b/icing/transform/map/map-normalizer.h
@@ -26,8 +26,6 @@ namespace lib {
class MapNormalizer : public Normalizer {
public:
- static constexpr std::string_view kName = "MapNormalizer";
-
explicit MapNormalizer(int max_term_byte_size)
: max_term_byte_size_(max_term_byte_size){};
@@ -45,12 +43,12 @@ class MapNormalizer : public Normalizer {
// Returns a CharacterIterator pointing to one past the end of the segment of
// term that (once normalized) matches with normalized_term.
//
- // Ex. CalculateNormalizedMatchLength("YELLOW", "yell") will return
+ // Ex. FindNormalizedMatchEndPosition("YELLOW", "yell") will return
// CharacterIterator(u8:4, u16:4, u32:4).
//
- // Ex. CalculateNormalizedMatchLength("YELLOW", "red") will return
+ // Ex. FindNormalizedMatchEndPosition("YELLOW", "red") will return
// CharacterIterator(u8:0, u16:0, u32:0).
- CharacterIterator CalculateNormalizedMatchLength(
+ CharacterIterator FindNormalizedMatchEndPosition(
std::string_view term, std::string_view normalized_term) const override;
private:
diff --git a/icing/transform/map/map-normalizer_benchmark.cc b/icing/transform/map/map-normalizer_benchmark.cc
index 691afc6..8268541 100644
--- a/icing/transform/map/map-normalizer_benchmark.cc
+++ b/icing/transform/map/map-normalizer_benchmark.cc
@@ -143,6 +143,104 @@ BENCHMARK(BM_NormalizeHiragana)
->Arg(2048000)
->Arg(4096000);
+void BM_UppercaseSubTokenLength(benchmark::State& state) {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<Normalizer> normalizer,
+ normalizer_factory::Create(
+
+ /*max_term_byte_size=*/std::numeric_limits<int>::max()));
+
+ std::string input_string(state.range(0), 'A');
+ std::string normalized_input_string(state.range(0), 'a');
+ for (auto _ : state) {
+ normalizer->FindNormalizedMatchEndPosition(input_string,
+ normalized_input_string);
+ }
+}
+BENCHMARK(BM_UppercaseSubTokenLength)
+ ->Arg(1000)
+ ->Arg(2000)
+ ->Arg(4000)
+ ->Arg(8000)
+ ->Arg(16000)
+ ->Arg(32000)
+ ->Arg(64000)
+ ->Arg(128000)
+ ->Arg(256000)
+ ->Arg(384000)
+ ->Arg(512000)
+ ->Arg(1024000)
+ ->Arg(2048000)
+ ->Arg(4096000);
+
+void BM_AccentSubTokenLength(benchmark::State& state) {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<Normalizer> normalizer,
+ normalizer_factory::Create(
+ /*max_term_byte_size=*/std::numeric_limits<int>::max()));
+
+ std::string input_string;
+ std::string normalized_input_string;
+ while (input_string.length() < state.range(0)) {
+ input_string.append("àáâãā");
+ normalized_input_string.append("aaaaa");
+ }
+
+ for (auto _ : state) {
+ normalizer->FindNormalizedMatchEndPosition(input_string,
+ normalized_input_string);
+ }
+}
+BENCHMARK(BM_AccentSubTokenLength)
+ ->Arg(1000)
+ ->Arg(2000)
+ ->Arg(4000)
+ ->Arg(8000)
+ ->Arg(16000)
+ ->Arg(32000)
+ ->Arg(64000)
+ ->Arg(128000)
+ ->Arg(256000)
+ ->Arg(384000)
+ ->Arg(512000)
+ ->Arg(1024000)
+ ->Arg(2048000)
+ ->Arg(4096000);
+
+void BM_HiraganaSubTokenLength(benchmark::State& state) {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<Normalizer> normalizer,
+ normalizer_factory::Create(
+ /*max_term_byte_size=*/std::numeric_limits<int>::max()));
+
+ std::string input_string;
+ std::string normalized_input_string;
+ while (input_string.length() < state.range(0)) {
+ input_string.append("あいうえお");
+ normalized_input_string.append("アイウエオ");
+ }
+
+ for (auto _ : state) {
+ normalizer->FindNormalizedMatchEndPosition(input_string,
+ normalized_input_string);
+ }
+}
+BENCHMARK(BM_HiraganaSubTokenLength)
+ ->Arg(1000)
+ ->Arg(2000)
+ ->Arg(4000)
+ ->Arg(8000)
+ ->Arg(16000)
+ ->Arg(32000)
+ ->Arg(64000)
+ ->Arg(128000)
+ ->Arg(256000)
+ ->Arg(384000)
+ ->Arg(512000)
+ ->Arg(1024000)
+ ->Arg(2048000)
+ ->Arg(4096000);
+
} // namespace
} // namespace lib
diff --git a/icing/transform/map/map-normalizer_test.cc b/icing/transform/map/map-normalizer_test.cc
index 26fdd4a..adc5623 100644
--- a/icing/transform/map/map-normalizer_test.cc
+++ b/icing/transform/map/map-normalizer_test.cc
@@ -201,51 +201,103 @@ TEST(MapNormalizerTest, Truncate) {
}
TEST(MapNormalizerTest, PrefixMatchLength) {
+ // Verify that FindNormalizedMatchEndPosition properly finds the end of the
+ // prefix match when given a non-normalized term and a normalized term that
+ // is a prefix of the non-normalized term's normalized form.
ICING_ASSERT_OK_AND_ASSIGN(auto normalizer, normalizer_factory::Create(
/*max_term_byte_size=*/1000));
// Upper to lower
std::string term = "MDI";
CharacterIterator match_end =
- normalizer->CalculateNormalizedMatchLength(term, "md");
+ normalizer->FindNormalizedMatchEndPosition(term, "md");
EXPECT_THAT(term.substr(0, match_end.utf8_index()), Eq("MD"));
term = "Icing";
- match_end = normalizer->CalculateNormalizedMatchLength(term, "icin");
+ match_end = normalizer->FindNormalizedMatchEndPosition(term, "icin");
EXPECT_THAT(term.substr(0, match_end.utf8_index()), Eq("Icin"));
// Full-width
term = "５２５６００";
- match_end = normalizer->CalculateNormalizedMatchLength(term, "525");
+ match_end = normalizer->FindNormalizedMatchEndPosition(term, "525");
EXPECT_THAT(term.substr(0, match_end.utf8_index()), Eq("５２５"));
term = "ＦＵＬＬＷＩＤＴＨ";
- match_end = normalizer->CalculateNormalizedMatchLength(term, "full");
+ match_end = normalizer->FindNormalizedMatchEndPosition(term, "full");
EXPECT_THAT(term.substr(0, match_end.utf8_index()), Eq("ＦＵＬＬ"));
// Hiragana to Katakana
term = "あいうえお";
- match_end = normalizer->CalculateNormalizedMatchLength(term, "アイ");
+ match_end = normalizer->FindNormalizedMatchEndPosition(term, "アイ");
EXPECT_THAT(term.substr(0, match_end.utf8_index()), Eq("あい"));
term = "かきくけこ";
- match_end = normalizer->CalculateNormalizedMatchLength(term, "カ");
+ match_end = normalizer->FindNormalizedMatchEndPosition(term, "カ");
EXPECT_THAT(term.substr(0, match_end.utf8_index()), Eq("か"));
// Latin accents
term = "Zürich";
- match_end = normalizer->CalculateNormalizedMatchLength(term, "zur");
+ match_end = normalizer->FindNormalizedMatchEndPosition(term, "zur");
EXPECT_THAT(term.substr(0, match_end.utf8_index()), Eq("Zür"));
term = "après-midi";
- match_end = normalizer->CalculateNormalizedMatchLength(term, "apre");
+ match_end = normalizer->FindNormalizedMatchEndPosition(term, "apre");
EXPECT_THAT(term.substr(0, match_end.utf8_index()), Eq("aprè"));
term = "Buenos días";
- match_end = normalizer->CalculateNormalizedMatchLength(term, "buenos di");
+ match_end = normalizer->FindNormalizedMatchEndPosition(term, "buenos di");
EXPECT_THAT(term.substr(0, match_end.utf8_index()), Eq("Buenos dí"));
}
+TEST(MapNormalizerTest, SharedPrefixMatchLength) {
+ // Verify that FindNormalizedMatchEndPosition will properly find the length of
+ // the prefix match when given a non-normalized term and a normalized term
+ // that share a common prefix.
+ ICING_ASSERT_OK_AND_ASSIGN(auto normalizer, normalizer_factory::Create(
+ /*max_term_byte_size=*/1000));
+
+ // Upper to lower
+ std::string term = "MDI";
+ CharacterIterator match_end =
+ normalizer->FindNormalizedMatchEndPosition(term, "mgm");
+ EXPECT_THAT(term.substr(0, match_end.utf8_index()), Eq("M"));
+
+ term = "Icing";
+ match_end = normalizer->FindNormalizedMatchEndPosition(term, "icky");
+ EXPECT_THAT(term.substr(0, match_end.utf8_index()), Eq("Ic"));
+
+ // Full-width
+ term = "５２５６００";
+ match_end = normalizer->FindNormalizedMatchEndPosition(term, "525788");
+ EXPECT_THAT(term.substr(0, match_end.utf8_index()), Eq("５２５"));
+
+ term = "ＦＵＬＬＷＩＤＴＨ";
+ match_end = normalizer->FindNormalizedMatchEndPosition(term, "fully");
+ EXPECT_THAT(term.substr(0, match_end.utf8_index()), Eq("ＦＵＬＬ"));
+
+ // Hiragana to Katakana
+ term = "あいうえお";
+ match_end = normalizer->FindNormalizedMatchEndPosition(term, "アイエオ");
+ EXPECT_THAT(term.substr(0, match_end.utf8_index()), Eq("あい"));
+
+ term = "かきくけこ";
+ match_end = normalizer->FindNormalizedMatchEndPosition(term, "カケコ");
+ EXPECT_THAT(term.substr(0, match_end.utf8_index()), Eq("か"));
+
+ // Latin accents
+ term = "Zürich";
+ match_end = normalizer->FindNormalizedMatchEndPosition(term, "zurg");
+ EXPECT_THAT(term.substr(0, match_end.utf8_index()), Eq("Zür"));
+
+ term = "après-midi";
+ match_end = normalizer->FindNormalizedMatchEndPosition(term, "apreciate");
+ EXPECT_THAT(term.substr(0, match_end.utf8_index()), Eq("aprè"));
+
+ term = "días";
+ match_end = normalizer->FindNormalizedMatchEndPosition(term, "diamond");
+ EXPECT_THAT(term.substr(0, match_end.utf8_index()), Eq("día"));
+}
+
} // namespace
} // namespace lib
diff --git a/icing/transform/map/normalization-map.cc b/icing/transform/map/normalization-map.cc
index c318036..0994ab8 100644
--- a/icing/transform/map/normalization-map.cc
+++ b/icing/transform/map/normalization-map.cc
@@ -691,19 +691,21 @@ constexpr NormalizationPair kNormalizationMappings[] = {
} // namespace
-const std::unordered_map<char16_t, char16_t>& GetNormalizationMap() {
+const std::unordered_map<char16_t, char16_t> *GetNormalizationMap() {
// The map is allocated dynamically the first time this function is executed.
- static const std::unordered_map<char16_t, char16_t> normalization_map = [] {
- std::unordered_map<char16_t, char16_t> map;
- // Size of all the mappings is about 2.5 KiB.
- constexpr int numMappings =
- sizeof(kNormalizationMappings) / sizeof(NormalizationPair);
- map.reserve(numMappings);
- for (size_t i = 0; i < numMappings; ++i) {
- map.emplace(kNormalizationMappings[i].from, kNormalizationMappings[i].to);
- }
- return map;
- }();
+ static const std::unordered_map<char16_t, char16_t> *const normalization_map =
+ [] {
+ auto *map = new std::unordered_map<char16_t, char16_t>();
+ // Size of all the mappings is about 2.5 KiB.
+ constexpr int numMappings =
+ sizeof(kNormalizationMappings) / sizeof(NormalizationPair);
+ map->reserve(numMappings);
+ for (size_t i = 0; i < numMappings; ++i) {
+ map->emplace(kNormalizationMappings[i].from,
+ kNormalizationMappings[i].to);
+ }
+ return map;
+ }();
return normalization_map;
}
diff --git a/icing/transform/map/normalization-map.h b/icing/transform/map/normalization-map.h
index aea85bd..ac7872b 100644
--- a/icing/transform/map/normalization-map.h
+++ b/icing/transform/map/normalization-map.h
@@ -23,7 +23,7 @@ namespace lib {
// Returns a map containing normalization mappings. A mapping (A -> B) means
// that we'll transform every character 'A' into 'B'. See normalization-map.cc
// for mapping details.
-const std::unordered_map<char16_t, char16_t>& GetNormalizationMap();
+const std::unordered_map<char16_t, char16_t>* GetNormalizationMap();
} // namespace lib
} // namespace icing
diff --git a/icing/transform/normalizer-factory.h b/icing/transform/normalizer-factory.h
index 1db9915..f1f3f62 100644
--- a/icing/transform/normalizer-factory.h
+++ b/icing/transform/normalizer-factory.h
@@ -36,9 +36,6 @@ namespace normalizer_factory {
libtextclassifier3::StatusOr<std::unique_ptr<Normalizer>> Create(
int max_term_byte_size);
-// Returns the name of the normalizer being used.
-std::string_view GetNormalizerName();
-
} // namespace normalizer_factory
} // namespace lib
diff --git a/icing/transform/normalizer.h b/icing/transform/normalizer.h
index 7305c46..2110f0f 100644
--- a/icing/transform/normalizer.h
+++ b/icing/transform/normalizer.h
@@ -44,17 +44,13 @@ class Normalizer {
// Returns a CharacterIterator pointing to one past the end of the segment of
// term that (once normalized) matches with normalized_term.
//
- // Ex. CalculateNormalizedMatchLength("YELLOW", "yell") will return
+ // Ex. FindNormalizedMatchEndPosition("YELLOW", "yell") will return
// CharacterIterator(u8:4, u16:4, u32:4).
//
- // Ex. CalculateNormalizedMatchLength("YELLOW", "red") will return
+ // Ex. FindNormalizedMatchEndPosition("YELLOW", "red") will return
// CharacterIterator(u8:0, u16:0, u32:0).
- virtual CharacterIterator CalculateNormalizedMatchLength(
- std::string_view term, std::string_view normalized_term) const {
- // TODO(b/195720764) Remove this default impl and implement in all
- // subclasses.
- return CharacterIterator(term, 0, 0, 0);
- }
+ virtual CharacterIterator FindNormalizedMatchEndPosition(
+ std::string_view term, std::string_view normalized_term) const = 0;
};
} // namespace lib
diff --git a/icing/util/i18n-utils.cc b/icing/util/i18n-utils.cc
index cd0a227..ec327ad 100644
--- a/icing/util/i18n-utils.cc
+++ b/icing/util/i18n-utils.cc
@@ -116,6 +116,8 @@ bool IsAscii(char c) { return U8_IS_SINGLE((uint8_t)c); }
bool IsAscii(UChar32 c) { return U8_LENGTH(c) == 1; }
+bool IsAlphaNumeric(UChar32 c) { return u_isalnum(c); }
+
int GetUtf8Length(UChar32 c) { return U8_LENGTH(c); }
int GetUtf16Length(UChar32 c) { return U16_LENGTH(c); }
diff --git a/icing/util/i18n-utils.h b/icing/util/i18n-utils.h
index 82ae828..491df6b 100644
--- a/icing/util/i18n-utils.h
+++ b/icing/util/i18n-utils.h
@@ -67,6 +67,9 @@ bool IsAscii(char c);
// Checks if the Unicode char is within ASCII range.
bool IsAscii(UChar32 c);
+// Checks if the Unicode char is alphanumeric.
+bool IsAlphaNumeric(UChar32 c);
+
// Returns how many code units (char) are used for the UTF-8 encoding of this
// Unicode character. Returns 0 if not valid.
int GetUtf8Length(UChar32 c);
diff --git a/synced_AOSP_CL_number.txt b/synced_AOSP_CL_number.txt
index f0c066f..d57de81 100644
--- a/synced_AOSP_CL_number.txt
+++ b/synced_AOSP_CL_number.txt
@@ -1 +1 @@
-set(synced_AOSP_CL_number=390638574)
+set(synced_AOSP_CL_number=395331611)