diff options
author | Cassie Wang <cassiewang@google.com> | 2021-05-24 13:24:37 -0700 |
---|---|---|
committer | Cassie Wang <cassiewang@google.com> | 2021-05-25 11:31:27 -0700 |
commit | 5c1c71196b646c581c04fa4c70820d1a98aec33d (patch) | |
tree | 005e9f6c4779102d7c770412b2db10ef1d77154e | |
parent | ccb46eeda90523c6c86b9b5743e625a3e6d4417e (diff) | |
download | icing-5c1c71196b646c581c04fa4c70820d1a98aec33d.tar.gz |
Pull upstream changes.
Descriptions:
==========
Create a portable file backed proto log
==========
Fix bug that was causing us to not update the last_added_document_id of
the index
==========
Remove dependency on icu dat file.
==========
Create a simple dump utility that dumps all schemas, namespaces and
documents.
==========
Fix usage score handling for nonexistent documents.
==========
Allow querying in dumper.cc.
==========
Fix bit-util to actually set fields.
==========
Bug: 185804696
Bug: 189136429
Bug: 187879464
Change-Id: I7a8f7b223884a647dc0fe4d70de99d586a6b6e33
29 files changed, 2675 insertions, 631 deletions
diff --git a/icing/file/portable-file-backed-proto-log.h b/icing/file/portable-file-backed-proto-log.h new file mode 100644 index 0000000..95c3949 --- /dev/null +++ b/icing/file/portable-file-backed-proto-log.h @@ -0,0 +1,1173 @@ +// Copyright (C) 2021 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// File-backed log of protos with append-only writes and position based reads. +// +// There should only be one instance of a PortableFileBackedProtoLog of the same +// file at a time; using multiple instances at the same time may lead to +// undefined behavior. +// +// The entire checksum is computed on initialization to verify the contents are +// valid. On failure, the log will be truncated to the last verified state when +// PersistToDisk() was called. If the log cannot successfully restore the last +// state due to disk corruption or some other inconsistency, then the entire log +// will be lost. +// +// Each proto written to the file will have a metadata written just before it. +// The metadata consists of +// { +// 1 bytes of kProtoMagic; +// 3 bytes of the proto size +// n bytes of the proto itself +// } +// +// All metadata is written in a portable format, encoded with htonl before +// writing to file and decoded with ntohl when reading from file. 
+// +// Example usage: +// ICING_ASSERT_OK_AND_ASSIGN(auto create_result, +// PortableFileBackedProtoLog<DocumentProto>::Create(filesystem, +// file_path_, +// options)); +// auto proto_log = create_result.proto_log; +// +// Document document; +// document.set_namespace("com.google.android.example"); +// document.set_uri("www.google.com"); +// +// int64_t document_offset = proto_log->WriteProto(document)); +// Document same_document = proto_log->ReadProto(document_offset)); +// proto_log->PersistToDisk(); + +#ifndef ICING_FILE_PORTABLE_FILE_BACKED_PROTO_LOG_H_ +#define ICING_FILE_PORTABLE_FILE_BACKED_PROTO_LOG_H_ + +#include <cstddef> +#include <cstdint> +#include <cstring> +#include <memory> +#include <string> +#include <string_view> +#include <utility> +#include <vector> + +#include "icing/text_classifier/lib3/utils/base/status.h" +#include "icing/text_classifier/lib3/utils/base/statusor.h" +#include <google/protobuf/io/gzip_stream.h> +#include <google/protobuf/io/zero_copy_stream_impl_lite.h> +#include "icing/absl_ports/canonical_errors.h" +#include "icing/absl_ports/str_cat.h" +#include "icing/file/filesystem.h" +#include "icing/file/memory-mapped-file.h" +#include "icing/legacy/core/icing-string-util.h" +#include "icing/portable/endian.h" +#include "icing/portable/platform.h" +#include "icing/portable/zlib.h" +#include "icing/util/bit-util.h" +#include "icing/util/crc32.h" +#include "icing/util/data-loss.h" +#include "icing/util/logging.h" +#include "icing/util/status-macros.h" + +namespace icing { +namespace lib { + +namespace { + +// Number of bytes we reserve for the heading at the beginning of the proto log. +// We reserve this so the header can grow without running into the contents of +// the proto log, triggering an unnecessary migration of the data. 
+constexpr int kHeaderReservedBytes = 256; + +bool IsEmptyBuffer(const char* buffer, int size) { + return std::all_of(buffer, buffer + size, + [](const char byte) { return byte == 0; }); +} + +// Helper function to get stored proto size from the metadata. +// Metadata format: 8 bits magic + 24 bits size +int GetProtoSize(int metadata) { return metadata & 0x00FFFFFF; } + +// Helper function to get stored proto magic from the metadata. +// Metadata format: 8 bits magic + 24 bits size +uint8_t GetProtoMagic(int metadata) { return metadata >> 24; } + +} // namespace + +template <typename ProtoT> +class PortableFileBackedProtoLog { + public: + struct Options { + // Whether to compress each proto before writing to the proto log. + bool compress; + + // Byte-size limit for each proto written to the store. This does not + // include the bytes needed for the metadata of each proto. + // + // NOTE: Currently, we only support protos up to 16MiB. We store the proto + // size in 3 bytes within the metadata. + // + // NOTE: This limit is only enforced for future writes. If the store + // previously had a higher limit, then reading older entries could return + // larger protos. + // + // NOTE: The max_proto_size is the upper limit for input protos into the + // ProtoLog. Even if the proto is larger than max_proto_size, but compresses + // to a smaller size, ProtoLog will not accept it. Protos that result in a + // compressed size larger than max_proto_size are also not accepted. + const int32_t max_proto_size; + + // Must specify values for options. + Options() = delete; + explicit Options(bool compress_in, + const int32_t max_proto_size_in = kMaxProtoSize) + : compress(compress_in), max_proto_size(max_proto_size_in) {} + }; + + // Header stored at the beginning of the file before the rest of the log + // contents. Stores metadata on the log. 
+ class Header { + public: + static constexpr int32_t kMagic = 0xf4c6f67a; + + static constexpr int32_t kFileFormatVersion = 0; + + uint32_t CalculateHeaderChecksum() const { + Crc32 crc; + + // Get a string_view of all the fields of the Header, excluding the + // magic_nbytes and header_checksum_nbytes + std::string_view header_str(reinterpret_cast<const char*>(this) + + offsetof(Header, header_checksum_nbytes) + + sizeof(header_checksum_nbytes), + sizeof(Header) - sizeof(magic_nbytes) - + sizeof(header_checksum_nbytes)); + crc.Append(header_str); + return crc.Get(); + } + + int32_t GetMagic() const { return gntohl(magic_nbytes); } + + void SetMagic(int32_t magic_in) { magic_nbytes = ghtonl(magic_in); } + + int32_t GetFileFormatVersion() const { + return gntohl(file_format_version_nbytes); + } + + void SetFileFormatVersion(int32_t file_format_version_in) { + file_format_version_nbytes = ghtonl(file_format_version_in); + } + + int32_t GetMaxProtoSize() const { return gntohl(max_proto_size_nbytes); } + + void SetMaxProtoSize(int32_t max_proto_size_in) { + max_proto_size_nbytes = ghtonl(max_proto_size_in); + } + + int32_t GetLogChecksum() const { return gntohl(log_checksum_nbytes); } + + void SetLogChecksum(int32_t log_checksum_in) { + log_checksum_nbytes = ghtonl(log_checksum_in); + } + + int64_t GetRewindOffset() const { return gntohll(rewind_offset_nbytes); } + + void SetRewindOffset(int64_t rewind_offset_in) { + rewind_offset_nbytes = ghtonll(rewind_offset_in); + } + + int32_t GetHeaderChecksum() const { return gntohl(header_checksum_nbytes); } + + void SetHeaderChecksum(int32_t header_checksum_in) { + header_checksum_nbytes = ghtonl(header_checksum_in); + } + + bool GetCompressFlag() const { + uint16_t host_order_flags = gntohs(flags_nbytes); + return bit_util::BitfieldGet(host_order_flags, kCompressBit, /*len=*/1); + } + + void SetCompressFlag(bool compress) { + uint16_t host_order_flags = gntohs(flags_nbytes); + bit_util::BitfieldSet(compress, kCompressBit, + 
/*len=*/1, &host_order_flags); + flags_nbytes = ghtons(host_order_flags); + } + + private: + // The least-significant bit offset at which the compress flag is stored in + // 'flags_nbytes'. Represents whether the protos in the log are compressed + // or not. + static constexpr int32_t kCompressBit = 0; + + // Holds the magic as a quick sanity check against file corruption. + // + // Field is in network-byte order. + int32_t magic_nbytes = ghtonl(kMagic); + + // Must be at the beginning after kMagic. Contains the crc checksum of + // the following fields. + // + // Field is in network-byte order. + uint32_t header_checksum_nbytes = 0; + + // Last known good offset at which the log and its checksum were updated. + // If we crash between writing to the log and updating the checksum, we can + // try to rewind the log to this offset and verify the checksum is still + // valid instead of throwing away the entire log. + // + // Field is in network-byte order. + int64_t rewind_offset_nbytes = ghtonll(kHeaderReservedBytes); + + // Version number tracking how we serialize the file to disk. If we change + // how/what we write to disk, this version should be updated and this class + // should handle a migration. + // + // Currently at kFileFormatVersion. + // + // Field is in network-byte order. + int32_t file_format_version_nbytes = 0; + + // The maximum proto size that can be written to the log. + // + // Field is in network-byte order. + int32_t max_proto_size_nbytes = 0; + + // Checksum of the log elements, doesn't include the header fields. + // + // Field is in network-byte order. + uint32_t log_checksum_nbytes = 0; + + // Bits are used to hold various flags. + // Lowest bit is whether the protos are compressed or not. + // + // Field is in network-byte order. + uint16_t flags_nbytes = 0; + + // NOTE: New fields should *almost always* be added to the end here. 
Since + // this class may have already been written to disk, appending fields + // increases the chances that changes are backwards-compatible. + }; + static_assert(sizeof(Header) <= kHeaderReservedBytes, + "Header has grown past our reserved bytes!"); + + struct CreateResult { + // A successfully initialized log. + std::unique_ptr<PortableFileBackedProtoLog<ProtoT>> proto_log; + + // The data status after initializing from a previous state. Data loss can + // happen if the file is corrupted or some previously added data was + // unpersisted. This may be used to signal that any derived data off of the + // proto log may need to be regenerated. + DataLoss data_loss; + + bool has_data_loss() { + return data_loss == DataLoss::PARTIAL || data_loss == DataLoss::COMPLETE; + } + }; + + // Factory method to create, initialize, and return a + // PortableFileBackedProtoLog. Will create the file if it doesn't exist. + // + // If on re-initialization the log detects disk corruption or some previously + // added data was unpersisted, the log will rewind to the last-good state. The + // log saves these checkpointed "good" states when PersistToDisk() is called + // or the log is safely destructed. If the log rewinds successfully to the + // last-good state, then the returned CreateResult.data_loss indicates + // whether it has a data loss and what kind of data loss it is (partial or + // complete) so that any derived data may know that it needs to be updated. If + // the log re-initializes successfully without any data loss, + // CreateResult.data_loss will be NONE. + // + // Params: + // filesystem: Handles system level calls + // file_path: Path of the underlying file. 
Directory of the file should + // already exist + // options: Configuration options for the proto log + // + // Returns: + // PortableFileBackedProtoLog::CreateResult on success + // INVALID_ARGUMENT on an invalid option + // INTERNAL_ERROR on IO error + static libtextclassifier3::StatusOr<CreateResult> Create( + const Filesystem* filesystem, const std::string& file_path, + const Options& options); + + // Not copyable + PortableFileBackedProtoLog(const PortableFileBackedProtoLog&) = delete; + PortableFileBackedProtoLog& operator=(const PortableFileBackedProtoLog&) = + delete; + + // This will update the checksum of the log as well. + ~PortableFileBackedProtoLog(); + + // Writes the serialized proto to the underlying file. Writes are applied + // directly to the underlying file. Users do not need to sync the file after + // writing. + // + // Returns: + // Offset of the newly appended proto in file on success + // INVALID_ARGUMENT if proto is too large, as decided by + // Options.max_proto_size + // INTERNAL_ERROR on IO error + libtextclassifier3::StatusOr<int64_t> WriteProto(const ProtoT& proto); + + // Reads out a proto located at file_offset from the file. + // + // Returns: + // A proto on success + // NOT_FOUND if the proto at the given offset has been erased + // OUT_OF_RANGE_ERROR if file_offset exceeds file size + // INTERNAL_ERROR on IO error + libtextclassifier3::StatusOr<ProtoT> ReadProto(int64_t file_offset) const; + + // Erases the data of a proto located at file_offset from the file. + // + // Returns: + // OK on success + // OUT_OF_RANGE_ERROR if file_offset exceeds file size + // INTERNAL_ERROR on IO error + libtextclassifier3::Status EraseProto(int64_t file_offset); + + // Calculates and returns the disk usage in bytes. Rounds up to the nearest + // block size. 
+ // + // Returns: + // Disk usage on success + // INTERNAL_ERROR on IO error + libtextclassifier3::StatusOr<int64_t> GetDiskUsage() const; + + // Returns the file size of all the elements held in the log. File size is in + // bytes. This excludes the size of any internal metadata of the log, e.g. the + // log's header. + // + // Returns: + // File size on success + // INTERNAL_ERROR on IO error + libtextclassifier3::StatusOr<int64_t> GetElementsFileSize() const; + + // An iterator helping to find offsets of all the protos in file. + // Example usage: + // + // while (iterator.Advance().ok()) { + // int64_t offset = iterator.GetOffset(); + // // Do something + // } + class Iterator { + public: + Iterator(const Filesystem& filesystem, const std::string& file_path, + int64_t initial_offset); + + // Advances to the position of next proto whether it has been erased or not. + // + // Returns: + // OK on success + // OUT_OF_RANGE_ERROR if it reaches the end + // INTERNAL_ERROR on IO error + libtextclassifier3::Status Advance(); + + // Returns the file offset of current proto. + int64_t GetOffset(); + + private: + static constexpr int64_t kInvalidOffset = -1; + // Used to read proto metadata + MemoryMappedFile mmapped_file_; + // Offset of first proto + int64_t initial_offset_; + int64_t current_offset_; + int64_t file_size_; + }; + + // Returns an iterator of current proto log. The caller needs to keep the + // proto log unchanged while using the iterator, otherwise unexpected + // behaviors could happen. + Iterator GetIterator(); + + // Persists all changes since initialization or the last call to + // PersistToDisk(). Any changes that aren't persisted may be lost if the + // system fails to close safely. 
+ // + // Example use case: + // + // Document document; + // document.set_namespace("com.google.android.example"); + // document.set_uri("www.google.com"); + // + // { + // ICING_ASSERT_OK_AND_ASSIGN(auto create_result, + // PortableFileBackedProtoLog<DocumentProto>::Create(filesystem, + // file_path, + // options)); + // auto proto_log = std::move(create_result.proto_log); + // + // int64_t document_offset = proto_log->WriteProto(document)); + // + // // We lose the document here since it wasn't persisted. + // // *SYSTEM CRASH* + // } + // + // { + // // Can still successfully create after a crash since the log can + // // rewind/truncate to recover into a previously good state + // ICING_ASSERT_OK_AND_ASSIGN(auto create_result, + // PortableFileBackedProtoLog<DocumentProto>::Create(filesystem, + // file_path, + // options)); + // auto proto_log = std::move(create_result.proto_log); + // + // // Lost the proto since we didn't PersistToDisk before the crash + // proto_log->ReadProto(document_offset)); // INVALID_ARGUMENT error + // + // int64_t document_offset = proto_log->WriteProto(document)); + // + // // Persisted this time, so we should be ok. + // ICING_ASSERT_OK(proto_log->PersistToDisk()); + // } + // + // { + // ICING_ASSERT_OK_AND_ASSIGN(auto create_result, + // PortableFileBackedProtoLog<DocumentProto>::Create(filesystem, + // file_path, + // options)); + // auto proto_log = std::move(create_result.proto_log); + // + // // SUCCESS + // Document same_document = proto_log->ReadProto(document_offset)); + // } + // + // NOTE: Since all protos are already written to the file directly, this + // just updates the checksum and rewind position. Without these updates, + // future initializations will truncate the file and discard unpersisted + // changes. + // + // Returns: + // OK on success + // INTERNAL_ERROR on IO error + libtextclassifier3::Status PersistToDisk(); + + // Calculates the checksum of the log contents. Excludes the header content. 
+ // + // Returns: + // Crc of the log content + // INTERNAL_ERROR on IO error + libtextclassifier3::StatusOr<Crc32> ComputeChecksum(); + + private: + // Object can only be instantiated via the ::Create factory. + PortableFileBackedProtoLog(const Filesystem* filesystem, + const std::string& file_path, + std::unique_ptr<Header> header); + + // Initializes a new proto log. + // + // Returns: + // std::unique_ptr<CreateResult> on success + // INTERNAL_ERROR on IO error + static libtextclassifier3::StatusOr<CreateResult> InitializeNewFile( + const Filesystem* filesystem, const std::string& file_path, + const Options& options); + + // Verifies that the existing proto log is in a good state. If not in a good + // state, then the proto log may be truncated to the last good state and + // content will be lost. + // + // Returns: + // std::unique_ptr<CreateResult> on success + // INTERNAL_ERROR on IO error or internal inconsistencies in the file + // INVALID_ARGUMENT_ERROR if options aren't consistent with previous + // instances + static libtextclassifier3::StatusOr<CreateResult> InitializeExistingFile( + const Filesystem* filesystem, const std::string& file_path, + const Options& options, int64_t file_size); + + // Takes an initial checksum and updates it with the content between `start` + // and `end` offsets in the file. + // + // Returns: + // Crc of the content between `start`, inclusive, and `end`, exclusive. + // INTERNAL_ERROR on IO error + // INVALID_ARGUMENT_ERROR if start and end aren't within the file size + static libtextclassifier3::StatusOr<Crc32> ComputeChecksum( + const Filesystem* filesystem, const std::string& file_path, + Crc32 initial_crc, int64_t start, int64_t end); + + // Reads out the metadata of a proto located at file_offset from the file. + // Metadata will be returned in host byte order endianness. 
+ // + // Returns: + // Proto's metadata on success + // OUT_OF_RANGE_ERROR if file_offset exceeds file_size + // INTERNAL_ERROR if the metadata is invalid or any IO errors happen + static libtextclassifier3::StatusOr<int32_t> ReadProtoMetadata( + MemoryMappedFile* mmapped_file, int64_t file_offset, int64_t file_size); + + // Writes metadata of a proto to the fd. Takes in a host byte order endianness + // metadata and converts it into a portable metadata before writing. + // + // Returns: + // OK on success + // INTERNAL_ERROR on any IO errors + static libtextclassifier3::Status WriteProtoMetadata( + const Filesystem* filesystem, int fd, int32_t host_order_metadata); + + // Magic number added in front of every proto. Used when reading out protos + // as a first check for corruption in each entry in the file. Even if there is + // a corruption, the best we can do is roll back to our last recovery point + // and throw away un-flushed data. We can discard/reuse this byte if needed so + // that we have 4 bytes to store the size of protos, and increase the size of + // protos we support. + static constexpr uint8_t kProtoMagic = 0x5C; + + // Our internal max for protos. + // + // WARNING: Changing this to a larger number may invalidate our assumption + // that that proto size can safely be stored in the last 3 bytes of the proto + // header. + static constexpr int kMaxProtoSize = (1 << 24) - 1; // 16MiB + static_assert(kMaxProtoSize <= 0x00FFFFFF, + "kMaxProtoSize doesn't fit in 3 bytes"); + + // Level of compression, BEST_SPEED = 1, BEST_COMPRESSION = 9 + static constexpr int kDeflateCompressionLevel = 3; + + // Chunks of the file to mmap at a time, so we don't mmap the entire file. 
+ // Only used on 32-bit devices + static constexpr int kMmapChunkSize = 4 * 1024 * 1024; // 4MiB + + ScopedFd fd_; + const Filesystem* const filesystem_; + const std::string file_path_; + std::unique_ptr<Header> header_; +}; + +template <typename ProtoT> +constexpr uint8_t PortableFileBackedProtoLog<ProtoT>::kProtoMagic; + +template <typename ProtoT> +PortableFileBackedProtoLog<ProtoT>::PortableFileBackedProtoLog( + const Filesystem* filesystem, const std::string& file_path, + std::unique_ptr<Header> header) + : filesystem_(filesystem), + file_path_(file_path), + header_(std::move(header)) { + fd_.reset(filesystem_->OpenForAppend(file_path.c_str())); +} + +template <typename ProtoT> +PortableFileBackedProtoLog<ProtoT>::~PortableFileBackedProtoLog() { + if (!PersistToDisk().ok()) { + ICING_LOG(WARNING) << "Error persisting to disk during destruction of " + "PortableFileBackedProtoLog: " + << file_path_; + } +} + +template <typename ProtoT> +libtextclassifier3::StatusOr< + typename PortableFileBackedProtoLog<ProtoT>::CreateResult> +PortableFileBackedProtoLog<ProtoT>::Create(const Filesystem* filesystem, + const std::string& file_path, + const Options& options) { + if (options.max_proto_size <= 0) { + return absl_ports::InvalidArgumentError(IcingStringUtil::StringPrintf( + "options.max_proto_size must be greater than 0, was %d", + options.max_proto_size)); + } + + // Since we store the proto_size in 3 bytes, we can only support protos of up + // to 16MiB. 
+ if (options.max_proto_size > kMaxProtoSize) { + return absl_ports::InvalidArgumentError(IcingStringUtil::StringPrintf( + "options.max_proto_size must be under 16MiB, was %d", + options.max_proto_size)); + } + + if (!filesystem->FileExists(file_path.c_str())) { + return InitializeNewFile(filesystem, file_path, options); + } + + int64_t file_size = filesystem->GetFileSize(file_path.c_str()); + if (file_size == Filesystem::kBadFileSize) { + return absl_ports::InternalError( + absl_ports::StrCat("Bad file size '", file_path, "'")); + } + + if (file_size == 0) { + return InitializeNewFile(filesystem, file_path, options); + } + + return InitializeExistingFile(filesystem, file_path, options, file_size); +} + +template <typename ProtoT> +libtextclassifier3::StatusOr< + typename PortableFileBackedProtoLog<ProtoT>::CreateResult> +PortableFileBackedProtoLog<ProtoT>::InitializeNewFile( + const Filesystem* filesystem, const std::string& file_path, + const Options& options) { + // Grow to the minimum reserved bytes for the header. 
+ if (!filesystem->Truncate(file_path.c_str(), kHeaderReservedBytes)) { + return absl_ports::InternalError( + absl_ports::StrCat("Failed to initialize file size: ", file_path)); + } + + // Create the header + std::unique_ptr<Header> header = std::make_unique<Header>(); + header->SetCompressFlag(options.compress); + header->SetMaxProtoSize(options.max_proto_size); + header->SetHeaderChecksum(header->CalculateHeaderChecksum()); + + if (!filesystem->Write(file_path.c_str(), header.get(), sizeof(Header))) { + return absl_ports::InternalError( + absl_ports::StrCat("Failed to write header for file: ", file_path)); + } + + CreateResult create_result = { + std::unique_ptr<PortableFileBackedProtoLog<ProtoT>>( + new PortableFileBackedProtoLog<ProtoT>(filesystem, file_path, + std::move(header))), + /*data_loss=*/DataLoss::NONE}; + + return create_result; +} + +template <typename ProtoT> +libtextclassifier3::StatusOr< + typename PortableFileBackedProtoLog<ProtoT>::CreateResult> +PortableFileBackedProtoLog<ProtoT>::InitializeExistingFile( + const Filesystem* filesystem, const std::string& file_path, + const Options& options, int64_t file_size) { + if (file_size < kHeaderReservedBytes) { + return absl_ports::InternalError( + absl_ports::StrCat("File header too short for: ", file_path)); + } + + std::unique_ptr<Header> header = std::make_unique<Header>(); + if (!filesystem->PRead(file_path.c_str(), header.get(), sizeof(Header), + /*offset=*/0)) { + return absl_ports::InternalError( + absl_ports::StrCat("Failed to read header for file: ", file_path)); + } + + // Make sure the header is still valid before we use any of its values. This + // is covered by the header_checksum check below, but this is a quick check + // that can save us from an extra crc computation. 
+ if (header->GetMagic() != Header::kMagic) { + return absl_ports::InternalError( + absl_ports::StrCat("Invalid header kMagic for file: ", file_path)); + } + + if (header->GetHeaderChecksum() != header->CalculateHeaderChecksum()) { + return absl_ports::InternalError( + absl_ports::StrCat("Invalid header checksum for: ", file_path)); + } + + if (header->GetFileFormatVersion() != Header::kFileFormatVersion) { + // If this changes, we might need to handle a migration rather than throwing + // an error. + return absl_ports::InternalError( + absl_ports::StrCat("Invalid header file format version: ", file_path)); + } + + if (header->GetCompressFlag() != options.compress) { + return absl_ports::InvalidArgumentError(IcingStringUtil::StringPrintf( + "Inconsistent compress option, expected %d, actual %d", + header->GetCompressFlag(), options.compress)); + } + + if (header->GetMaxProtoSize() > options.max_proto_size) { + return absl_ports::InvalidArgumentError(IcingStringUtil::StringPrintf( + "Max proto size cannot be smaller than previous " + "instantiations, previous size %d, wanted size %d", + header->GetMaxProtoSize(), options.max_proto_size)); + } + header->SetMaxProtoSize(options.max_proto_size); + + DataLoss data_loss = DataLoss::NONE; + ICING_ASSIGN_OR_RETURN( + Crc32 calculated_log_checksum, + ComputeChecksum(filesystem, file_path, Crc32(), + /*start=*/kHeaderReservedBytes, /*end=*/file_size)); + + // Double check that the log checksum is the same as the one that was + // persisted last time. If not, we start recovery logic. + if (header->GetLogChecksum() != calculated_log_checksum.Get()) { + // Need to rewind the proto log since the checksums don't match. + // Worst case, we have to rewind the entire log back to just the header + int64_t last_known_good = kHeaderReservedBytes; + + // Calculate the checksum of the log contents just up to the last rewind + // offset point. 
This will be valid if we just appended contents to the log + // without updating the checksum, and we can rewind back to this point + // safely. + ICING_ASSIGN_OR_RETURN(calculated_log_checksum, + ComputeChecksum(filesystem, file_path, Crc32(), + /*start=*/kHeaderReservedBytes, + /*end=*/header->GetRewindOffset())); + if (header->GetLogChecksum() == calculated_log_checksum.Get()) { + // Check if it matches our last rewind state. If so, this becomes our last + // good state and we can safely truncate and recover from here. + last_known_good = header->GetRewindOffset(); + data_loss = DataLoss::PARTIAL; + } else { + // Otherwise, we're going to truncate the entire log and this resets the + // checksum to an empty log state. + header->SetLogChecksum(0); + data_loss = DataLoss::COMPLETE; + } + + if (!filesystem->Truncate(file_path.c_str(), last_known_good)) { + return absl_ports::InternalError( + absl_ports::StrCat("Error truncating file: ", file_path)); + } + + ICING_LOG(INFO) << "Truncated '" << file_path << "' to size " + << last_known_good; + } + + CreateResult create_result = { + std::unique_ptr<PortableFileBackedProtoLog<ProtoT>>( + new PortableFileBackedProtoLog<ProtoT>(filesystem, file_path, + std::move(header))), + data_loss}; + + return create_result; +} + +template <typename ProtoT> +libtextclassifier3::StatusOr<Crc32> +PortableFileBackedProtoLog<ProtoT>::ComputeChecksum( + const Filesystem* filesystem, const std::string& file_path, + Crc32 initial_crc, int64_t start, int64_t end) { + auto mmapped_file = MemoryMappedFile(*filesystem, file_path, + MemoryMappedFile::Strategy::READ_ONLY); + Crc32 new_crc(initial_crc.Get()); + + if (start < 0) { + return absl_ports::InvalidArgumentError(IcingStringUtil::StringPrintf( + "Starting checksum offset of file '%s' must be greater than 0, was " + "%lld", + file_path.c_str(), static_cast<long long>(start))); + } + + if (end < start) { + return absl_ports::InvalidArgumentError(IcingStringUtil::StringPrintf( + "Ending 
checksum offset of file '%s' must be greater than start " + "'%lld', was '%lld'", + file_path.c_str(), static_cast<long long>(start), + static_cast<long long>(end))); + } + + int64_t file_size = filesystem->GetFileSize(file_path.c_str()); + if (end > file_size) { + return absl_ports::InvalidArgumentError(IcingStringUtil::StringPrintf( + "Ending checksum offset of file '%s' must be within " + "file size of %lld, was %lld", + file_path.c_str(), static_cast<long long>(file_size), + static_cast<long long>(end))); + } + + Architecture architecture = GetArchitecture(); + switch (architecture) { + case Architecture::BIT_64: { + // Don't mmap in chunks here since mmapping can be harmful on 64-bit + // devices where mmap/munmap calls need the mmap write semaphore, which + // blocks mmap/munmap/mprotect and all page faults from executing while + // they run. On 64-bit devices, this doesn't actually load into memory, it + // just makes the file faultable. So the whole file should be ok. + // b/185822878. + ICING_RETURN_IF_ERROR(mmapped_file.Remap(start, end - start)); + auto mmap_str = std::string_view(mmapped_file.region(), end - start); + new_crc.Append(mmap_str); + break; + } + case Architecture::BIT_32: + [[fallthrough]]; + case Architecture::UNKNOWN: { + // 32-bit devices only have 4GB of RAM. Mmap in chunks to not use up too + // much memory at once. If we're unknown, then also chunk it because we're + // not sure what the device can handle. + for (int i = start; i < end; i += kMmapChunkSize) { + // Don't read past the file size. 
+ int next_chunk_size = kMmapChunkSize; + if ((i + kMmapChunkSize) >= end) { + next_chunk_size = end - i; + } + + ICING_RETURN_IF_ERROR(mmapped_file.Remap(i, next_chunk_size)); + + auto mmap_str = + std::string_view(mmapped_file.region(), next_chunk_size); + new_crc.Append(mmap_str); + } + break; + } + } + + return new_crc; +} + +template <typename ProtoT> +libtextclassifier3::StatusOr<int64_t> +PortableFileBackedProtoLog<ProtoT>::WriteProto(const ProtoT& proto) { + int64_t proto_size = proto.ByteSizeLong(); + int32_t host_order_metadata; + int64_t current_position = filesystem_->GetCurrentPosition(fd_.get()); + + if (proto_size > header_->GetMaxProtoSize()) { + return absl_ports::InvalidArgumentError(IcingStringUtil::StringPrintf( + "proto_size, %lld, was too large to write. Max is %d", + static_cast<long long>(proto_size), header_->GetMaxProtoSize())); + } + + // At this point, we've guaranteed that proto_size is under kMaxProtoSize + // (see + // ::Create), so we can safely store it in an int. + int final_size = 0; + + std::string proto_str; + google::protobuf::io::StringOutputStream proto_stream(&proto_str); + + if (header_->GetCompressFlag()) { + google::protobuf::io::GzipOutputStream::Options options; + options.format = google::protobuf::io::GzipOutputStream::ZLIB; + options.compression_level = kDeflateCompressionLevel; + + google::protobuf::io::GzipOutputStream compressing_stream(&proto_stream, + options); + + bool success = proto.SerializeToZeroCopyStream(&compressing_stream) && + compressing_stream.Close(); + + if (!success) { + return absl_ports::InternalError("Error compressing proto."); + } + + final_size = proto_str.size(); + + // In case the compressed proto is larger than the original proto, we also + // can't write it. 
+ if (final_size > header_->GetMaxProtoSize()) { + return absl_ports::InvalidArgumentError(IcingStringUtil::StringPrintf( + "Compressed proto size, %d, was greater than " + "max_proto_size, %d", + final_size, header_->GetMaxProtoSize())); + } + } else { + // Serialize the proto directly into the write buffer at an offset of the + // metadata. + proto.SerializeToZeroCopyStream(&proto_stream); + final_size = proto_str.size(); + } + + // 1st byte for magic, next 3 bytes for proto size. + host_order_metadata = (kProtoMagic << 24) | final_size; + + // Actually write metadata, has to be done after we know the possibly + // compressed proto size + ICING_RETURN_IF_ERROR( + WriteProtoMetadata(filesystem_, fd_.get(), host_order_metadata)); + + // Write the serialized proto + if (!filesystem_->Write(fd_.get(), proto_str.data(), proto_str.size())) { + return absl_ports::InternalError( + absl_ports::StrCat("Failed to write proto to: ", file_path_)); + } + + return current_position; +} + +template <typename ProtoT> +libtextclassifier3::StatusOr<ProtoT> +PortableFileBackedProtoLog<ProtoT>::ReadProto(int64_t file_offset) const { + int64_t file_size = filesystem_->GetFileSize(fd_.get()); + MemoryMappedFile mmapped_file(*filesystem_, file_path_, + MemoryMappedFile::Strategy::READ_ONLY); + if (file_offset >= file_size) { + // file_size points to the next byte to write at, so subtract one to get + // the inclusive, actual size of file. 
+ return absl_ports::OutOfRangeError( + IcingStringUtil::StringPrintf("Trying to read from a location, %lld, " + "out of range of the file size, %lld", + static_cast<long long>(file_offset), + static_cast<long long>(file_size - 1))); + } + + // Read out the metadata + ICING_ASSIGN_OR_RETURN( + int32_t metadata, + ReadProtoMetadata(&mmapped_file, file_offset, file_size)); + + // Copy out however many bytes it says the proto is + int stored_size = GetProtoSize(metadata); + + ICING_RETURN_IF_ERROR( + mmapped_file.Remap(file_offset + sizeof(metadata), stored_size)); + + if (IsEmptyBuffer(mmapped_file.region(), mmapped_file.region_size())) { + return absl_ports::NotFoundError("The proto data has been erased."); + } + + google::protobuf::io::ArrayInputStream proto_stream( + mmapped_file.mutable_region(), stored_size); + + // Deserialize proto + ProtoT proto; + if (header_->GetCompressFlag()) { + google::protobuf::io::GzipInputStream decompress_stream(&proto_stream); + proto.ParseFromZeroCopyStream(&decompress_stream); + } else { + proto.ParseFromZeroCopyStream(&proto_stream); + } + + return proto; +} + +template <typename ProtoT> +libtextclassifier3::Status PortableFileBackedProtoLog<ProtoT>::EraseProto( + int64_t file_offset) { + int64_t file_size = filesystem_->GetFileSize(fd_.get()); + if (file_offset >= file_size) { + // file_size points to the next byte to write at, so subtract one to get + // the inclusive, actual size of file. 
+ return absl_ports::OutOfRangeError(IcingStringUtil::StringPrintf( + "Trying to erase data at a location, %lld, " + "out of range of the file size, %lld", + static_cast<long long>(file_offset), + static_cast<long long>(file_size - 1))); + } + + MemoryMappedFile mmapped_file( + *filesystem_, file_path_, + MemoryMappedFile::Strategy::READ_WRITE_AUTO_SYNC); + + // Read out the metadata + ICING_ASSIGN_OR_RETURN( + int32_t metadata, + ReadProtoMetadata(&mmapped_file, file_offset, file_size)); + + ICING_RETURN_IF_ERROR(mmapped_file.Remap(file_offset + sizeof(metadata), + GetProtoSize(metadata))); + + // We need to update the crc checksum if the erased area is before the + // rewind position. + if (file_offset + sizeof(metadata) < header_->GetRewindOffset()) { + // We need to calculate [original string xor 0s]. + // The xored string is the same as the original string because 0 xor 0 = + // 0, 1 xor 0 = 1. + const std::string_view xored_str(mmapped_file.region(), + mmapped_file.region_size()); + + Crc32 crc(header_->GetLogChecksum()); + ICING_ASSIGN_OR_RETURN( + uint32_t new_crc, + crc.UpdateWithXor(xored_str, + /*full_data_size=*/header_->GetRewindOffset() - + kHeaderReservedBytes, + /*position=*/file_offset + sizeof(metadata) - + kHeaderReservedBytes)); + + header_->SetLogChecksum(new_crc); + header_->SetHeaderChecksum(header_->CalculateHeaderChecksum()); + + if (!filesystem_->PWrite(fd_.get(), /*offset=*/0, header_.get(), + sizeof(Header))) { + return absl_ports::InternalError( + absl_ports::StrCat("Failed to update header to: ", file_path_)); + } + } + + memset(mmapped_file.mutable_region(), '\0', mmapped_file.region_size()); + return libtextclassifier3::Status::OK; +} + +template <typename ProtoT> +libtextclassifier3::StatusOr<int64_t> +PortableFileBackedProtoLog<ProtoT>::GetDiskUsage() const { + int64_t size = filesystem_->GetDiskUsage(file_path_.c_str()); + if (size == Filesystem::kBadFileSize) { + return absl_ports::InternalError("Failed to get disk usage of proto 
log"); + } + return size; +} + +template <typename ProtoT> +libtextclassifier3::StatusOr<int64_t> +PortableFileBackedProtoLog<ProtoT>::GetElementsFileSize() const { + int64_t total_file_size = filesystem_->GetFileSize(file_path_.c_str()); + if (total_file_size == Filesystem::kBadFileSize) { + return absl_ports::InternalError( + "Failed to get file size of elments in the proto log"); + } + return total_file_size - kHeaderReservedBytes; +} + +template <typename ProtoT> +PortableFileBackedProtoLog<ProtoT>::Iterator::Iterator( + const Filesystem& filesystem, const std::string& file_path, + int64_t initial_offset) + : mmapped_file_(filesystem, file_path, + MemoryMappedFile::Strategy::READ_ONLY), + initial_offset_(initial_offset), + current_offset_(kInvalidOffset), + file_size_(filesystem.GetFileSize(file_path.c_str())) { + if (file_size_ == Filesystem::kBadFileSize) { + // Fails all Advance() calls + file_size_ = 0; + } +} + +template <typename ProtoT> +libtextclassifier3::Status +PortableFileBackedProtoLog<ProtoT>::Iterator::Advance() { + if (current_offset_ == kInvalidOffset) { + // First Advance() call + current_offset_ = initial_offset_; + } else { + // Jumps to the next proto position + ICING_ASSIGN_OR_RETURN( + int32_t metadata, + ReadProtoMetadata(&mmapped_file_, current_offset_, file_size_)); + current_offset_ += sizeof(metadata) + GetProtoSize(metadata); + } + + if (current_offset_ < file_size_) { + return libtextclassifier3::Status::OK; + } else { + return absl_ports::OutOfRangeError(IcingStringUtil::StringPrintf( + "The next proto offset, %lld, is out of file range [0, %lld)", + static_cast<long long>(current_offset_), + static_cast<long long>(file_size_))); + } +} + +template <typename ProtoT> +int64_t PortableFileBackedProtoLog<ProtoT>::Iterator::GetOffset() { + return current_offset_; +} + +template <typename ProtoT> +typename PortableFileBackedProtoLog<ProtoT>::Iterator +PortableFileBackedProtoLog<ProtoT>::GetIterator() { + return Iterator(*filesystem_, 
file_path_, + /*initial_offset=*/kHeaderReservedBytes); +} + +template <typename ProtoT> +libtextclassifier3::StatusOr<int32_t> +PortableFileBackedProtoLog<ProtoT>::ReadProtoMetadata( + MemoryMappedFile* mmapped_file, int64_t file_offset, int64_t file_size) { + // Checks file_offset + if (file_offset >= file_size) { + return absl_ports::OutOfRangeError(IcingStringUtil::StringPrintf( + "offset, %lld, is out of file range [0, %lld)", + static_cast<long long>(file_offset), + static_cast<long long>(file_size))); + } + int32_t portable_metadata; + int metadata_size = sizeof(portable_metadata); + if (file_offset + metadata_size >= file_size) { + return absl_ports::InternalError(IcingStringUtil::StringPrintf( + "Wrong metadata offset %lld, metadata doesn't fit in " + "with file range [0, %lld)", + static_cast<long long>(file_offset), + static_cast<long long>(file_size))); + } + + // Reads metadata + ICING_RETURN_IF_ERROR(mmapped_file->Remap(file_offset, metadata_size)); + memcpy(&portable_metadata, mmapped_file->region(), metadata_size); + + // Need to switch it back to host order endianness after reading from disk. 
+ int32_t host_order_metadata = gntohl(portable_metadata); + + // Checks magic number + uint8_t stored_k_proto_magic = GetProtoMagic(host_order_metadata); + if (stored_k_proto_magic != kProtoMagic) { + return absl_ports::InternalError(IcingStringUtil::StringPrintf( + "Failed to read kProtoMagic, expected %d, actual %d", kProtoMagic, + stored_k_proto_magic)); + } + + return host_order_metadata; +} + +template <typename ProtoT> +libtextclassifier3::Status +PortableFileBackedProtoLog<ProtoT>::WriteProtoMetadata( + const Filesystem* filesystem, int fd, int32_t host_order_metadata) { + // Convert it into portable endian format before writing to disk + int32_t portable_metadata = ghtonl(host_order_metadata); + int portable_metadata_size = sizeof(portable_metadata); + + // Write metadata + if (!filesystem->Write(fd, &portable_metadata, portable_metadata_size)) { + return absl_ports::InternalError( + absl_ports::StrCat("Failed to write proto metadata.")); + } + + return libtextclassifier3::Status::OK; +} + +template <typename ProtoT> +libtextclassifier3::Status PortableFileBackedProtoLog<ProtoT>::PersistToDisk() { + int64_t file_size = filesystem_->GetFileSize(file_path_.c_str()); + if (file_size == header_->GetRewindOffset()) { + // No new protos appended, don't need to update the checksum. + return libtextclassifier3::Status::OK; + } + + int64_t new_content_size = file_size - header_->GetRewindOffset(); + Crc32 crc; + if (new_content_size < 0) { + // File shrunk, recalculate the entire checksum. + ICING_ASSIGN_OR_RETURN( + crc, + ComputeChecksum(filesystem_, file_path_, Crc32(), + /*start=*/kHeaderReservedBytes, /*end=*/file_size)); + } else { + // Append new changes to the existing checksum. 
+ ICING_ASSIGN_OR_RETURN( + crc, ComputeChecksum(filesystem_, file_path_, + Crc32(header_->GetLogChecksum()), + header_->GetRewindOffset(), file_size)); + } + + header_->SetLogChecksum(crc.Get()); + header_->SetRewindOffset(file_size); + header_->SetHeaderChecksum(header_->CalculateHeaderChecksum()); + + if (!filesystem_->PWrite(fd_.get(), /*offset=*/0, header_.get(), + sizeof(Header)) || + !filesystem_->DataSync(fd_.get())) { + return absl_ports::InternalError( + absl_ports::StrCat("Failed to update header to: ", file_path_)); + } + + return libtextclassifier3::Status::OK; +} + +template <typename ProtoT> +libtextclassifier3::StatusOr<Crc32> +PortableFileBackedProtoLog<ProtoT>::ComputeChecksum() { + return PortableFileBackedProtoLog<ProtoT>::ComputeChecksum( + filesystem_, file_path_, Crc32(), /*start=*/kHeaderReservedBytes, + /*end=*/filesystem_->GetFileSize(file_path_.c_str())); +} + +} // namespace lib +} // namespace icing + +#endif // ICING_FILE_PORTABLE_FILE_BACKED_PROTO_LOG_H_ diff --git a/icing/file/portable-file-backed-proto-log_benchmark.cc b/icing/file/portable-file-backed-proto-log_benchmark.cc new file mode 100644 index 0000000..b1dfe12 --- /dev/null +++ b/icing/file/portable-file-backed-proto-log_benchmark.cc @@ -0,0 +1,211 @@ +// Copyright (C) 2021 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include <cstdint> +#include <random> + +#include "testing/base/public/benchmark.h" +#include "gmock/gmock.h" +#include "icing/document-builder.h" +#include "icing/file/filesystem.h" +#include "icing/file/portable-file-backed-proto-log.h" +#include "icing/legacy/core/icing-string-util.h" +#include "icing/proto/document.pb.h" +#include "icing/testing/common-matchers.h" +#include "icing/testing/random-string.h" +#include "icing/testing/tmp-directory.h" + +// go/microbenchmarks +// +// To build and run on a local machine: +// $ blaze build -c opt --dynamic_mode=off --copt=-gmlt +// icing/file:portable-file-backed-proto-log_benchmark +// +// $ blaze-bin/icing/file/portable-file-backed-proto-log_benchmark +// --benchmarks=all +// +// +// To build and run on an Android device (must be connected and rooted): +// $ blaze build --copt="-DGOOGLE_COMMANDLINEFLAGS_FULL_API=1" +// --config=android_arm64 -c opt --dynamic_mode=off --copt=-gmlt +// icing/file:portable-file-backed-proto-log_benchmark +// +// $ adb root +// +// $ adb push +// blaze-bin/icing/file/portable-file-backed-proto-log_benchmark +// /data/local/tmp/ +// +// $ adb shell /data/local/tmp/portable-file-backed-proto-log-benchmark +// --benchmarks=all + +namespace icing { +namespace lib { + +namespace { + +static void BM_Write(benchmark::State& state) { + const Filesystem filesystem; + int string_length = state.range(0); + const std::string file_path = IcingStringUtil::StringPrintf( + "%s%s%d%s", GetTestTempDir().c_str(), "/proto_", string_length, ".log"); + int max_proto_size = (1 << 24) - 1; // 16 MiB + bool compress = true; + + // Make sure it doesn't already exist. 
+ filesystem.DeleteFile(file_path.c_str()); + + auto proto_log = PortableFileBackedProtoLog<DocumentProto>::Create( + &filesystem, file_path, + PortableFileBackedProtoLog<DocumentProto>::Options( + compress, max_proto_size)) + .ValueOrDie() + .proto_log; + + DocumentProto document = DocumentBuilder().SetKey("namespace", "uri").Build(); + + std::default_random_engine random; + const std::string rand_str = + RandomString(kAlNumAlphabet, string_length, &random); + + auto document_properties = document.add_properties(); + document_properties->set_name("string property"); + document_properties->add_string_values(rand_str); + + for (auto _ : state) { + testing::DoNotOptimize(proto_log->WriteProto(document)); + } + state.SetBytesProcessed(static_cast<int64_t>(state.iterations()) * + string_length); + + // Cleanup after ourselves + filesystem.DeleteFile(file_path.c_str()); +} +BENCHMARK(BM_Write) + ->Arg(1) + ->Arg(32) + ->Arg(512) + ->Arg(1024) + ->Arg(4 * 1024) + ->Arg(8 * 1024) + ->Arg(16 * 1024) + ->Arg(32 * 1024) + ->Arg(256 * 1024) + ->Arg(2 * 1024 * 1024) + ->Arg(8 * 1024 * 1024) + ->Arg(15 * 1024 * 1024); // We do 15MiB here since our max proto size is + // 16MiB, and we need some extra space for the + // rest of the document properties + +static void BM_Read(benchmark::State& state) { + const Filesystem filesystem; + int string_length = state.range(0); + const std::string file_path = IcingStringUtil::StringPrintf( + "%s%s%d%s", GetTestTempDir().c_str(), "/proto_", string_length, ".log"); + int max_proto_size = (1 << 24) - 1; // 16 MiB + bool compress = true; + + // Make sure it doesn't already exist. 
+ filesystem.DeleteFile(file_path.c_str()); + + auto proto_log = PortableFileBackedProtoLog<DocumentProto>::Create( + &filesystem, file_path, + PortableFileBackedProtoLog<DocumentProto>::Options( + compress, max_proto_size)) + .ValueOrDie() + .proto_log; + + DocumentProto document = DocumentBuilder().SetKey("namespace", "uri").Build(); + + std::default_random_engine random; + const std::string rand_str = + RandomString(kAlNumAlphabet, string_length, &random); + + auto document_properties = document.add_properties(); + document_properties->set_name("string property"); + document_properties->add_string_values(rand_str); + + ICING_ASSERT_OK_AND_ASSIGN(int64_t write_offset, + proto_log->WriteProto(document)); + + for (auto _ : state) { + testing::DoNotOptimize(proto_log->ReadProto(write_offset)); + } + state.SetBytesProcessed(static_cast<int64_t>(state.iterations()) * + string_length); + + // Cleanup after ourselves + filesystem.DeleteFile(file_path.c_str()); +} +BENCHMARK(BM_Read) + ->Arg(1) + ->Arg(32) + ->Arg(512) + ->Arg(1024) + ->Arg(4 * 1024) + ->Arg(8 * 1024) + ->Arg(16 * 1024) + ->Arg(32 * 1024) + ->Arg(256 * 1024) + ->Arg(2 * 1024 * 1024) + ->Arg(8 * 1024 * 1024) + ->Arg(15 * 1024 * 1024); // We do 15MiB here since our max proto size is + // 16MiB, and we need some extra space for the + // rest of the document properties + +static void BM_ComputeChecksum(benchmark::State& state) { + const Filesystem filesystem; + const std::string file_path = GetTestTempDir() + "/proto.log"; + int max_proto_size = (1 << 24) - 1; // 16 MiB + bool compress = true; + + // Make sure it doesn't already exist. 
+ filesystem.DeleteFile(file_path.c_str()); + + auto proto_log = PortableFileBackedProtoLog<DocumentProto>::Create( + &filesystem, file_path, + PortableFileBackedProtoLog<DocumentProto>::Options( + compress, max_proto_size)) + .ValueOrDie() + .proto_log; + + DocumentProto document = DocumentBuilder().SetKey("namespace", "uri").Build(); + + // Make each document 1KiB + int string_length = 1024; + std::default_random_engine random; + const std::string rand_str = + RandomString(kAlNumAlphabet, string_length, &random); + + auto document_properties = document.add_properties(); + document_properties->set_name("string property"); + document_properties->add_string_values(rand_str); + + int num_docs = state.range(0); + for (int i = 0; i < num_docs; ++i) { + ICING_ASSERT_OK(proto_log->WriteProto(document)); + } + + for (auto _ : state) { + testing::DoNotOptimize(proto_log->ComputeChecksum()); + } + + // Cleanup after ourselves + filesystem.DeleteFile(file_path.c_str()); +} +BENCHMARK(BM_ComputeChecksum)->Range(1024, 1 << 20); + +} // namespace +} // namespace lib +} // namespace icing diff --git a/icing/file/portable-file-backed-proto-log_test.cc b/icing/file/portable-file-backed-proto-log_test.cc new file mode 100644 index 0000000..dfb67aa --- /dev/null +++ b/icing/file/portable-file-backed-proto-log_test.cc @@ -0,0 +1,727 @@ +// Copyright (C) 2019 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "icing/file/portable-file-backed-proto-log.h" + +#include <cstdint> +#include <cstdlib> + +#include "gmock/gmock.h" +#include "gtest/gtest.h" +#include "icing/document-builder.h" +#include "icing/file/filesystem.h" +#include "icing/file/mock-filesystem.h" +#include "icing/portable/equals-proto.h" +#include "icing/proto/document.pb.h" +#include "icing/testing/common-matchers.h" +#include "icing/testing/tmp-directory.h" + +namespace icing { +namespace lib { + +namespace { + +using ::icing::lib::portable_equals_proto::EqualsProto; +using ::testing::A; +using ::testing::Eq; +using ::testing::Gt; +using ::testing::HasSubstr; +using ::testing::Not; +using ::testing::NotNull; +using ::testing::Pair; +using ::testing::Return; + +class PortableFileBackedProtoLogTest : public ::testing::Test { + protected: + // Adds a user-defined default construct because a const member variable may + // make the compiler accidentally delete the default constructor. + // https://stackoverflow.com/a/47368753 + PortableFileBackedProtoLogTest() {} + + void SetUp() override { + file_path_ = GetTestTempDir() + "/proto_log"; + filesystem_.DeleteFile(file_path_.c_str()); + } + + void TearDown() override { filesystem_.DeleteFile(file_path_.c_str()); } + + const Filesystem filesystem_; + std::string file_path_; + bool compress_ = true; + int64_t max_proto_size_ = 256 * 1024; // 256 KiB +}; + +TEST_F(PortableFileBackedProtoLogTest, Initialize) { + // max_proto_size must be greater than 0 + int invalid_max_proto_size = 0; + ASSERT_THAT(PortableFileBackedProtoLog<DocumentProto>::Create( + &filesystem_, file_path_, + PortableFileBackedProtoLog<DocumentProto>::Options( + compress_, invalid_max_proto_size)), + StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT)); + + ICING_ASSERT_OK_AND_ASSIGN( + PortableFileBackedProtoLog<DocumentProto>::CreateResult create_result, + PortableFileBackedProtoLog<DocumentProto>::Create( + &filesystem_, file_path_, + 
PortableFileBackedProtoLog<DocumentProto>::Options(compress_, + max_proto_size_))); + EXPECT_THAT(create_result.proto_log, NotNull()); + EXPECT_FALSE(create_result.has_data_loss()); + + // Can't recreate the same file with different options. + ASSERT_THAT(PortableFileBackedProtoLog<DocumentProto>::Create( + &filesystem_, file_path_, + PortableFileBackedProtoLog<DocumentProto>::Options( + !compress_, max_proto_size_)), + StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT)); +} + +TEST_F(PortableFileBackedProtoLogTest, ReservedSpaceForHeader) { + ICING_ASSERT_OK_AND_ASSIGN( + PortableFileBackedProtoLog<DocumentProto>::CreateResult create_result, + PortableFileBackedProtoLog<DocumentProto>::Create( + &filesystem_, file_path_, + PortableFileBackedProtoLog<DocumentProto>::Options(compress_, + max_proto_size_))); + + // With no protos written yet, the log should be minimum the size of the + // reserved header space. + ASSERT_EQ(filesystem_.GetFileSize(file_path_.c_str()), kHeaderReservedBytes); +} + +TEST_F(PortableFileBackedProtoLogTest, WriteProtoTooLarge) { + int max_proto_size = 1; + ICING_ASSERT_OK_AND_ASSIGN( + PortableFileBackedProtoLog<DocumentProto>::CreateResult create_result, + PortableFileBackedProtoLog<DocumentProto>::Create( + &filesystem_, file_path_, + PortableFileBackedProtoLog<DocumentProto>::Options(compress_, + max_proto_size))); + auto proto_log = std::move(create_result.proto_log); + ASSERT_FALSE(create_result.has_data_loss()); + + DocumentProto document = DocumentBuilder().SetKey("namespace", "uri").Build(); + + // Proto is too large for the max_proto_size_in + ASSERT_THAT(proto_log->WriteProto(document), + StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT)); +} + +TEST_F(PortableFileBackedProtoLogTest, ReadProtoWrongKProtoMagic) { + ICING_ASSERT_OK_AND_ASSIGN( + PortableFileBackedProtoLog<DocumentProto>::CreateResult create_result, + PortableFileBackedProtoLog<DocumentProto>::Create( + &filesystem_, file_path_, + 
PortableFileBackedProtoLog<DocumentProto>::Options(compress_, + max_proto_size_))); + auto proto_log = std::move(create_result.proto_log); + ASSERT_FALSE(create_result.has_data_loss()); + + // Write a proto + DocumentProto document = DocumentBuilder().SetKey("namespace", "uri").Build(); + + ICING_ASSERT_OK_AND_ASSIGN(int64_t file_offset, + proto_log->WriteProto(document)); + + // The 4 bytes of metadata that just doesn't have the same kProtoMagic + // specified in file-backed-proto-log.h + uint32_t wrong_magic = 0x7E000000; + + // Sanity check that we opened the file correctly + int fd = filesystem_.OpenForWrite(file_path_.c_str()); + ASSERT_GT(fd, 0); + + // Write the wrong kProtoMagic in, kProtoMagics are stored at the beginning of + // a proto entry. + filesystem_.PWrite(fd, file_offset, &wrong_magic, sizeof(wrong_magic)); + + ASSERT_THAT(proto_log->ReadProto(file_offset), + StatusIs(libtextclassifier3::StatusCode::INTERNAL)); +} + +TEST_F(PortableFileBackedProtoLogTest, ReadWriteUncompressedProto) { + int last_offset; + { + ICING_ASSERT_OK_AND_ASSIGN( + PortableFileBackedProtoLog<DocumentProto>::CreateResult create_result, + PortableFileBackedProtoLog<DocumentProto>::Create( + &filesystem_, file_path_, + PortableFileBackedProtoLog<DocumentProto>::Options( + /*compress_in=*/false, max_proto_size_))); + auto proto_log = std::move(create_result.proto_log); + ASSERT_FALSE(create_result.has_data_loss()); + + // Write the first proto + DocumentProto document1 = + DocumentBuilder().SetKey("namespace1", "uri1").Build(); + + ICING_ASSERT_OK_AND_ASSIGN(int written_position, + proto_log->WriteProto(document1)); + + int document1_offset = written_position; + + // Check that what we read is what we wrote + ASSERT_THAT(proto_log->ReadProto(written_position), + IsOkAndHolds(EqualsProto(document1))); + + // Write a second proto that's close to the max size. Leave some room for + // the rest of the proto properties. 
+ std::string long_str(max_proto_size_ - 1024, 'a'); + DocumentProto document2 = DocumentBuilder() + .SetKey("namespace2", "uri2") + .AddStringProperty("long_str", long_str) + .Build(); + + ICING_ASSERT_OK_AND_ASSIGN(written_position, + proto_log->WriteProto(document2)); + + int document2_offset = written_position; + last_offset = written_position; + ASSERT_GT(document2_offset, document1_offset); + + // Check the second proto + ASSERT_THAT(proto_log->ReadProto(written_position), + IsOkAndHolds(EqualsProto(document2))); + + ICING_ASSERT_OK(proto_log->PersistToDisk()); + } + + { + // Make a new proto_log with the same file_path, and make sure we + // can still write to the same underlying file. + ICING_ASSERT_OK_AND_ASSIGN( + PortableFileBackedProtoLog<DocumentProto>::CreateResult create_result, + PortableFileBackedProtoLog<DocumentProto>::Create( + &filesystem_, file_path_, + PortableFileBackedProtoLog<DocumentProto>::Options( + /*compress_in=*/false, max_proto_size_))); + auto recreated_proto_log = std::move(create_result.proto_log); + ASSERT_FALSE(create_result.has_data_loss()); + + // Write a third proto + DocumentProto document3 = + DocumentBuilder().SetKey("namespace3", "uri3").Build(); + + ASSERT_THAT(recreated_proto_log->WriteProto(document3), + IsOkAndHolds(Gt(last_offset))); + } +} + +TEST_F(PortableFileBackedProtoLogTest, ReadWriteCompressedProto) { + int last_offset; + + { + ICING_ASSERT_OK_AND_ASSIGN( + PortableFileBackedProtoLog<DocumentProto>::CreateResult create_result, + PortableFileBackedProtoLog<DocumentProto>::Create( + &filesystem_, file_path_, + PortableFileBackedProtoLog<DocumentProto>::Options( + /*compress_in=*/true, max_proto_size_))); + auto proto_log = std::move(create_result.proto_log); + ASSERT_FALSE(create_result.has_data_loss()); + + // Write the first proto + DocumentProto document1 = + DocumentBuilder().SetKey("namespace1", "uri1").Build(); + + ICING_ASSERT_OK_AND_ASSIGN(int written_position, + proto_log->WriteProto(document1)); + + 
int document1_offset = written_position; + + // Check that what we read is what we wrote + ASSERT_THAT(proto_log->ReadProto(written_position), + IsOkAndHolds(EqualsProto(document1))); + + // Write a second proto that's close to the max size. Leave some room for + // the rest of the proto properties. + std::string long_str(max_proto_size_ - 1024, 'a'); + DocumentProto document2 = DocumentBuilder() + .SetKey("namespace2", "uri2") + .AddStringProperty("long_str", long_str) + .Build(); + + ICING_ASSERT_OK_AND_ASSIGN(written_position, + proto_log->WriteProto(document2)); + + int document2_offset = written_position; + last_offset = written_position; + ASSERT_GT(document2_offset, document1_offset); + + // Check the second proto + ASSERT_THAT(proto_log->ReadProto(written_position), + IsOkAndHolds(EqualsProto(document2))); + + ICING_ASSERT_OK(proto_log->PersistToDisk()); + } + + { + // Make a new proto_log with the same file_path, and make sure we + // can still write to the same underlying file. 
+ ICING_ASSERT_OK_AND_ASSIGN( + PortableFileBackedProtoLog<DocumentProto>::CreateResult create_result, + PortableFileBackedProtoLog<DocumentProto>::Create( + &filesystem_, file_path_, + PortableFileBackedProtoLog<DocumentProto>::Options( + /*compress_in=*/true, max_proto_size_))); + auto recreated_proto_log = std::move(create_result.proto_log); + ASSERT_FALSE(create_result.has_data_loss()); + + // Write a third proto + DocumentProto document3 = + DocumentBuilder().SetKey("namespace3", "uri3").Build(); + + ASSERT_THAT(recreated_proto_log->WriteProto(document3), + IsOkAndHolds(Gt(last_offset))); + } +} + +TEST_F(PortableFileBackedProtoLogTest, CorruptHeader) { + { + ICING_ASSERT_OK_AND_ASSIGN( + PortableFileBackedProtoLog<DocumentProto>::CreateResult create_result, + PortableFileBackedProtoLog<DocumentProto>::Create( + &filesystem_, file_path_, + PortableFileBackedProtoLog<DocumentProto>::Options( + compress_, max_proto_size_))); + auto recreated_proto_log = std::move(create_result.proto_log); + EXPECT_FALSE(create_result.has_data_loss()); + } + + int corrupt_value = 24; + + // Offset after the kMagic and the header_checksum. 
+ int offset_after_checksum = 8; + filesystem_.PWrite(file_path_.c_str(), offset_after_checksum, &corrupt_value, + sizeof(corrupt_value)); + + { + // Reinitialize the same proto_log + ASSERT_THAT(PortableFileBackedProtoLog<DocumentProto>::Create( + &filesystem_, file_path_, + PortableFileBackedProtoLog<DocumentProto>::Options( + compress_, max_proto_size_)), + StatusIs(libtextclassifier3::StatusCode::INTERNAL, + HasSubstr("Invalid header checksum"))); + } +} + +TEST_F(PortableFileBackedProtoLogTest, DifferentMagic) { + { + ICING_ASSERT_OK_AND_ASSIGN( + PortableFileBackedProtoLog<DocumentProto>::CreateResult create_result, + PortableFileBackedProtoLog<DocumentProto>::Create( + &filesystem_, file_path_, + PortableFileBackedProtoLog<DocumentProto>::Options( + compress_, max_proto_size_))); + auto recreated_proto_log = std::move(create_result.proto_log); + EXPECT_FALSE(create_result.has_data_loss()); + + // Corrupt the magic that's stored at the beginning of the header. + int invalid_magic = -1; + filesystem_.PWrite(file_path_.c_str(), /*offset=*/0, &invalid_magic, + sizeof(invalid_magic)); + } + + { + // Reinitialize the same proto_log + ASSERT_THAT(PortableFileBackedProtoLog<DocumentProto>::Create( + &filesystem_, file_path_, + PortableFileBackedProtoLog<DocumentProto>::Options( + compress_, max_proto_size_)), + StatusIs(libtextclassifier3::StatusCode::INTERNAL, + HasSubstr("Invalid header kMagic"))); + } +} + +TEST_F(PortableFileBackedProtoLogTest, CorruptContent) { + { + ICING_ASSERT_OK_AND_ASSIGN( + PortableFileBackedProtoLog<DocumentProto>::CreateResult create_result, + PortableFileBackedProtoLog<DocumentProto>::Create( + &filesystem_, file_path_, + PortableFileBackedProtoLog<DocumentProto>::Options( + compress_, max_proto_size_))); + auto proto_log = std::move(create_result.proto_log); + EXPECT_FALSE(create_result.has_data_loss()); + + DocumentProto document = + DocumentBuilder().SetKey("namespace1", "uri1").Build(); + + // Write and persist an document. 
+ ICING_ASSERT_OK_AND_ASSIGN(int document_offset, + proto_log->WriteProto(document)); + ICING_ASSERT_OK(proto_log->PersistToDisk()); + + // "Corrupt" the content written in the log. + document.set_uri("invalid"); + std::string serialized_document = document.SerializeAsString(); + filesystem_.PWrite(file_path_.c_str(), document_offset, + serialized_document.data(), serialized_document.size()); + } + + { + // We can recover, but we have data loss. + ICING_ASSERT_OK_AND_ASSIGN( + PortableFileBackedProtoLog<DocumentProto>::CreateResult create_result, + PortableFileBackedProtoLog<DocumentProto>::Create( + &filesystem_, file_path_, + PortableFileBackedProtoLog<DocumentProto>::Options( + compress_, max_proto_size_))); + auto proto_log = std::move(create_result.proto_log); + ASSERT_TRUE(create_result.has_data_loss()); + ASSERT_THAT(create_result.data_loss, Eq(DataLoss::COMPLETE)); + + // Lost everything in the log since the rewind position doesn't help if + // there's been data corruption within the persisted region + ASSERT_EQ(filesystem_.GetFileSize(file_path_.c_str()), + kHeaderReservedBytes); + } +} + +TEST_F(PortableFileBackedProtoLogTest, PersistToDisk) { + DocumentProto document1 = + DocumentBuilder().SetKey("namespace1", "uri1").Build(); + DocumentProto document2 = + DocumentBuilder().SetKey("namespace2", "uri2").Build(); + int document1_offset, document2_offset; + int log_size; + + { + ICING_ASSERT_OK_AND_ASSIGN( + PortableFileBackedProtoLog<DocumentProto>::CreateResult create_result, + PortableFileBackedProtoLog<DocumentProto>::Create( + &filesystem_, file_path_, + PortableFileBackedProtoLog<DocumentProto>::Options( + compress_, max_proto_size_))); + auto proto_log = std::move(create_result.proto_log); + ASSERT_FALSE(create_result.has_data_loss()); + + // Write and persist the first proto + ICING_ASSERT_OK_AND_ASSIGN(document1_offset, + proto_log->WriteProto(document1)); + ICING_ASSERT_OK(proto_log->PersistToDisk()); + + // Write, but don't explicitly persist the 
second proto + ICING_ASSERT_OK_AND_ASSIGN(document2_offset, + proto_log->WriteProto(document2)); + + // Check that what we read is what we wrote + ASSERT_THAT(proto_log->ReadProto(document1_offset), + IsOkAndHolds(EqualsProto(document1))); + ASSERT_THAT(proto_log->ReadProto(document2_offset), + IsOkAndHolds(EqualsProto(document2))); + + log_size = filesystem_.GetFileSize(file_path_.c_str()); + ASSERT_GT(log_size, 0); + } + + { + // The header rewind position and checksum aren't updated in this "system + // crash" scenario. + + std::string bad_proto = + "some incomplete proto that we didn't finish writing before the " + "system crashed"; + filesystem_.PWrite(file_path_.c_str(), log_size, bad_proto.data(), + bad_proto.size()); + + // Double check that we actually wrote something to the underlying file + ASSERT_GT(filesystem_.GetFileSize(file_path_.c_str()), log_size); + } + + { + // We can recover, but we have data loss + ICING_ASSERT_OK_AND_ASSIGN( + PortableFileBackedProtoLog<DocumentProto>::CreateResult create_result, + PortableFileBackedProtoLog<DocumentProto>::Create( + &filesystem_, file_path_, + PortableFileBackedProtoLog<DocumentProto>::Options( + compress_, max_proto_size_))); + auto proto_log = std::move(create_result.proto_log); + ASSERT_TRUE(create_result.has_data_loss()); + ASSERT_THAT(create_result.data_loss, Eq(DataLoss::PARTIAL)); + + // Check that everything was persisted across instances + ASSERT_THAT(proto_log->ReadProto(document1_offset), + IsOkAndHolds(EqualsProto(document1))); + ASSERT_THAT(proto_log->ReadProto(document2_offset), + IsOkAndHolds(EqualsProto(document2))); + + // We correctly rewound to the last good state. 
+ ASSERT_EQ(log_size, filesystem_.GetFileSize(file_path_.c_str())); + } +} + +TEST_F(PortableFileBackedProtoLogTest, Iterator) { + DocumentProto document1 = + DocumentBuilder().SetKey("namespace", "uri1").Build(); + DocumentProto document2 = + DocumentBuilder().SetKey("namespace", "uri2").Build(); + + ICING_ASSERT_OK_AND_ASSIGN( + PortableFileBackedProtoLog<DocumentProto>::CreateResult create_result, + PortableFileBackedProtoLog<DocumentProto>::Create( + &filesystem_, file_path_, + PortableFileBackedProtoLog<DocumentProto>::Options(compress_, + max_proto_size_))); + auto proto_log = std::move(create_result.proto_log); + ASSERT_FALSE(create_result.has_data_loss()); + + { + // Empty iterator + auto iterator = proto_log->GetIterator(); + ASSERT_THAT(iterator.Advance(), + StatusIs(libtextclassifier3::StatusCode::OUT_OF_RANGE)); + } + + { + // Iterates through some documents + ICING_ASSERT_OK(proto_log->WriteProto(document1)); + ICING_ASSERT_OK(proto_log->WriteProto(document2)); + auto iterator = proto_log->GetIterator(); + // 1st proto + ICING_ASSERT_OK(iterator.Advance()); + ASSERT_THAT(proto_log->ReadProto(iterator.GetOffset()), + IsOkAndHolds(EqualsProto(document1))); + // 2nd proto + ICING_ASSERT_OK(iterator.Advance()); + ASSERT_THAT(proto_log->ReadProto(iterator.GetOffset()), + IsOkAndHolds(EqualsProto(document2))); + // Tries to advance + ASSERT_THAT(iterator.Advance(), + StatusIs(libtextclassifier3::StatusCode::OUT_OF_RANGE)); + } + + { + // Iterator with bad filesystem + MockFilesystem mock_filesystem; + ON_CALL(mock_filesystem, GetFileSize(A<const char *>())) + .WillByDefault(Return(Filesystem::kBadFileSize)); + PortableFileBackedProtoLog<DocumentProto>::Iterator bad_iterator( + mock_filesystem, file_path_, /*initial_offset=*/0); + ASSERT_THAT(bad_iterator.Advance(), + StatusIs(libtextclassifier3::StatusCode::OUT_OF_RANGE)); + } +} + +TEST_F(PortableFileBackedProtoLogTest, ComputeChecksum) { + DocumentProto document = DocumentBuilder().SetKey("namespace", 
"uri").Build(); + Crc32 checksum; + + { + ICING_ASSERT_OK_AND_ASSIGN( + PortableFileBackedProtoLog<DocumentProto>::CreateResult create_result, + PortableFileBackedProtoLog<DocumentProto>::Create( + &filesystem_, file_path_, + PortableFileBackedProtoLog<DocumentProto>::Options( + compress_, max_proto_size_))); + auto proto_log = std::move(create_result.proto_log); + ASSERT_FALSE(create_result.has_data_loss()); + + ICING_EXPECT_OK(proto_log->WriteProto(document)); + + ICING_ASSERT_OK_AND_ASSIGN(checksum, proto_log->ComputeChecksum()); + + // Calling it twice with no changes should get us the same checksum + EXPECT_THAT(proto_log->ComputeChecksum(), IsOkAndHolds(Eq(checksum))); + } + + { + ICING_ASSERT_OK_AND_ASSIGN( + PortableFileBackedProtoLog<DocumentProto>::CreateResult create_result, + PortableFileBackedProtoLog<DocumentProto>::Create( + &filesystem_, file_path_, + PortableFileBackedProtoLog<DocumentProto>::Options( + compress_, max_proto_size_))); + auto proto_log = std::move(create_result.proto_log); + ASSERT_FALSE(create_result.has_data_loss()); + + // Checksum should be consistent across instances + EXPECT_THAT(proto_log->ComputeChecksum(), IsOkAndHolds(Eq(checksum))); + + // PersistToDisk shouldn't affect the checksum value + ICING_EXPECT_OK(proto_log->PersistToDisk()); + EXPECT_THAT(proto_log->ComputeChecksum(), IsOkAndHolds(Eq(checksum))); + + // Check that modifying the log leads to a different checksum + ICING_EXPECT_OK(proto_log->WriteProto(document)); + EXPECT_THAT(proto_log->ComputeChecksum(), IsOkAndHolds(Not(Eq(checksum)))); + } +} + +TEST_F(PortableFileBackedProtoLogTest, EraseProtoShouldSetZero) { + DocumentProto document1 = + DocumentBuilder().SetKey("namespace", "uri1").Build(); + + ICING_ASSERT_OK_AND_ASSIGN( + PortableFileBackedProtoLog<DocumentProto>::CreateResult create_result, + PortableFileBackedProtoLog<DocumentProto>::Create( + &filesystem_, file_path_, + PortableFileBackedProtoLog<DocumentProto>::Options(compress_, + max_proto_size_))); 
+ auto proto_log = std::move(create_result.proto_log); + ASSERT_FALSE(create_result.has_data_loss()); + + // Writes and erases proto + ICING_ASSERT_OK_AND_ASSIGN(int64_t document1_offset, + proto_log->WriteProto(document1)); + ICING_ASSERT_OK(proto_log->EraseProto(document1_offset)); + + // Checks if the erased area is set to 0. + int64_t file_size = filesystem_.GetFileSize(file_path_.c_str()); + MemoryMappedFile mmapped_file(filesystem_, file_path_, + MemoryMappedFile::Strategy::READ_ONLY); + + // document1_offset + sizeof(int) is the start byte of the proto where + // sizeof(int) is the size of the proto metadata. + mmapped_file.Remap(document1_offset + sizeof(int), file_size - 1); + for (size_t i = 0; i < mmapped_file.region_size(); ++i) { + ASSERT_THAT(mmapped_file.region()[i], Eq(0)); + } +} + +TEST_F(PortableFileBackedProtoLogTest, EraseProtoShouldReturnNotFound) { + DocumentProto document1 = + DocumentBuilder().SetKey("namespace", "uri1").Build(); + DocumentProto document2 = + DocumentBuilder().SetKey("namespace", "uri2").Build(); + + ICING_ASSERT_OK_AND_ASSIGN( + PortableFileBackedProtoLog<DocumentProto>::CreateResult create_result, + PortableFileBackedProtoLog<DocumentProto>::Create( + &filesystem_, file_path_, + PortableFileBackedProtoLog<DocumentProto>::Options(compress_, + max_proto_size_))); + auto proto_log = std::move(create_result.proto_log); + ASSERT_FALSE(create_result.has_data_loss()); + + // Writes 2 protos + ICING_ASSERT_OK_AND_ASSIGN(int64_t document1_offset, + proto_log->WriteProto(document1)); + ICING_ASSERT_OK_AND_ASSIGN(int64_t document2_offset, + proto_log->WriteProto(document2)); + + // Erases the first proto + ICING_ASSERT_OK(proto_log->EraseProto(document1_offset)); + + // The first proto has been erased. + ASSERT_THAT(proto_log->ReadProto(document1_offset), + StatusIs(libtextclassifier3::StatusCode::NOT_FOUND)); + // The second proto should be returned. 
+ ASSERT_THAT(proto_log->ReadProto(document2_offset), + IsOkAndHolds(EqualsProto(document2))); +} + +TEST_F(PortableFileBackedProtoLogTest, ChecksumShouldBeCorrectWithErasedProto) { + DocumentProto document1 = + DocumentBuilder().SetKey("namespace", "uri1").Build(); + DocumentProto document2 = + DocumentBuilder().SetKey("namespace", "uri2").Build(); + DocumentProto document3 = + DocumentBuilder().SetKey("namespace", "uri3").Build(); + DocumentProto document4 = + DocumentBuilder().SetKey("namespace", "uri4").Build(); + + int64_t document2_offset; + int64_t document3_offset; + + { + // Erase data after the rewind position. This won't update the checksum + // immediately. + ICING_ASSERT_OK_AND_ASSIGN( + PortableFileBackedProtoLog<DocumentProto>::CreateResult create_result, + PortableFileBackedProtoLog<DocumentProto>::Create( + &filesystem_, file_path_, + PortableFileBackedProtoLog<DocumentProto>::Options( + compress_, max_proto_size_))); + auto proto_log = std::move(create_result.proto_log); + ASSERT_FALSE(create_result.has_data_loss()); + + // Writes 3 protos + ICING_ASSERT_OK_AND_ASSIGN(int64_t document1_offset, + proto_log->WriteProto(document1)); + ICING_ASSERT_OK_AND_ASSIGN(document2_offset, + proto_log->WriteProto(document2)); + ICING_ASSERT_OK_AND_ASSIGN(document3_offset, + proto_log->WriteProto(document3)); + + // Erases the 1st proto, checksum won't be updated immediately because the + // rewind position is 0. + ICING_ASSERT_OK(proto_log->EraseProto(document1_offset)); + + EXPECT_THAT(proto_log->ComputeChecksum(), + IsOkAndHolds(Eq(Crc32(2175574628)))); + } // New checksum is updated in destructor. + + { + // Erase data before the rewind position. This will update the checksum + // immediately. 
+ ICING_ASSERT_OK_AND_ASSIGN( + PortableFileBackedProtoLog<DocumentProto>::CreateResult create_result, + PortableFileBackedProtoLog<DocumentProto>::Create( + &filesystem_, file_path_, + PortableFileBackedProtoLog<DocumentProto>::Options( + compress_, max_proto_size_))); + auto proto_log = std::move(create_result.proto_log); + ASSERT_FALSE(create_result.has_data_loss()); + + // Erases the 2nd proto that is now before the rewind position. Checksum + // is updated. + ICING_ASSERT_OK(proto_log->EraseProto(document2_offset)); + + EXPECT_THAT(proto_log->ComputeChecksum(), + IsOkAndHolds(Eq(Crc32(790877774)))); + } + + { + // Append data and erase data before the rewind position. This will update + // the checksum twice: in EraseProto() and destructor. + ICING_ASSERT_OK_AND_ASSIGN( + PortableFileBackedProtoLog<DocumentProto>::CreateResult create_result, + PortableFileBackedProtoLog<DocumentProto>::Create( + &filesystem_, file_path_, + PortableFileBackedProtoLog<DocumentProto>::Options( + compress_, max_proto_size_))); + auto proto_log = std::move(create_result.proto_log); + ASSERT_FALSE(create_result.has_data_loss()); + + // Append a new document which is after the rewind position. + ICING_ASSERT_OK(proto_log->WriteProto(document4)); + + // Erases the 3rd proto that is now before the rewind position. Checksum + // is updated. + ICING_ASSERT_OK(proto_log->EraseProto(document3_offset)); + + EXPECT_THAT(proto_log->ComputeChecksum(), + IsOkAndHolds(Eq(Crc32(2344803210)))); + } // Checksum is updated with the newly appended document. + + { + // A successful creation means that the checksum matches. 
+ ICING_ASSERT_OK_AND_ASSIGN( + PortableFileBackedProtoLog<DocumentProto>::CreateResult create_result, + PortableFileBackedProtoLog<DocumentProto>::Create( + &filesystem_, file_path_, + PortableFileBackedProtoLog<DocumentProto>::Options( + compress_, max_proto_size_))); + auto proto_log = std::move(create_result.proto_log); + EXPECT_FALSE(create_result.has_data_loss()); + } +} + +} // namespace +} // namespace lib +} // namespace icing diff --git a/icing/icing-search-engine_test.cc b/icing/icing-search-engine_test.cc index a281f22..c1de0f0 100644 --- a/icing/icing-search-engine_test.cc +++ b/icing/icing-search-engine_test.cc @@ -97,8 +97,11 @@ constexpr PropertyConfigProto_Cardinality_Code CARDINALITY_REPEATED = constexpr StringIndexingConfig_TokenizerType_Code TOKENIZER_PLAIN = StringIndexingConfig_TokenizerType_Code_PLAIN; +constexpr StringIndexingConfig_TokenizerType_Code TOKENIZER_NONE = + StringIndexingConfig_TokenizerType_Code_NONE; constexpr TermMatchType_Code MATCH_PREFIX = TermMatchType_Code_PREFIX; +constexpr TermMatchType_Code MATCH_NONE = TermMatchType_Code_UNKNOWN; // For mocking purpose, we allow tests to provide a custom Filesystem. class TestIcingSearchEngine : public IcingSearchEngine { @@ -5726,6 +5729,88 @@ TEST_F(IcingSearchEngineTest, RestoreIndexLoseIndex) { } } +TEST_F(IcingSearchEngineTest, + DocumentWithNoIndexedContentDoesntCauseRestoreIndex) { + // 1. Create an index with a single document in it that has no indexed + // content. + { + IcingSearchEngine icing(GetDefaultIcingOptions(), GetTestJniCache()); + ASSERT_THAT(icing.Initialize().status(), ProtoIsOk()); + + // Set a schema for a single type that has no indexed properties. 
+ SchemaProto schema = + SchemaBuilder() + .AddType(SchemaTypeConfigBuilder().SetType("Message").AddProperty( + PropertyConfigBuilder() + .SetName("unindexedField") + .SetDataTypeString(MATCH_NONE, TOKENIZER_NONE) + .SetCardinality(CARDINALITY_REQUIRED))) + .Build(); + ASSERT_THAT(icing.SetSchema(schema).status(), ProtoIsOk()); + + // Add a document that contains no indexed content. + DocumentProto document = + DocumentBuilder() + .SetKey("icing", "fake_type/0") + .SetSchema("Message") + .AddStringProperty("unindexedField", + "Don't you dare search over this!") + .Build(); + EXPECT_THAT(icing.Put(document).status(), ProtoIsOk()); + } + + // 2. Create the index again. This should NOT trigger a recovery of any kind. + { + IcingSearchEngine icing(GetDefaultIcingOptions(), GetTestJniCache()); + InitializeResultProto init_result = icing.Initialize(); + EXPECT_THAT(init_result.status(), ProtoIsOk()); + EXPECT_THAT(init_result.initialize_stats().document_store_data_status(), + Eq(InitializeStatsProto::NO_DATA_LOSS)); + EXPECT_THAT(init_result.initialize_stats().document_store_recovery_cause(), + Eq(InitializeStatsProto::NONE)); + EXPECT_THAT(init_result.initialize_stats().schema_store_recovery_cause(), + Eq(InitializeStatsProto::NONE)); + EXPECT_THAT(init_result.initialize_stats().index_restoration_cause(), + Eq(InitializeStatsProto::NONE)); + } +} + +TEST_F(IcingSearchEngineTest, + DocumentWithNoValidIndexedContentDoesntCauseRestoreIndex) { + // 1. Create an index with a single document in it that has no valid indexed + // tokens in its content. + { + IcingSearchEngine icing(GetDefaultIcingOptions(), GetTestJniCache()); + ASSERT_THAT(icing.Initialize().status(), ProtoIsOk()); + + // Set a schema for a single type that has no indexed properties. + ASSERT_THAT(icing.SetSchema(CreateMessageSchema()).status(), ProtoIsOk()); + + // Add a document that contains no valid indexed content - just punctuation. 
+ DocumentProto document = DocumentBuilder() + .SetKey("icing", "fake_type/0") + .SetSchema("Message") + .AddStringProperty("body", "?...!") + .Build(); + EXPECT_THAT(icing.Put(document).status(), ProtoIsOk()); + } + + // 2. Create the index again. This should NOT trigger a recovery of any kind. + { + IcingSearchEngine icing(GetDefaultIcingOptions(), GetTestJniCache()); + InitializeResultProto init_result = icing.Initialize(); + EXPECT_THAT(init_result.status(), ProtoIsOk()); + EXPECT_THAT(init_result.initialize_stats().document_store_data_status(), + Eq(InitializeStatsProto::NO_DATA_LOSS)); + EXPECT_THAT(init_result.initialize_stats().document_store_recovery_cause(), + Eq(InitializeStatsProto::NONE)); + EXPECT_THAT(init_result.initialize_stats().schema_store_recovery_cause(), + Eq(InitializeStatsProto::NONE)); + EXPECT_THAT(init_result.initialize_stats().index_restoration_cause(), + Eq(InitializeStatsProto::NONE)); + } +} + TEST_F(IcingSearchEngineTest, IndexingDocMergeFailureResets) { DocumentProto document = DocumentBuilder() .SetKey("icing", "fake_type/0") diff --git a/icing/index/hit/hit.cc b/icing/index/hit/hit.cc index 2a5a0d9..887e6e4 100644 --- a/icing/index/hit/hit.cc +++ b/icing/index/hit/hit.cc @@ -67,9 +67,10 @@ Hit::Hit(SectionId section_id, DocumentId document_id, &temp_value); bit_util::BitfieldSet(section_id, kNumFlags, kSectionIdBits, &temp_value); bit_util::BitfieldSet(term_frequency != kDefaultTermFrequency, - kHasTermFrequency, 1, &temp_value); - bit_util::BitfieldSet(is_prefix_hit, kPrefixHit, 1, &temp_value); - bit_util::BitfieldSet(is_in_prefix_section, kInPrefixSection, 1, &temp_value); + kHasTermFrequency, /*len=*/1, &temp_value); + bit_util::BitfieldSet(is_prefix_hit, kPrefixHit, /*len=*/1, &temp_value); + bit_util::BitfieldSet(is_in_prefix_section, kInPrefixSection, + /*len=*/1, &temp_value); value_ = temp_value; } diff --git a/icing/index/index-processor.cc b/icing/index/index-processor.cc index 09dda41..6d8632f 100644 --- 
a/icing/index/index-processor.cc +++ b/icing/index/index-processor.cc @@ -64,6 +64,7 @@ libtextclassifier3::Status IndexProcessor::IndexDocument( "DocumentId %d must be greater than last added document_id %d", document_id, index_->last_added_document_id())); } + index_->set_last_added_document_id(document_id); uint32_t num_tokens = 0; libtextclassifier3::Status overall_status; for (const TokenizedSection& section : tokenized_document.sections()) { diff --git a/icing/index/index-processor_test.cc b/icing/index/index-processor_test.cc index fc14800..8a6a9f5 100644 --- a/icing/index/index-processor_test.cc +++ b/icing/index/index-processor_test.cc @@ -261,7 +261,23 @@ TEST_F(IndexProcessorTest, NoTermMatchTypeContent) { document)); EXPECT_THAT(index_processor_->IndexDocument(tokenized_document, kDocumentId0), IsOk()); - EXPECT_THAT(index_->last_added_document_id(), Eq(kInvalidDocumentId)); + EXPECT_THAT(index_->last_added_document_id(), Eq(kDocumentId0)); +} + +TEST_F(IndexProcessorTest, NoValidContent) { + DocumentProto document = + DocumentBuilder() + .SetKey("icing", "fake_type/1") + .SetSchema(std::string(kFakeType)) + .AddStringProperty(std::string(kExactProperty), "?...!") + .Build(); + ICING_ASSERT_OK_AND_ASSIGN( + TokenizedDocument tokenized_document, + TokenizedDocument::Create(schema_store_.get(), lang_segmenter_.get(), + document)); + EXPECT_THAT(index_processor_->IndexDocument(tokenized_document, kDocumentId0), + IsOk()); + EXPECT_THAT(index_->last_added_document_id(), Eq(kDocumentId0)); } TEST_F(IndexProcessorTest, OneDoc) { diff --git a/icing/index/index.h b/icing/index/index.h index b7021ca..eab5be8 100644 --- a/icing/index/index.h +++ b/icing/index/index.h @@ -127,6 +127,16 @@ class Index { return main_index_->last_added_document_id(); } + // Sets last_added_document_id to document_id so long as document_id > + // last_added_document_id() + void set_last_added_document_id(DocumentId document_id) { + DocumentId lite_document_id = 
lite_index_->last_added_document_id(); + if (lite_document_id == kInvalidDocumentId || + document_id >= lite_document_id) { + lite_index_->set_last_added_document_id(document_id); + } + } + // Returns debug information for the index in out. // verbosity <= 0, simplest debug information - just the lexicons and lite // index. diff --git a/icing/index/index_test.cc b/icing/index/index_test.cc index de4edf8..16593ef 100644 --- a/icing/index/index_test.cc +++ b/icing/index/index_test.cc @@ -153,8 +153,6 @@ TEST_F(IndexTest, EmptyIndex) { index_->GetIterator("foo", kSectionIdMaskAll, TermMatchType::EXACT_ONLY)); EXPECT_THAT(itr->Advance(), StatusIs(libtextclassifier3::StatusCode::RESOURCE_EXHAUSTED)); - - EXPECT_THAT(index_->last_added_document_id(), Eq(kInvalidDocumentId)); } TEST_F(IndexTest, EmptyIndexAfterMerge) { @@ -172,8 +170,6 @@ TEST_F(IndexTest, EmptyIndexAfterMerge) { index_->GetIterator("foo", kSectionIdMaskAll, TermMatchType::EXACT_ONLY)); EXPECT_THAT(itr->Advance(), StatusIs(libtextclassifier3::StatusCode::RESOURCE_EXHAUSTED)); - - EXPECT_THAT(index_->last_added_document_id(), Eq(kInvalidDocumentId)); } TEST_F(IndexTest, AdvancePastEnd) { @@ -238,8 +234,6 @@ TEST_F(IndexTest, SingleHitSingleTermIndex) { EXPECT_THAT(GetHits(std::move(itr)), ElementsAre(EqualsDocHitInfo( kDocumentId0, std::vector<SectionId>{kSectionId2}))); - - EXPECT_THAT(index_->last_added_document_id(), Eq(kDocumentId0)); } TEST_F(IndexTest, SingleHitSingleTermIndexAfterMerge) { @@ -256,8 +250,6 @@ TEST_F(IndexTest, SingleHitSingleTermIndexAfterMerge) { EXPECT_THAT(GetHits(std::move(itr)), ElementsAre(EqualsDocHitInfo( kDocumentId0, std::vector<SectionId>{kSectionId2}))); - - EXPECT_THAT(index_->last_added_document_id(), Eq(kDocumentId0)); } TEST_F(IndexTest, SingleHitMultiTermIndex) { @@ -273,8 +265,6 @@ TEST_F(IndexTest, SingleHitMultiTermIndex) { EXPECT_THAT(GetHits(std::move(itr)), ElementsAre(EqualsDocHitInfo( kDocumentId0, std::vector<SectionId>{kSectionId2}))); - - 
EXPECT_THAT(index_->last_added_document_id(), Eq(kDocumentId0)); } TEST_F(IndexTest, SingleHitMultiTermIndexAfterMerge) { @@ -292,8 +282,6 @@ TEST_F(IndexTest, SingleHitMultiTermIndexAfterMerge) { EXPECT_THAT(GetHits(std::move(itr)), ElementsAre(EqualsDocHitInfo( kDocumentId0, std::vector<SectionId>{kSectionId2}))); - - EXPECT_THAT(index_->last_added_document_id(), Eq(kDocumentId0)); } TEST_F(IndexTest, NoHitMultiTermIndex) { @@ -308,7 +296,6 @@ TEST_F(IndexTest, NoHitMultiTermIndex) { index_->GetIterator("baz", kSectionIdMaskAll, TermMatchType::EXACT_ONLY)); EXPECT_THAT(itr->Advance(), StatusIs(libtextclassifier3::StatusCode::RESOURCE_EXHAUSTED)); - EXPECT_THAT(index_->last_added_document_id(), Eq(kDocumentId0)); } TEST_F(IndexTest, NoHitMultiTermIndexAfterMerge) { @@ -325,7 +312,6 @@ TEST_F(IndexTest, NoHitMultiTermIndexAfterMerge) { index_->GetIterator("baz", kSectionIdMaskAll, TermMatchType::EXACT_ONLY)); EXPECT_THAT(itr->Advance(), StatusIs(libtextclassifier3::StatusCode::RESOURCE_EXHAUSTED)); - EXPECT_THAT(index_->last_added_document_id(), Eq(kDocumentId0)); } TEST_F(IndexTest, MultiHitMultiTermIndex) { @@ -352,7 +338,6 @@ TEST_F(IndexTest, MultiHitMultiTermIndex) { ElementsAre( EqualsDocHitInfo(kDocumentId2, std::vector<SectionId>{kSectionId3}), EqualsDocHitInfo(kDocumentId0, std::vector<SectionId>{kSectionId2}))); - EXPECT_THAT(index_->last_added_document_id(), Eq(kDocumentId2)); } TEST_F(IndexTest, MultiHitMultiTermIndexAfterMerge) { @@ -381,7 +366,6 @@ TEST_F(IndexTest, MultiHitMultiTermIndexAfterMerge) { ElementsAre( EqualsDocHitInfo(kDocumentId2, std::vector<SectionId>{kSectionId3}), EqualsDocHitInfo(kDocumentId0, std::vector<SectionId>{kSectionId2}))); - EXPECT_THAT(index_->last_added_document_id(), Eq(kDocumentId2)); } TEST_F(IndexTest, MultiHitSectionRestrict) { @@ -402,8 +386,6 @@ TEST_F(IndexTest, MultiHitSectionRestrict) { EXPECT_THAT(GetHits(std::move(itr)), ElementsAre(EqualsDocHitInfo( kDocumentId0, std::vector<SectionId>{kSectionId2}))); - - 
EXPECT_THAT(index_->last_added_document_id(), Eq(kDocumentId1)); } TEST_F(IndexTest, MultiHitSectionRestrictAfterMerge) { @@ -426,8 +408,6 @@ TEST_F(IndexTest, MultiHitSectionRestrictAfterMerge) { EXPECT_THAT(GetHits(std::move(itr)), ElementsAre(EqualsDocHitInfo( kDocumentId0, std::vector<SectionId>{kSectionId2}))); - - EXPECT_THAT(index_->last_added_document_id(), Eq(kDocumentId1)); } TEST_F(IndexTest, SingleHitDedupeIndex) { @@ -449,8 +429,6 @@ TEST_F(IndexTest, SingleHitDedupeIndex) { EXPECT_THAT(GetHits(std::move(itr)), ElementsAre(EqualsDocHitInfo( kDocumentId0, std::vector<SectionId>{kSectionId2}))); - - EXPECT_THAT(index_->last_added_document_id(), Eq(kDocumentId0)); } TEST_F(IndexTest, PrefixHit) { @@ -465,8 +443,6 @@ TEST_F(IndexTest, PrefixHit) { EXPECT_THAT(GetHits(std::move(itr)), ElementsAre(EqualsDocHitInfo( kDocumentId0, std::vector<SectionId>{kSectionId2}))); - - EXPECT_THAT(index_->last_added_document_id(), Eq(kDocumentId0)); } TEST_F(IndexTest, PrefixHitAfterMerge) { @@ -483,8 +459,6 @@ TEST_F(IndexTest, PrefixHitAfterMerge) { EXPECT_THAT(GetHits(std::move(itr)), ElementsAre(EqualsDocHitInfo( kDocumentId0, std::vector<SectionId>{kSectionId2}))); - - EXPECT_THAT(index_->last_added_document_id(), Eq(kDocumentId0)); } TEST_F(IndexTest, MultiPrefixHit) { @@ -506,8 +480,6 @@ TEST_F(IndexTest, MultiPrefixHit) { ElementsAre( EqualsDocHitInfo(kDocumentId1, std::vector<SectionId>{kSectionId3}), EqualsDocHitInfo(kDocumentId0, std::vector<SectionId>{kSectionId2}))); - - EXPECT_THAT(index_->last_added_document_id(), Eq(kDocumentId1)); } TEST_F(IndexTest, MultiPrefixHitAfterMerge) { @@ -531,8 +503,6 @@ TEST_F(IndexTest, MultiPrefixHitAfterMerge) { ElementsAre( EqualsDocHitInfo(kDocumentId1, std::vector<SectionId>{kSectionId3}), EqualsDocHitInfo(kDocumentId0, std::vector<SectionId>{kSectionId2}))); - - EXPECT_THAT(index_->last_added_document_id(), Eq(kDocumentId1)); } TEST_F(IndexTest, NoExactHitInPrefixQuery) { @@ -552,7 +522,6 @@ TEST_F(IndexTest, 
NoExactHitInPrefixQuery) { EXPECT_THAT(GetHits(std::move(itr)), ElementsAre(EqualsDocHitInfo( kDocumentId1, std::vector<SectionId>{kSectionId3}))); - EXPECT_THAT(index_->last_added_document_id(), Eq(kDocumentId1)); } TEST_F(IndexTest, NoExactHitInPrefixQueryAfterMerge) { @@ -574,7 +543,6 @@ TEST_F(IndexTest, NoExactHitInPrefixQueryAfterMerge) { EXPECT_THAT(GetHits(std::move(itr)), ElementsAre(EqualsDocHitInfo( kDocumentId1, std::vector<SectionId>{kSectionId3}))); - EXPECT_THAT(index_->last_added_document_id(), Eq(kDocumentId1)); } TEST_F(IndexTest, PrefixHitDedupe) { @@ -590,7 +558,6 @@ TEST_F(IndexTest, PrefixHitDedupe) { EXPECT_THAT(GetHits(std::move(itr)), ElementsAre(EqualsDocHitInfo( kDocumentId0, std::vector<SectionId>{kSectionId2}))); - EXPECT_THAT(index_->last_added_document_id(), Eq(kDocumentId0)); } TEST_F(IndexTest, PrefixHitDedupeAfterMerge) { @@ -608,7 +575,6 @@ TEST_F(IndexTest, PrefixHitDedupeAfterMerge) { EXPECT_THAT(GetHits(std::move(itr)), ElementsAre(EqualsDocHitInfo( kDocumentId0, std::vector<SectionId>{kSectionId2}))); - EXPECT_THAT(index_->last_added_document_id(), Eq(kDocumentId0)); } TEST_F(IndexTest, PrefixToString) { @@ -705,9 +671,11 @@ TEST_F(IndexTest, FullIndex) { std::default_random_engine random; std::vector<std::string> query_terms; + std::string prefix = "prefix"; for (int i = 0; i < 2600; ++i) { constexpr int kTokenSize = 5; - query_terms.push_back(RandomString(kAlNumAlphabet, kTokenSize, &random)); + query_terms.push_back(prefix + + RandomString(kAlNumAlphabet, kTokenSize, &random)); } DocumentId document_id = 0; @@ -716,7 +684,7 @@ TEST_F(IndexTest, FullIndex) { while (status.ok()) { for (int i = 0; i < 100; ++i) { Index::Editor edit = - index_->Edit(document_id, kSectionId2, TermMatchType::EXACT_ONLY, + index_->Edit(document_id, kSectionId2, TermMatchType::PREFIX, /*namespace_id=*/0); size_t idx = uniform(random); status = edit.BufferTerm(query_terms.at(idx).c_str()); @@ -733,11 +701,14 @@ TEST_F(IndexTest, FullIndex) { // 
Adding more hits should fail. Index::Editor edit = - index_->Edit(document_id + 1, kSectionId2, TermMatchType::EXACT_ONLY, + index_->Edit(document_id + 1, kSectionId2, TermMatchType::PREFIX, /*namespace_id=*/0); - EXPECT_THAT(edit.BufferTerm("foo"), IsOk()); - EXPECT_THAT(edit.BufferTerm("bar"), IsOk()); - EXPECT_THAT(edit.BufferTerm("baz"), IsOk()); + std::string term = prefix + "foo"; + EXPECT_THAT(edit.BufferTerm(term.c_str()), IsOk()); + term = prefix + "bar"; + EXPECT_THAT(edit.BufferTerm(term.c_str()), IsOk()); + term = prefix + "baz"; + EXPECT_THAT(edit.BufferTerm(term.c_str()), IsOk()); EXPECT_THAT(edit.IndexAllBufferedTerms(), StatusIs(libtextclassifier3::StatusCode::RESOURCE_EXHAUSTED)); @@ -745,12 +716,17 @@ TEST_F(IndexTest, FullIndex) { ICING_ASSERT_OK_AND_ASSIGN( std::unique_ptr<DocHitInfoIterator> itr, index_->GetIterator(query_terms.at(i).c_str(), kSectionIdMaskAll, - TermMatchType::EXACT_ONLY)); + TermMatchType::PREFIX)); // Each query term should contain at least one hit - there may have been // other hits for this term that were added. 
EXPECT_THAT(itr->Advance(), IsOk()); } - EXPECT_THAT(index_->last_added_document_id(), Eq(document_id - 1)); + ICING_ASSERT_OK_AND_ASSIGN( + std::unique_ptr<DocHitInfoIterator> last_itr, + index_->GetIterator(prefix.c_str(), kSectionIdMaskAll, + TermMatchType::PREFIX)); + EXPECT_THAT(last_itr->Advance(), IsOk()); + EXPECT_THAT(last_itr->doc_hit_info().document_id(), Eq(document_id - 1)); } TEST_F(IndexTest, FullIndexMerge) { @@ -761,9 +737,11 @@ TEST_F(IndexTest, FullIndexMerge) { std::default_random_engine random; std::vector<std::string> query_terms; + std::string prefix = "prefix"; for (int i = 0; i < 2600; ++i) { constexpr int kTokenSize = 5; - query_terms.push_back(RandomString(kAlNumAlphabet, kTokenSize, &random)); + query_terms.push_back(prefix + + RandomString(kAlNumAlphabet, kTokenSize, &random)); } DocumentId document_id = 0; @@ -772,7 +750,7 @@ TEST_F(IndexTest, FullIndexMerge) { while (status.ok()) { for (int i = 0; i < 100; ++i) { Index::Editor edit = - index_->Edit(document_id, kSectionId2, TermMatchType::EXACT_ONLY, + index_->Edit(document_id, kSectionId2, TermMatchType::PREFIX, /*namespace_id=*/0); size_t idx = uniform(random); status = edit.BufferTerm(query_terms.at(idx).c_str()); @@ -791,30 +769,45 @@ TEST_F(IndexTest, FullIndexMerge) { // Adding more hits should fail. 
Index::Editor edit = - index_->Edit(document_id + 1, kSectionId2, TermMatchType::EXACT_ONLY, + index_->Edit(document_id + 1, kSectionId2, TermMatchType::PREFIX, /*namespace_id=*/0); - EXPECT_THAT(edit.BufferTerm("foo"), IsOk()); - EXPECT_THAT(edit.BufferTerm("bar"), IsOk()); - EXPECT_THAT(edit.BufferTerm("baz"), IsOk()); + std::string term = prefix + "foo"; + EXPECT_THAT(edit.BufferTerm(term.c_str()), IsOk()); + term = prefix + "bar"; + EXPECT_THAT(edit.BufferTerm(term.c_str()), IsOk()); + term = prefix + "baz"; + EXPECT_THAT(edit.BufferTerm(term.c_str()), IsOk()); EXPECT_THAT(edit.IndexAllBufferedTerms(), StatusIs(libtextclassifier3::StatusCode::RESOURCE_EXHAUSTED)); - EXPECT_THAT(index_->last_added_document_id(), Eq(document_id - 1)); + ICING_ASSERT_OK_AND_ASSIGN( + std::unique_ptr<DocHitInfoIterator> last_itr, + index_->GetIterator(prefix.c_str(), kSectionIdMaskAll, + TermMatchType::PREFIX)); + EXPECT_THAT(last_itr->Advance(), IsOk()); + EXPECT_THAT(last_itr->doc_hit_info().document_id(), Eq(document_id - 1)); // After merging with the main index. Adding more hits should succeed now. 
ICING_ASSERT_OK(index_->Merge()); - edit = - index_->Edit(document_id + 1, kSectionId2, TermMatchType::EXACT_ONLY, 0); - EXPECT_THAT(edit.BufferTerm("foo"), IsOk()); - EXPECT_THAT(edit.BufferTerm("bar"), IsOk()); - EXPECT_THAT(edit.BufferTerm("baz"), IsOk()); + edit = index_->Edit(document_id + 1, kSectionId2, TermMatchType::PREFIX, 0); + term = prefix + "foo"; + EXPECT_THAT(edit.BufferTerm(term.c_str()), IsOk()); + term = prefix + "bar"; + EXPECT_THAT(edit.BufferTerm(term.c_str()), IsOk()); + term = prefix + "baz"; + EXPECT_THAT(edit.BufferTerm(term.c_str()), IsOk()); EXPECT_THAT(edit.IndexAllBufferedTerms(), IsOk()); ICING_ASSERT_OK_AND_ASSIGN( std::unique_ptr<DocHitInfoIterator> itr, - index_->GetIterator("bar", kSectionIdMaskAll, TermMatchType::EXACT_ONLY)); + index_->GetIterator(prefix + "bar", kSectionIdMaskAll, + TermMatchType::EXACT_ONLY)); // We know that "bar" should have at least one hit because we just added it! EXPECT_THAT(itr->Advance(), IsOk()); EXPECT_THAT(itr->doc_hit_info().document_id(), Eq(document_id + 1)); - EXPECT_THAT(index_->last_added_document_id(), Eq(document_id + 1)); + ICING_ASSERT_OK_AND_ASSIGN( + last_itr, index_->GetIterator(prefix.c_str(), kSectionIdMaskAll, + TermMatchType::PREFIX)); + EXPECT_THAT(last_itr->Advance(), IsOk()); + EXPECT_THAT(last_itr->doc_hit_info().document_id(), Eq(document_id + 1)); } TEST_F(IndexTest, IndexCreateIOFailure) { @@ -883,8 +876,6 @@ TEST_F(IndexTest, IndexPersistence) { EXPECT_THAT(GetHits(std::move(itr)), ElementsAre(EqualsDocHitInfo( kDocumentId0, std::vector<SectionId>{kSectionId2}))); - - EXPECT_THAT(index_->last_added_document_id(), Eq(kDocumentId0)); } TEST_F(IndexTest, IndexPersistenceAfterMerge) { @@ -912,8 +903,6 @@ TEST_F(IndexTest, IndexPersistenceAfterMerge) { EXPECT_THAT(GetHits(std::move(itr)), ElementsAre(EqualsDocHitInfo( kDocumentId0, std::vector<SectionId>{kSectionId2}))); - - EXPECT_THAT(index_->last_added_document_id(), Eq(kDocumentId0)); } TEST_F(IndexTest, InvalidHitBufferSize) { @@
-1280,8 +1269,6 @@ TEST_F(IndexTest, ExactResultsFromLiteAndMain) { ElementsAre( EqualsDocHitInfo(kDocumentId2, std::vector<SectionId>{kSectionId3}), EqualsDocHitInfo(kDocumentId0, std::vector<SectionId>{kSectionId2}))); - - EXPECT_THAT(index_->last_added_document_id(), Eq(kDocumentId2)); } TEST_F(IndexTest, PrefixResultsFromLiteAndMain) { @@ -1314,8 +1301,6 @@ TEST_F(IndexTest, PrefixResultsFromLiteAndMain) { EqualsDocHitInfo(kDocumentId2, std::vector<SectionId>{kSectionId3}), EqualsDocHitInfo(kDocumentId1, std::vector<SectionId>{kSectionId3}), EqualsDocHitInfo(kDocumentId0, std::vector<SectionId>{kSectionId2}))); - - EXPECT_THAT(index_->last_added_document_id(), Eq(kDocumentId2)); } TEST_F(IndexTest, GetDebugInfo) { @@ -1422,8 +1407,6 @@ TEST_F(IndexTest, BackfillingMultipleTermsSucceeds) { ElementsAre( EqualsDocHitInfo(kDocumentId1, std::vector<SectionId>{kSectionId3}), EqualsDocHitInfo(kDocumentId0, std::vector<SectionId>{kSectionId3}))); - - EXPECT_THAT(index_->last_added_document_id(), Eq(kDocumentId2)); } TEST_F(IndexTest, BackfillingNewTermsSucceeds) { @@ -1478,8 +1461,6 @@ TEST_F(IndexTest, BackfillingNewTermsSucceeds) { ElementsAre( EqualsDocHitInfo(kDocumentId2, std::vector<SectionId>{kSectionId3}), EqualsDocHitInfo(kDocumentId1, std::vector<SectionId>{kSectionId3}))); - - EXPECT_THAT(index_->last_added_document_id(), Eq(kDocumentId3)); } TEST_F(IndexTest, TruncateToInvalidDocumentIdHasNoEffect) { @@ -1527,8 +1508,6 @@ TEST_F(IndexTest, TruncateToInvalidDocumentIdHasNoEffect) { ElementsAre( EqualsDocHitInfo(kDocumentId1, std::vector<SectionId>{kSectionId3}), EqualsDocHitInfo(kDocumentId0, std::vector<SectionId>{kSectionId2}))); - - EXPECT_THAT(index_->last_added_document_id(), Eq(kDocumentId1)); } TEST_F(IndexTest, TruncateToLastAddedDocumentIdHasNoEffect) { @@ -1544,6 +1523,7 @@ TEST_F(IndexTest, TruncateToLastAddedDocumentIdHasNoEffect) { TermMatchType::PREFIX, /*namespace_id=*/0); ASSERT_THAT(edit.BufferTerm("foo"), IsOk()); 
EXPECT_THAT(edit.IndexAllBufferedTerms(), IsOk()); + index_->set_last_added_document_id(kDocumentId0); ICING_EXPECT_OK(index_->TruncateTo(index_->last_added_document_id())); // Clipping to invalid should have no effect. ICING_ASSERT_OK_AND_ASSIGN( @@ -1565,6 +1545,7 @@ TEST_F(IndexTest, TruncateToLastAddedDocumentIdHasNoEffect) { /*namespace_id=*/0); ASSERT_THAT(edit.BufferTerm("foot"), IsOk()); EXPECT_THAT(edit.IndexAllBufferedTerms(), IsOk()); + index_->set_last_added_document_id(kDocumentId1); // Clipping to invalid should still have no effect even if both indices have // hits. @@ -1576,8 +1557,6 @@ TEST_F(IndexTest, TruncateToLastAddedDocumentIdHasNoEffect) { ElementsAre( EqualsDocHitInfo(kDocumentId1, std::vector<SectionId>{kSectionId3}), EqualsDocHitInfo(kDocumentId0, std::vector<SectionId>{kSectionId2}))); - - EXPECT_THAT(index_->last_added_document_id(), Eq(kDocumentId1)); } TEST_F(IndexTest, TruncateToThrowsOutLiteIndex) { @@ -1586,6 +1565,7 @@ TEST_F(IndexTest, TruncateToThrowsOutLiteIndex) { TermMatchType::PREFIX, /*namespace_id=*/0); ASSERT_THAT(edit.BufferTerm("foo"), IsOk()); EXPECT_THAT(edit.IndexAllBufferedTerms(), IsOk()); + index_->set_last_added_document_id(kDocumentId0); ICING_ASSERT_OK(index_->Merge()); @@ -1594,6 +1574,7 @@ TEST_F(IndexTest, TruncateToThrowsOutLiteIndex) { /*namespace_id=*/0); ASSERT_THAT(edit.BufferTerm("foot"), IsOk()); EXPECT_THAT(edit.IndexAllBufferedTerms(), IsOk()); + index_->set_last_added_document_id(kDocumentId1); EXPECT_THAT(index_->TruncateTo(kDocumentId0), IsOk()); @@ -1604,8 +1585,6 @@ TEST_F(IndexTest, TruncateToThrowsOutLiteIndex) { EXPECT_THAT(GetHits(std::move(itr)), ElementsAre(EqualsDocHitInfo( kDocumentId0, std::vector<SectionId>{kSectionId2}))); - - EXPECT_THAT(index_->last_added_document_id(), Eq(kDocumentId0)); } TEST_F(IndexTest, TruncateToThrowsOutBothIndices) { @@ -1614,10 +1593,12 @@ TEST_F(IndexTest, TruncateToThrowsOutBothIndices) { TermMatchType::PREFIX, /*namespace_id=*/0); 
ASSERT_THAT(edit.BufferTerm("foo"), IsOk()); EXPECT_THAT(edit.IndexAllBufferedTerms(), IsOk()); + index_->set_last_added_document_id(kDocumentId0); edit = index_->Edit(kDocumentId1, kSectionId2, TermMatchType::PREFIX, /*namespace_id=*/0); ASSERT_THAT(edit.BufferTerm("foul"), IsOk()); EXPECT_THAT(edit.IndexAllBufferedTerms(), IsOk()); + index_->set_last_added_document_id(kDocumentId1); ICING_ASSERT_OK(index_->Merge()); @@ -1626,6 +1607,7 @@ TEST_F(IndexTest, TruncateToThrowsOutBothIndices) { /*namespace_id=*/0); ASSERT_THAT(edit.BufferTerm("foot"), IsOk()); EXPECT_THAT(edit.IndexAllBufferedTerms(), IsOk()); + index_->set_last_added_document_id(kDocumentId2); EXPECT_THAT(index_->TruncateTo(kDocumentId0), IsOk()); @@ -1634,8 +1616,6 @@ TEST_F(IndexTest, TruncateToThrowsOutBothIndices) { std::unique_ptr<DocHitInfoIterator> itr, index_->GetIterator("f", kSectionIdMaskAll, TermMatchType::PREFIX)); EXPECT_THAT(GetHits(std::move(itr)), IsEmpty()); - - EXPECT_THAT(index_->last_added_document_id(), Eq(kInvalidDocumentId)); } TEST_F(IndexTest, IndexStorageInfoProto) { diff --git a/icing/index/lite/lite-index.cc b/icing/index/lite/lite-index.cc index 69138e1..fb23934 100644 --- a/icing/index/lite/lite-index.cc +++ b/icing/index/lite/lite-index.cc @@ -310,8 +310,6 @@ libtextclassifier3::Status LiteIndex::AddHit(uint32_t term_id, const Hit& hit) { return absl_ports::ResourceExhaustedError("Hit buffer is full!"); } - header_->set_last_added_docid(hit.document_id()); - TermIdHitPair term_id_hit_pair(term_id, hit); uint32_t cur_size = header_->cur_size(); TermIdHitPair::Value* valp = diff --git a/icing/index/lite/lite-index.h b/icing/index/lite/lite-index.h index 90c6fbc..b134aba 100644 --- a/icing/index/lite/lite-index.h +++ b/icing/index/lite/lite-index.h @@ -225,6 +225,9 @@ class LiteIndex { DocumentId last_added_document_id() const { return header_->last_added_docid(); } + void set_last_added_document_id(DocumentId document_id) const { + 
header_->set_last_added_docid(document_id); + } const IcingDynamicTrie& lexicon() const { return lexicon_; } diff --git a/icing/jni/jni-cache.cc b/icing/jni/jni-cache.cc index 58eb8bf..9b75db6 100644 --- a/icing/jni/jni-cache.cc +++ b/icing/jni/jni-cache.cc @@ -14,6 +14,8 @@ #include "icing/jni/jni-cache.h" +#ifdef ICING_REVERSE_JNI_SEGMENTATION + #include "icing/text_classifier/lib3/utils/java/jni-base.h" #include "icing/text_classifier/lib3/utils/java/jni-helper.h" #include "icing/absl_ports/canonical_errors.h" @@ -214,3 +216,5 @@ JniCache::ConvertToJavaString(const char* utf8_text, } // namespace lib } // namespace icing + +#endif // ICING_REVERSE_JNI_SEGMENTATION diff --git a/icing/jni/jni-cache.h b/icing/jni/jni-cache.h index a5f16c7..3faaed6 100644 --- a/icing/jni/jni-cache.h +++ b/icing/jni/jni-cache.h @@ -15,6 +15,16 @@ #ifndef ICING_JNI_JNI_CACHE_H_ #define ICING_JNI_JNI_CACHE_H_ +#ifndef ICING_REVERSE_JNI_SEGMENTATION +namespace icing { +namespace lib { + +class JniCache {}; // Declare an empty class definition for non-Android builds. 
+ +} // namespace lib +} // namespace icing +#else // ICING_REVERSE_JNI_SEGMENTATION + #include <jni.h> #include "icing/text_classifier/lib3/utils/base/statusor.h" @@ -75,4 +85,6 @@ struct JniCache { } // namespace lib } // namespace icing +#endif // !ICING_REVERSE_JNI_SEGMENTATION + #endif // ICING_JNI_JNI_CACHE_H_ diff --git a/icing/scoring/scorer.cc b/icing/scoring/scorer.cc index fe89f47..a4734b4 100644 --- a/icing/scoring/scorer.cc +++ b/icing/scoring/scorer.cc @@ -89,6 +89,7 @@ class RelevanceScoreScorer : public Scorer { if (!query_it) { return default_score_; } + return static_cast<double>( bm25f_calculator_->ComputeScore(query_it, hit_info, default_score_)); } diff --git a/icing/scoring/scorer_test.cc b/icing/scoring/scorer_test.cc index 22d548a..8b89514 100644 --- a/icing/scoring/scorer_test.cc +++ b/icing/scoring/scorer_test.cc @@ -95,6 +95,10 @@ class ScorerTest : public testing::Test { const FakeClock& fake_clock2() { return fake_clock2_; } + void SetFakeClock1Time(int64_t new_time) { + fake_clock1_.SetSystemTimeMilliseconds(new_time); + } + private: const std::string test_dir_; const std::string doc_store_dir_; @@ -123,7 +127,7 @@ TEST_F(ScorerTest, CreationWithNullPointerShouldFail) { StatusIs(libtextclassifier3::StatusCode::FAILED_PRECONDITION)); } -TEST_F(ScorerTest, ShouldGetDefaultScore) { +TEST_F(ScorerTest, ShouldGetDefaultScoreIfDocumentDoesntExist) { ICING_ASSERT_OK_AND_ASSIGN( std::unique_ptr<Scorer> scorer, Scorer::Create(ScoringSpecProto::RankingStrategy::DOCUMENT_SCORE, @@ -135,6 +139,66 @@ TEST_F(ScorerTest, ShouldGetDefaultScore) { EXPECT_THAT(scorer->GetScore(docHitInfo), Eq(10)); } +TEST_F(ScorerTest, ShouldGetDefaultScoreIfDocumentIsDeleted) { + // Creates a test document with a provided score + DocumentProto test_document = DocumentBuilder() + .SetKey("icing", "email/1") + .SetSchema("email") + .AddStringProperty("subject", "subject foo") + .SetScore(42) + .Build(); + + ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id, + 
document_store()->Put(test_document)); + + ICING_ASSERT_OK_AND_ASSIGN( + std::unique_ptr<Scorer> scorer, + Scorer::Create(ScoringSpecProto::RankingStrategy::DOCUMENT_SCORE, + /*default_score=*/10, document_store())); + + DocHitInfo docHitInfo = DocHitInfo(document_id); + + // The document's score is returned + EXPECT_THAT(scorer->GetScore(docHitInfo), Eq(42)); + + // Delete the document and check that the caller-provided default score is + // returned + EXPECT_THAT(document_store()->Delete(document_id), IsOk()); + EXPECT_THAT(scorer->GetScore(docHitInfo), Eq(10)); +} + +TEST_F(ScorerTest, ShouldGetDefaultScoreIfDocumentIsExpired) { + // Creates a test document with a provided score + int64_t creation_time = fake_clock1().GetSystemTimeMilliseconds(); + int64_t ttl = 100; + DocumentProto test_document = DocumentBuilder() + .SetKey("icing", "email/1") + .SetSchema("email") + .AddStringProperty("subject", "subject foo") + .SetScore(42) + .SetCreationTimestampMs(creation_time) + .SetTtlMs(ttl) + .Build(); + + ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id, + document_store()->Put(test_document)); + + ICING_ASSERT_OK_AND_ASSIGN( + std::unique_ptr<Scorer> scorer, + Scorer::Create(ScoringSpecProto::RankingStrategy::DOCUMENT_SCORE, + /*default_score=*/10, document_store())); + + DocHitInfo docHitInfo = DocHitInfo(document_id); + + // The document's score is returned since the document hasn't expired yet. 
+ EXPECT_THAT(scorer->GetScore(docHitInfo), Eq(42)); + + // Expire the document and check that the caller-provided default score is + // returned + SetFakeClock1Time(creation_time + ttl + 10); + EXPECT_THAT(scorer->GetScore(docHitInfo), Eq(10)); +} + TEST_F(ScorerTest, ShouldGetDefaultDocumentScore) { // Creates a test document with the default document score 0 DocumentProto test_document = diff --git a/icing/store/document-store.cc b/icing/store/document-store.cc index d79c861..5f478fa 100644 --- a/icing/store/document-store.cc +++ b/icing/store/document-store.cc @@ -1068,6 +1068,11 @@ libtextclassifier3::StatusOr<CorpusId> DocumentStore::GetCorpusId( libtextclassifier3::StatusOr<DocumentAssociatedScoreData> DocumentStore::GetDocumentAssociatedScoreData(DocumentId document_id) const { + if (!DoesDocumentExist(document_id)) { + return absl_ports::NotFoundError(IcingStringUtil::StringPrintf( + "Can't get score data, document id '%d' doesn't exist", document_id)); + } + auto score_data_or = score_cache_->GetCopy(document_id); if (!score_data_or.ok()) { ICING_LOG(ERROR) << " while trying to access DocumentId " << document_id @@ -1131,6 +1136,10 @@ DocumentStore::GetDocumentFilterData(DocumentId document_id) const { libtextclassifier3::StatusOr<UsageStore::UsageScores> DocumentStore::GetUsageScores(DocumentId document_id) const { + if (!DoesDocumentExist(document_id)) { + return absl_ports::NotFoundError(IcingStringUtil::StringPrintf( + "Can't get usage scores, document id '%d' doesn't exist", document_id)); + } return usage_store_->GetUsageScores(document_id); } @@ -1139,6 +1148,17 @@ libtextclassifier3::Status DocumentStore::ReportUsage( ICING_ASSIGN_OR_RETURN(DocumentId document_id, GetDocumentId(usage_report.document_namespace(), usage_report.document_uri())); + // We can use the internal version here because we got our document_id from + // our internal data structures. We would have thrown some error if the + // namespace and/or uri were incorrect.
+ if (!InternalDoesDocumentExist(document_id)) { + // Document was probably deleted or expired. + return absl_ports::NotFoundError(absl_ports::StrCat( + "Couldn't report usage on a nonexistent document: (namespace: '", + usage_report.document_namespace(), "', uri: '", + usage_report.document_uri(), "')")); + } + return usage_store_->AddUsageReport(usage_report, document_id); } @@ -1587,6 +1607,7 @@ libtextclassifier3::Status DocumentStore::OptimizeInto( // Copy over usage scores. ICING_ASSIGN_OR_RETURN(UsageStore::UsageScores usage_scores, usage_store_->GetUsageScores(document_id)); + DocumentId new_document_id = new_document_id_or.ValueOrDie(); ICING_RETURN_IF_ERROR( new_doc_store->SetUsageScores(new_document_id, usage_scores)); diff --git a/icing/store/document-store.h b/icing/store/document-store.h index a8d87c8..9e1b3ec 100644 --- a/icing/store/document-store.h +++ b/icing/store/document-store.h @@ -256,16 +256,9 @@ class DocumentStore { // Returns the DocumentAssociatedScoreData of the document specified by the // DocumentId. // - // NOTE: This does not check if the document exists and will return the - // DocumentFilterData of the document even if it has been deleted. Users - // should check DoesDocumentExist(document_id) if they only want existing - // documents' DocumentFilterData. - // // Returns: // DocumentAssociatedScoreData on success - // OUT_OF_RANGE if document_id is negative or exceeds previously seen - // DocumentIds - // NOT_FOUND if no score data is found + // NOT_FOUND if the document or the score data is not found libtextclassifier3::StatusOr<DocumentAssociatedScoreData> GetDocumentAssociatedScoreData(DocumentId document_id) const; @@ -302,8 +295,8 @@ class DocumentStore { // // Returns: // UsageScores on success + // NOT_FOUND if document_id no longer exists. 
// INVALID_ARGUMENT if document_id is invalid - // INTERNAL_ERROR on I/O errors libtextclassifier3::StatusOr<UsageStore::UsageScores> GetUsageScores( DocumentId document_id) const; diff --git a/icing/store/document-store_test.cc b/icing/store/document-store_test.cc index ebc5ec3..b37c6de 100644 --- a/icing/store/document-store_test.cc +++ b/icing/store/document-store_test.cc @@ -1639,7 +1639,7 @@ TEST_F(DocumentStoreTest, GetCorpusAssociatedScoreDataDifferentCorpus) { /*length_in_tokens=*/7))); } -TEST_F(DocumentStoreTest, NonexistentDocumentAssociatedScoreDataOutOfRange) { +TEST_F(DocumentStoreTest, NonexistentDocumentAssociatedScoreDataNotFound) { ICING_ASSERT_OK_AND_ASSIGN( DocumentStore::CreateResult create_result, DocumentStore::Create(&filesystem_, document_store_dir_, &fake_clock_, @@ -1648,7 +1648,7 @@ TEST_F(DocumentStoreTest, NonexistentDocumentAssociatedScoreDataOutOfRange) { std::move(create_result.document_store); EXPECT_THAT(doc_store->GetDocumentAssociatedScoreData(/*document_id=*/0), - StatusIs(libtextclassifier3::StatusCode::OUT_OF_RANGE)); + StatusIs(libtextclassifier3::StatusCode::NOT_FOUND)); } TEST_F(DocumentStoreTest, DeleteClearsFilterCache) { @@ -1699,7 +1699,7 @@ TEST_F(DocumentStoreTest, DeleteClearsScoreCache) { StatusIs(libtextclassifier3::StatusCode::NOT_FOUND)); } -TEST_F(DocumentStoreTest, DeleteShouldClearUsageScores) { +TEST_F(DocumentStoreTest, DeleteShouldPreventUsageScores) { ICING_ASSERT_OK_AND_ASSIGN( DocumentStore::CreateResult create_result, DocumentStore::Create(&filesystem_, document_store_dir_, &fake_clock_, @@ -1724,10 +1724,63 @@ TEST_F(DocumentStoreTest, DeleteShouldClearUsageScores) { // Delete the document. ICING_ASSERT_OK(doc_store->Delete("icing", "email/1")); - // The scores should be cleared. 
- expected_scores.usage_type1_count = 0; + // Can't report or get usage scores on the deleted document + ASSERT_THAT( + doc_store->ReportUsage(usage_report_type1), + StatusIs(libtextclassifier3::StatusCode::NOT_FOUND, + HasSubstr("Couldn't report usage on a nonexistent document"))); + + ASSERT_THAT(doc_store->GetUsageScores(document_id), + StatusIs(libtextclassifier3::StatusCode::NOT_FOUND, + HasSubstr("Can't get usage scores"))); +} + +TEST_F(DocumentStoreTest, ExpirationShouldPreventUsageScores) { + ICING_ASSERT_OK_AND_ASSIGN( + DocumentStore::CreateResult create_result, + DocumentStore::Create(&filesystem_, document_store_dir_, &fake_clock_, + schema_store_.get())); + std::unique_ptr<DocumentStore> doc_store = + std::move(create_result.document_store); + + DocumentProto document = DocumentBuilder() + .SetKey("icing", "email/1") + .SetSchema("email") + .AddStringProperty("subject", "subject foo") + .AddStringProperty("body", "body bar") + .SetScore(document1_score_) + .SetCreationTimestampMs(10) + .SetTtlMs(100) + .Build(); + + ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id, doc_store->Put(document)); + + // Some arbitrary time before the document's creation time (10) + ttl (100) + fake_clock_.SetSystemTimeMilliseconds(109); + + // Report usage with type 1. 
+ UsageReport usage_report_type1 = CreateUsageReport( + /*name_space=*/"icing", /*uri=*/"email/1", /*timestamp_ms=*/0, + UsageReport::USAGE_TYPE1); + ICING_ASSERT_OK(doc_store->ReportUsage(usage_report_type1)); + + UsageStore::UsageScores expected_scores; + expected_scores.usage_type1_count = 1; ASSERT_THAT(doc_store->GetUsageScores(document_id), IsOkAndHolds(expected_scores)); + + // Some arbitrary time past the document's creation time (10) + ttl (100) + fake_clock_.SetSystemTimeMilliseconds(200); + + // Can't report or get usage scores on the expired document + ASSERT_THAT( + doc_store->ReportUsage(usage_report_type1), + StatusIs(libtextclassifier3::StatusCode::NOT_FOUND, + HasSubstr("Couldn't report usage on a nonexistent document"))); + + ASSERT_THAT(doc_store->GetUsageScores(document_id), + StatusIs(libtextclassifier3::StatusCode::NOT_FOUND, + HasSubstr("Can't get usage scores"))); } TEST_F(DocumentStoreTest, diff --git a/icing/testing/jni-test-helpers.h b/icing/testing/jni-test-helpers.h index adc469a..67a98c3 100644 --- a/icing/testing/jni-test-helpers.h +++ b/icing/testing/jni-test-helpers.h @@ -15,6 +15,8 @@ #ifndef ICING_TESTING_JNI_TEST_HELPERS_H_ #define ICING_TESTING_JNI_TEST_HELPERS_H_ +#include <memory> + #include "icing/jni/jni-cache.h" #ifdef ICING_REVERSE_JNI_SEGMENTATION diff --git a/icing/tokenization/language-segmenter-factory.h b/icing/tokenization/language-segmenter-factory.h index e60c168..cae3eee 100644 --- a/icing/tokenization/language-segmenter-factory.h +++ b/icing/tokenization/language-segmenter-factory.h @@ -18,11 +18,7 @@ #include <memory> #include <string_view> -#ifdef __ANDROID__ #include "icing/jni/jni-cache.h" -#else // __ANDROID__ -class JniCache; // forward declaration to let non-Android builds work. 
-#endif // __ANDROID__ #include "icing/text_classifier/lib3/utils/base/statusor.h" #include "icing/tokenization/language-segmenter.h" diff --git a/icing/tokenization/reverse_jni/reverse-jni-language-segmenter-test-jni.cc b/icing/tokenization/reverse_jni/reverse-jni-language-segmenter-test-jni-layer.cc index 8392363..5f5202c 100644 --- a/icing/tokenization/reverse_jni/reverse-jni-language-segmenter-test-jni.cc +++ b/icing/tokenization/reverse_jni/reverse-jni-language-segmenter-test-jni-layer.cc @@ -21,12 +21,12 @@ JNIEnv* g_jenv = nullptr; extern "C" JNIEXPORT jboolean JNICALL -Java_icing_tokenization_reverse_1jni_ReverseJniLanguageSegmenterTest_testsMain( - JNIEnv* env, jclass ignored) { +Java_icing_jni_ReverseJniLanguageSegmenterJniTest_testsMain(JNIEnv* env, + jclass ignored) { g_jenv = env; std::vector<char*> my_argv; - char arg[] = "reverse-jni-language-segmenter-test-lib"; + char arg[] = "jni-test-lib"; my_argv.push_back(arg); int argc = 1; char** argv = &(my_argv[0]); diff --git a/icing/tokenization/reverse_jni/reverse-jni-language-segmenter-test.h b/icing/tokenization/reverse_jni/reverse-jni-language-segmenter-test.h deleted file mode 100644 index 64b68ec..0000000 --- a/icing/tokenization/reverse_jni/reverse-jni-language-segmenter-test.h +++ /dev/null @@ -1,46 +0,0 @@ -// Copyright (C) 2019 Google LLC -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#ifndef ICING_TOKENIZATION_REVERSE_JNI_REVERSE_JNI_LANGUAGE_SEGMENTER_TEST_H_ -#define ICING_TOKENIZATION_REVERSE_JNI_REVERSE_JNI_LANGUAGE_SEGMENTER_TEST_H_ - -#include <jni.h> - -#include "icing/jni/jni-cache.h" -#include "gtest/gtest.h" - -extern JNIEnv* g_jenv; - -namespace icing { -namespace lib { - -namespace test_internal { - -class ReverseJniLanguageSegmenterTest - : public testing::TestWithParam<const char*> { - protected: - ReverseJniLanguageSegmenterTest() - : jni_cache_(std::move(JniCache::Create(g_jenv)).ValueOrDie()) {} - - static std::string GetLocale() { return GetParam(); } - - std::unique_ptr<JniCache> jni_cache_; -}; - -} // namespace test_internal - -} // namespace lib -} // namespace icing - -#endif // ICING_TOKENIZATION_REVERSE_JNI_REVERSE_JNI_LANGUAGE_SEGMENTER_TEST_H_ diff --git a/icing/tokenization/reverse_jni/reverse-jni-language-segmenter-test.cc b/icing/tokenization/reverse_jni/reverse-jni-language-segmenter_test.cc index 2c268ff..72c3180 100644 --- a/icing/tokenization/reverse_jni/reverse-jni-language-segmenter-test.cc +++ b/icing/tokenization/reverse_jni/reverse-jni-language-segmenter_test.cc @@ -12,17 +12,19 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#include "icing/tokenization/reverse_jni/reverse-jni-language-segmenter-test.h" +#include <jni.h> #include <memory> #include <string_view> +#include "icing/jni/jni-cache.h" #include "icing/text_classifier/lib3/utils/base/status.h" #include "icing/text_classifier/lib3/utils/base/statusor.h" #include "gmock/gmock.h" #include "icing/absl_ports/str_cat.h" #include "icing/testing/common-matchers.h" #include "icing/testing/icu-i18n-test-utils.h" +#include "icing/testing/jni-test-helpers.h" #include "icing/tokenization/language-segmenter-factory.h" #include "icing/tokenization/language-segmenter.h" #include "unicode/uloc.h" @@ -120,6 +122,14 @@ std::vector<std::string_view> GetAllTermsResetBefore( return terms; } +class ReverseJniLanguageSegmenterTest + : public testing::TestWithParam<const char*> { + protected: + static std::string GetLocale() { return GetParam(); } + + std::unique_ptr<const JniCache> jni_cache_ = GetTestJniCache(); +}; + } // namespace TEST_P(ReverseJniLanguageSegmenterTest, EmptyText) { diff --git a/icing/tools/document-store-dump.cc b/icing/tools/document-store-dump.cc deleted file mode 100644 index 45c9bf5..0000000 --- a/icing/tools/document-store-dump.cc +++ /dev/null @@ -1,119 +0,0 @@ -// Copyright (C) 2019 Google LLC -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#include "icing/tools/document-store-dump.h" - -#include <cinttypes> - -#include "icing/absl_ports/str_cat.h" -#include "icing/legacy/core/icing-string-util.h" -#include "icing/util/logging.h" - -namespace icing { -namespace lib { -namespace { - -void AppendDocumentProto(DocId document_id, const Document& doc, - std::string* output) { - absl_ports::StrAppend( - output, IcingStringUtil::StringPrintf( - "Document {\n document_id: %d\n corpus_id: %d\n uri: " - "'%s'\n score: %d\n created_timestamp_ms: %" PRIu64 "\n", - static_cast<int>(document_id), doc.corpus_id(), - doc.uri().c_str(), static_cast<int>(doc.score()), - static_cast<int64_t>(doc.created_timestamp_ms()))); - for (const auto& section : doc.sections()) { - absl_ports::StrAppend( - output, IcingStringUtil::StringPrintf( - " section {\n id: %d\n indexed_length: " - "%d\n content: '%s'\n snippet: '%s'\n", - static_cast<int>(section.id()), - static_cast<int>(section.indexed_length()), - section.content().c_str(), section.snippet().c_str())); - for (int64_t extracted_number : section.extracted_numbers()) { - absl_ports::StrAppend(output, IcingStringUtil::StringPrintf( - " extracted_numbers: %" PRId64 "\n", - extracted_number)); - } - for (const std::string& annotation_token : section.annotation_tokens()) { - absl_ports::StrAppend( - output, IcingStringUtil::StringPrintf(" annotation_tokens: '%s'\n", - annotation_token.c_str())); - } - std::string indexed = (section.config().indexed()) ? "true" : "false"; - std::string index_prefixes = - (section.config().index_prefixes()) ? 
"true" : "false"; - absl_ports::StrAppend( - output, - IcingStringUtil::StringPrintf( - " config {\n name: '%s'\n indexed: %s\n " - "tokenizer: %d\n weight: %d\n index_prefixes: %s\n " - "subsection_separator: '%s'\n", - section.config().name().c_str(), indexed.c_str(), - section.config().tokenizer(), - static_cast<int>(section.config().weight()), index_prefixes.c_str(), - section.config().subsection_separator().c_str())); - for (const auto& variant_generator : - section.config().variant_generators()) { - absl_ports::StrAppend( - output, IcingStringUtil::StringPrintf( - " variant_generators: %d\n", variant_generator)); - } - absl_ports::StrAppend( - output, - IcingStringUtil::StringPrintf( - " common_term_legacy_hit_score: %d\n " - "rfc822_host_name_term_legacy_hit_score: %d\n " - "semantic_property: '%s'\n universal_section_id: %d\n " - "omnibox_section_type: %d\n st_section_type: %d\n }\n }\n", - section.config().common_term_legacy_hit_score(), - section.config().rfc822_host_name_term_legacy_hit_score(), - section.config().semantic_property().c_str(), - section.config().universal_section_id(), - section.config().omnibox_section_type(), - section.config().st_section_type())); - } - for (const auto& language : doc.languages()) { - std::string used_classifier = - (language.used_classifier()) ? 
"true" : "false"; - absl_ports::StrAppend( - output, IcingStringUtil::StringPrintf( - " languages {\n language: %d\n score: %d\n " - "used_classifier: %s\n }\n", - language.language(), static_cast<int>(language.score()), - used_classifier.c_str())); - } - absl_ports::StrAppend( - output, IcingStringUtil::StringPrintf( - " ANNOTATIONS PRINTING NOT IMPLEMENTED YET IN ICING-TOOL\n")); -} - -} // namespace - -std::string GetDocumentStoreDump(const DocumentStore& document_store) { - std::string output; - for (DocId document_id = 0; document_id < document_store.num_documents(); - document_id++) { - Document doc; - if (!document_store.ReadDocument(document_id, &doc)) { - ICING_LOG(FATAL) << "Failed to read document"; - } - - AppendDocumentProto(document_id, doc, &output); - } - return output; -} - -} // namespace lib -} // namespace icing diff --git a/icing/tools/document-store-dump.h b/icing/tools/document-store-dump.h deleted file mode 100644 index 023b301..0000000 --- a/icing/tools/document-store-dump.h +++ /dev/null @@ -1,35 +0,0 @@ -// Copyright (C) 2019 Google LLC -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#ifndef ICING_TOOLS_DOCUMENT_STORE_DUMP_H_ -#define ICING_TOOLS_DOCUMENT_STORE_DUMP_H_ - -#include <string> - -#include "java/com/google/android/gmscore/integ/modules/icing/jni/index/document-store.h" - -namespace icing { -namespace lib { - -// Utility function for dumping the complete document store content. 
-// This provides a human-readable representation of the document store, mainly -// provided for easier understandability for developers. -// The output of this class should only be available on cmdline-tool-level -// (with root access), or unit tests. In other words it should not be possible -// to trigger this on a release key device, for data protection reasons. -std::string GetDocumentStoreDump(const DocumentStore& document_store); - -} // namespace lib -} // namespace icing -#endif // ICING_TOOLS_DOCUMENT_STORE_DUMP_H_ diff --git a/icing/tools/icing-tool.cc b/icing/tools/icing-tool.cc deleted file mode 100644 index 72a11e9..0000000 --- a/icing/tools/icing-tool.cc +++ /dev/null @@ -1,306 +0,0 @@ -// Copyright (C) 2019 Google LLC -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -// Copyright 2012 Google Inc. All Rights Reserved. -// Author: ulas@google.com (Ulas Kirazci) -// -// A tool to debug the native index. 
- -#include <getopt.h> -#include <unistd.h> - -#include <string> - -#include "java/com/google/android/gmscore/integ/modules/icing/jni/core/string-util.h" -#include "java/com/google/android/gmscore/integ/modules/icing/jni/index/doc-property-filter.h" -#include "java/com/google/android/gmscore/integ/modules/icing/jni/index/document-store.h" -#include "java/com/google/android/gmscore/integ/modules/icing/jni/index/dynamic-trie.h" -#include "java/com/google/android/gmscore/integ/modules/icing/jni/index/filesystem.h" -#include "java/com/google/android/gmscore/integ/modules/icing/jni/index/mobstore.h" -#include "java/com/google/android/gmscore/integ/modules/icing/jni/index/native-index-impl.h" -#include "icing/absl_ports/str_cat.h" -#include "icing/legacy/core/icing-string-util.h" -#include "icing/tools/document-store-dump.h" -#include "icing/util/logging.h" - -using std::vector; -using ::wireless_android_play_playlog::icing::IndexRestorationStats; - -namespace icing { -namespace lib { - -// 256KB for debugging. -const size_t kMaxDocumentSizeForDebugging = 1u << 18; -// Dump dynamic trie stats and contents. 
-void ProcessDynamicTrie(const char* filename) { - Filesystem filesystem; - DynamicTrie trie(filename, DynamicTrie::RuntimeOptions(), &filesystem); - if (!trie.Init()) { - ICING_LOG(ERROR) << IcingStringUtil::StringPrintf("Opening trie %s failed", - filename); - return; - } - - std::string out; - trie.GetDebugInfo(true, &out); - printf("Stats:\n%s", out.c_str()); - - std::ostringstream contents; - vector<std::string> keys; - trie.DumpTrie(&contents, &keys); - printf("Contents:\n%s", contents.str().c_str()); -} - -NativeIndexImpl* MakeIndex(const char* root_dir) { - NativeConfig native_config; - native_config.set_max_document_size(kMaxDocumentSizeForDebugging); - FlashIndexOptions flash_index_options( - NativeIndexImpl::GetNativeIndexDir(root_dir)); - NativeIndexImpl* ni = - new NativeIndexImpl(root_dir, native_config, flash_index_options); - InitStatus init_status; - if (!ni->Init(&init_status)) { - ICING_LOG(FATAL) << "Failed to initialize legacy native index impl"; - } - - IndexRestorationStats unused; - ni->RestoreIndex(IndexRequestSpec::default_instance(), &unused); - return ni; -} - -void RunQuery(NativeIndexImpl* ni, const std::string& query, int start, - int num_results) { - // Pull out corpusids and uris. 
- QueryRequestSpec spec; - spec.set_no_corpus_filter(true); - spec.set_want_uris(true); - spec.set_scoring_verbosity_level(1); - spec.set_prefix_match(true); - - QueryResponse response; - ni->ExecuteQuery(query, spec, 10000, start, num_results, &response); - - ICING_VLOG(1) << IcingStringUtil::StringPrintf( - "Query [%s] num results %u", query.c_str(), response.num_results()); - - for (int i = 0, uri_offset = 0; i < response.num_results(); i++) { - ICING_VLOG(1) << IcingStringUtil::StringPrintf( - "%d: (cid=%u) uri %.*s", i, response.corpus_ids(i), - response.uri_lengths(i), response.uri_buffer().data() + uri_offset); - uri_offset += response.uri_lengths(i); - } -} - -void RunSuggest(NativeIndexImpl* ni, const std::string& prefix, - int num_results) { - SuggestionResponse results; - ni->Suggest(prefix, num_results, vector<CorpusId>(), &results); - - ICING_VLOG(1) << IcingStringUtil::StringPrintf( - "Query [%s] num results %zu", prefix.c_str(), - static_cast<size_t>(results.suggestions_size())); - - for (size_t i = 0; i < results.suggestions_size(); i++) { - ICING_VLOG(1) << IcingStringUtil::StringPrintf( - "Sugg: [%s] display text [%s]", results.suggestions(i).query().c_str(), - results.suggestions(i).display_text().c_str()); - } -} - -int IcingTool(int argc, char** argv) { - auto file_storage = CreatePosixFileStorage(); - enum Options { - OPT_FILENAME, - OPT_OP, - OPT_QUERY, - NUM_OPT, - }; - static const option kOptions[NUM_OPT + 1] = { - {"filename", 1, nullptr, 0}, - {"op", 1, nullptr, 0}, - {"query", 1, nullptr, 0}, - {nullptr, 0, nullptr, 0}, - }; - const char* opt_values[NUM_OPT]; - memset(opt_values, 0, sizeof(opt_values)); - - while (true) { - int opt_idx = -1; - int ret = getopt_long(argc, argv, "", kOptions, &opt_idx); - if (ret != 0) break; - - if (opt_idx >= 0 && opt_idx < NUM_OPT) { - opt_values[opt_idx] = optarg; - } - } - - if (!opt_values[OPT_OP]) { - ICING_LOG(ERROR) << "No op specified"; - return -1; - } - - if (!opt_values[OPT_FILENAME]) { - 
ICING_LOG(ERROR) << "No filename specified"; - return -1; - } - if (!strncmp( - opt_values[OPT_FILENAME], - "/data/data/com.google.android.gms/files/AppDataSearch", - strlen("/data/data/com.google.android.gms/files/AppDataSearch"))) { - ICING_LOG(ERROR) - << "Should not read directly from the file in gmscore - " - "icing-tool also commits writes as side-effects which corrupts " - "the index on concurrent modification"; - return -1; - } - - const char* op = opt_values[OPT_OP]; - DocumentStore::Options options(file_storage.get(), - kMaxDocumentSizeForDebugging); - if (!strcmp(op, "dyntrie")) { - std::string full_file_path = - absl_ports::StrCat(opt_values[OPT_FILENAME], "/idx.lexicon"); - ProcessDynamicTrie(full_file_path.c_str()); - } else if (!strcmp(op, "verify")) { - std::unique_ptr<NativeIndexImpl> ni(MakeIndex(opt_values[OPT_FILENAME])); - ni->CheckVerify(); - } else if (!strcmp(op, "query")) { - if (opt_values[OPT_QUERY] == nullptr) { - ICING_LOG(FATAL) << "Opt value is null"; - } - - std::unique_ptr<NativeIndexImpl> ni(MakeIndex(opt_values[OPT_FILENAME])); - RunQuery(ni.get(), opt_values[OPT_QUERY], 0, 100); - } else if (!strcmp(op, "suggest")) { - if (opt_values[OPT_QUERY] == nullptr) { - ICING_LOG(FATAL) << "Opt value is null"; - } - - std::unique_ptr<NativeIndexImpl> ni(MakeIndex(opt_values[OPT_FILENAME])); - RunSuggest(ni.get(), opt_values[OPT_QUERY], 100); - } else if (!strcmp(op, "dump-all-docs")) { - DocumentStore ds(opt_values[OPT_FILENAME], options); - if (!ds.Init()) { - ICING_LOG(FATAL) << "Legacy document store failed to initialize"; - } - - printf( - "------ Document Store Dump Start ------\n" - "%s\n" - "------ Document Store Dump End ------\n", - GetDocumentStoreDump(ds).c_str()); - } else if (!strcmp(op, "dump-uris")) { - CorpusId corpus_id = kInvalidCorpusId; - if (opt_values[OPT_QUERY]) { - // Query is corpus id. 
- corpus_id = atoi(opt_values[OPT_QUERY]); // NOLINT - } - DocumentStore ds(opt_values[OPT_FILENAME], options); - if (!ds.Init()) { - ICING_LOG(FATAL) << "Legacy document store failed to initialize"; - } - - DocPropertyFilter dpf; - ds.AddDeletedTagFilter(&dpf); - - // Dump with format "<corpusid> <uri> <tagname>*". - int filtered = 0; - vector<std::string> tagnames; - for (DocId document_id = 0; document_id < ds.num_documents(); - document_id++) { - Document doc; - if (!ds.ReadDocument(document_id, &doc)) { - ICING_LOG(FATAL) << "Failed to read document."; - } - - if (corpus_id != kInvalidCorpusId && corpus_id != doc.corpus_id()) { - filtered++; - continue; - } - if (dpf.Match(0, document_id)) { - filtered++; - continue; - } - - tagnames.clear(); - ds.GetAllSetUserTagNames(document_id, &tagnames); - - printf("%d %s %s\n", doc.corpus_id(), doc.uri().c_str(), - StringUtil::JoinStrings("/", tagnames).c_str()); - } - ICING_VLOG(1) << IcingStringUtil::StringPrintf( - "Processed %u filtered %d", ds.num_documents(), filtered); - } else if (!strcmp(op, "dump-docs")) { - std::string out_filename = opt_values[OPT_FILENAME]; - out_filename.append("/docs-dump"); - CorpusId corpus_id = kInvalidCorpusId; - if (opt_values[OPT_QUERY]) { - // Query is corpus id. - corpus_id = atoi(opt_values[OPT_QUERY]); // NOLINT - out_filename.push_back('.'); - out_filename.append(opt_values[OPT_QUERY]); - } - DocumentStore ds(opt_values[OPT_FILENAME], options); - if (!ds.Init()) { - ICING_LOG(FATAL) << "Legacy document store failed to initialize"; - } - - DocPropertyFilter dpf; - ds.AddDeletedTagFilter(&dpf); - - // Dump with format (<32-bit length><serialized content>)*. 
- FILE* fp = fopen(out_filename.c_str(), "w"); - int filtered = 0; - for (DocId document_id = 0; document_id < ds.num_documents(); - document_id++) { - Document doc; - if (!ds.ReadDocument(document_id, &doc)) { - ICING_LOG(FATAL) << "Failed to read document."; - } - - if (corpus_id != kInvalidCorpusId && corpus_id != doc.corpus_id()) { - filtered++; - continue; - } - if (dpf.Match(0, document_id)) { - filtered++; - continue; - } - - std::string serialized = doc.SerializeAsString(); - uint32_t length = serialized.size(); - if (fwrite(&length, 1, sizeof(length), fp) != sizeof(length)) { - ICING_LOG(FATAL) << "Failed to write length information to file"; - } - - if (fwrite(serialized.data(), 1, serialized.size(), fp) != - serialized.size()) { - ICING_LOG(FATAL) << "Failed to write document to file"; - } - } - ICING_VLOG(1) << IcingStringUtil::StringPrintf( - "Processed %u filtered %d", ds.num_documents(), filtered); - fclose(fp); - } else { - ICING_LOG(ERROR) << IcingStringUtil::StringPrintf("Unknown op %s", op); - return -1; - } - - return 0; -} - -} // namespace lib -} // namespace icing - -int main(int argc, char** argv) { return icing::lib::IcingTool(argc, argv); } diff --git a/icing/util/bit-util.h b/icing/util/bit-util.h index e2bb817..7ca20b4 100644 --- a/icing/util/bit-util.h +++ b/icing/util/bit-util.h @@ -24,19 +24,18 @@ namespace bit_util { // Manipulating bit fields. // -// x value containing the bit field(s) -// offset offset of bit field in x -// len len of bit field in x +// value value containing the bit field(s) +// lsb_offset offset of bit field in value, starting from the least significant +// bit. 
for example, the '1' in '0100' has a lsb_offset of 2 +// len len of bit field in value // // REQUIREMENTS // -// - x an unsigned integer <= 64 bits -// - offset + len <= sizeof(x) * 8 +// - value is an unsigned integer <= 64 bits +// - lsb_offset + len <= sizeof(value) * 8 // // There is no error checking so you will get garbage if you don't // ensure the above. -// -// To set a value, use BITFIELD_CLEAR then BITFIELD_OR. // Shifting by more than the word length is undefined (on ARM it has the // intended effect, but on Intel it shifts by % word length), so check the @@ -44,20 +43,65 @@ namespace bit_util { inline uint64_t BitfieldMask(uint32_t len) { return ((len == 0) ? 0U : ((~uint64_t{0}) >> (64 - (len)))); } -inline uint64_t BitfieldGet(uint64_t mask, uint32_t lsb_offset, uint32_t len) { - return ((mask) >> (lsb_offset)) & BitfieldMask(len); + +inline void BitfieldClear(uint32_t lsb_offset, uint32_t len, + uint8_t* value_out) { + *value_out &= ~(BitfieldMask(len) << lsb_offset); +} + +inline void BitfieldClear(uint32_t lsb_offset, uint32_t len, + uint16_t* value_out) { + *value_out &= ~(BitfieldMask(len) << lsb_offset); +} + +inline void BitfieldClear(uint32_t lsb_offset, uint32_t len, + uint32_t* value_out) { + *value_out &= ~(BitfieldMask(len) << lsb_offset); +} + +inline void BitfieldClear(uint32_t lsb_offset, uint32_t len, + uint64_t* value_out) { + *value_out &= ~(BitfieldMask(len) << lsb_offset); +} + +inline uint64_t BitfieldGet(uint64_t value, uint32_t lsb_offset, uint32_t len) { + return ((value) >> (lsb_offset)) & BitfieldMask(len); +} + +inline void BitfieldSet(uint8_t new_value, uint32_t lsb_offset, uint32_t len, + uint8_t* value_out) { + BitfieldClear(lsb_offset, len, value_out); + + // We conservatively mask new_value at len so value won't be corrupted if + // new_value >= (1 << len). 
+ *value_out |= (new_value & BitfieldMask(len)) << (lsb_offset); +} + +inline void BitfieldSet(uint16_t new_value, uint32_t lsb_offset, uint32_t len, + uint16_t* value_out) { + BitfieldClear(lsb_offset, len, value_out); + + // We conservatively mask new_value at len so value won't be corrupted if + // new_value >= (1 << len). + *value_out |= (new_value & BitfieldMask(len)) << (lsb_offset); } -inline void BitfieldSet(uint32_t value, uint32_t lsb_offset, uint32_t len, - uint32_t* mask) { - // We conservatively mask val at len so x won't be corrupted if val >= - // 1 << len. - *mask |= (uint64_t{value} & BitfieldMask(len)) << (lsb_offset); + +inline void BitfieldSet(uint32_t new_value, uint32_t lsb_offset, uint32_t len, + uint32_t* value_out) { + BitfieldClear(lsb_offset, len, value_out); + + // We conservatively mask new_value at len so value won't be corrupted if + // new_value >= (1 << len). + *value_out |= (new_value & BitfieldMask(len)) << (lsb_offset); } -inline void BitfieldSet(uint64_t value, uint32_t lsb_offset, uint32_t len, - uint64_t* mask) { - // We conservatively mask val at len so x won't be corrupted if val >= - // 1 << len. - *mask |= (value & BitfieldMask(len)) << (lsb_offset); + +inline void BitfieldSet(uint64_t new_value, uint32_t lsb_offset, uint32_t len, + uint64_t* value_out) { + BitfieldClear(lsb_offset, len, value_out); + + // We conservatively mask new_value at len so value won't be corrupted if + // new_value >= (1 << len). + *value_out |= (new_value & BitfieldMask(len)) << (lsb_offset); } } // namespace bit_util diff --git a/icing/util/bit-util_test.cc b/icing/util/bit-util_test.cc new file mode 100644 index 0000000..3b86a21 --- /dev/null +++ b/icing/util/bit-util_test.cc @@ -0,0 +1,145 @@ +// Copyright (C) 2021 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "icing/util/bit-util.h" + +#include <memory> + +#include "gmock/gmock.h" +#include "gtest/gtest.h" + +namespace icing { +namespace lib { +namespace { + +using ::testing::Eq; + +TEST(BitUtilTest, BitfieldMask) { + // Check that we can handle up to uint8_t's + EXPECT_THAT(bit_util::BitfieldMask(/*len=*/0), Eq(0b0)); + EXPECT_THAT(bit_util::BitfieldMask(/*len=*/1), Eq(0b01)); + + // Check that we can handle up to uint32_t's + EXPECT_THAT(bit_util::BitfieldMask(/*len=*/16), Eq(0b01111111111111111)); + + // Check that we can handle up to uint64_t's + EXPECT_THAT( + bit_util::BitfieldMask(/*len=*/63), + Eq(0b0111111111111111111111111111111111111111111111111111111111111111)); +} + +TEST(BitUtilTest, BitfieldClear) { + // Check that we can handle up to uint8_t's + uint8_t value_8 = 0b0; + bit_util::BitfieldClear(/*lsb_offset=*/0, /*len=*/1, &value_8); + EXPECT_THAT(value_8, Eq(0b0)); + + value_8 = 0b01; + bit_util::BitfieldClear(/*lsb_offset=*/0, /*len=*/1, &value_8); + EXPECT_THAT(value_8, Eq(0b00)); + + value_8 = 0b011; + bit_util::BitfieldClear(/*lsb_offset=*/1, /*len=*/1, &value_8); + EXPECT_THAT(value_8, Eq(0b001)); + + value_8 = 0b011; + bit_util::BitfieldClear(/*lsb_offset=*/0, /*len=*/2, &value_8); + EXPECT_THAT(value_8, Eq(0b000)); + + value_8 = 0b0110; + bit_util::BitfieldClear(/*lsb_offset=*/1, /*len=*/2, &value_8); + EXPECT_THAT(value_8, Eq(0b0000)); + + // Check that we can handle up to uint32_t's + uint32_t value_32 = 0b010000000000000000000000; + bit_util::BitfieldClear(/*lsb_offset=*/22, /*len=*/1, &value_32); + 
EXPECT_THAT(value_32, Eq(0b0)); + + // Check that we can handle up to uint64_t's + uint64_t value_64 = 0b0100000000000000000000000000000000000; + bit_util::BitfieldClear(/*lsb_offset=*/35, /*len=*/1, &value_64); + EXPECT_THAT(value_64, Eq(0b0)); +} + +TEST(BitUtilTest, BitfieldGet) { + // Get something in the uint8_t range + EXPECT_THAT(bit_util::BitfieldGet(0b0, /*lsb_offset=*/0, /*len=*/1), Eq(0b0)); + EXPECT_THAT(bit_util::BitfieldGet(0b01, /*lsb_offset=*/0, /*len=*/1), + Eq(0b01)); + EXPECT_THAT(bit_util::BitfieldGet(0b010, /*lsb_offset=*/1, /*len=*/1), + Eq(0b01)); + EXPECT_THAT(bit_util::BitfieldGet(0b001, /*lsb_offset=*/1, /*len=*/1), + Eq(0b0)); + EXPECT_THAT(bit_util::BitfieldGet(0b011, /*lsb_offset=*/0, /*len=*/2), + Eq(0b011)); + EXPECT_THAT(bit_util::BitfieldGet(0b0110, /*lsb_offset=*/1, /*len=*/2), + Eq(0b011)); + EXPECT_THAT(bit_util::BitfieldGet(0b0101, /*lsb_offset=*/0, /*len=*/3), + Eq(0b0101)); + + // Get something in the uint32_t range + EXPECT_THAT( + bit_util::BitfieldGet(0b01000000000000, /*lsb_offset=*/12, /*len=*/1), + Eq(0b01)); + + // Get something in the uint64_t range + EXPECT_THAT(bit_util::BitfieldGet(0b010000000000000000000000000000000000, + /*lsb_offset=*/34, /*len=*/1), + Eq(0b01)); +} + +TEST(BitUtilTest, BitfieldSet) { + // Set something in the uint8_t range + uint8_t value_8 = 0b0; + bit_util::BitfieldSet(0b0, /*lsb_offset=*/0, /*len=*/1, &value_8); + EXPECT_THAT(value_8, Eq(0b0)); + + value_8 = 0b01; + bit_util::BitfieldSet(0b01, /*lsb_offset=*/0, /*len=*/1, &value_8); + EXPECT_THAT(value_8, Eq(0b01)); + + value_8 = 0b00; + bit_util::BitfieldSet(0b01, /*lsb_offset=*/0, /*len=*/1, &value_8); + EXPECT_THAT(value_8, Eq(0b01)); + + value_8 = 0b00; + bit_util::BitfieldSet(0b011, /*lsb_offset=*/0, /*len=*/2, &value_8); + EXPECT_THAT(value_8, Eq(0b011)); + + value_8 = 0b01; + bit_util::BitfieldSet(0b011, /*lsb_offset=*/0, /*len=*/2, &value_8); + EXPECT_THAT(value_8, Eq(0b011)); + + value_8 = 0b01; + bit_util::BitfieldSet(0b01, 
/*lsb_offset=*/1, /*len=*/1, &value_8); + EXPECT_THAT(value_8, Eq(0b011)); + + value_8 = 0b0001; + bit_util::BitfieldSet(0b011, /*lsb_offset=*/1, /*len=*/2, &value_8); + EXPECT_THAT(value_8, Eq(0b0111)); + + // Set something in the uint32_t range + uint32_t value_32 = 0b0; + bit_util::BitfieldSet(0b01, /*lsb_offset=*/16, /*len=*/1, &value_32); + EXPECT_THAT(value_32, Eq(0b010000000000000000)); + + // Set something in the uint64_t range + uint64_t value_64 = 0b0; + bit_util::BitfieldSet(0b01, /*lsb_offset=*/34, /*len=*/1, &value_64); + EXPECT_THAT(value_64, Eq(0b010000000000000000000000000000000000)); +} + +} // namespace +} // namespace lib +} // namespace icing diff --git a/synced_AOSP_CL_number.txt b/synced_AOSP_CL_number.txt index 6f5faa0..4069810 100644 --- a/synced_AOSP_CL_number.txt +++ b/synced_AOSP_CL_number.txt @@ -1 +1 @@ -set(synced_AOSP_CL_number=373174102) +set(synced_AOSP_CL_number=375495869) |