author     Tim Barron <tjbarron@google.com>    2022-04-12 14:30:14 -0700
committer  Tim Barron <tjbarron@google.com>    2022-04-12 14:36:38 -0700
commit     d5c9ae94052a0f2f1b9ddec9dbbe502bc4f11d54 (patch)
tree       90b929dc92d5874b5c15caca064401196ab4fc65
parent     beff93fe1f5165aeeb871d9711963aa1846299ae (diff)
download   icing-d5c9ae94052a0f2f1b9ddec9dbbe502bc4f11d54.tar.gz
Sync from upstream.
======================================================================
Refactor DocumentStore::Initialize to improve readability of document store recovery.
======================================================================
Remove non-NDK API usages of ICU4C in libicing.
======================================================================
Move IcuDataFileHelper to the testing directory since it is a test-only util.
======================================================================
Support dump function for DocumentStore
======================================================================
Switch to use PRead rather than MMap in the proto log.
======================================================================
Support dump function for main/lite index and lexicon
======================================================================
Fix LiteIndex::AppendHits
======================================================================
Enable and fix DocumentStoreTest.LoadScoreCacheAndInitializeSuccessfully
======================================================================
Fix MainIndex::GetStorageInfo.
======================================================================
Fix icing-search-engine_fuzz_test by making IcuLanguageSegmenterIterator::Advance non-recursive.
======================================================================
Allow returning additional information for deleted documents in DeleteByQuery
======================================================================
Use enum class in Token::Type for better type safety.
======================================================================
Bug: 158089703
Bug: 185845269
Bug: 209071710
Bug: 211785521
Bug: 218413237
Bug: 223549255
Change-Id: Id2786047ab279734bdd2aee883e82607b6a0e403
57 files changed, 1469 insertions(+), 753 deletions(-)
diff --git a/icing/file/file-backed-bitmap.cc b/icing/file/file-backed-bitmap.cc
index f1e568c..eec7668 100644
--- a/icing/file/file-backed-bitmap.cc
+++ b/icing/file/file-backed-bitmap.cc
@@ -50,7 +50,7 @@ FileBackedBitmap::Create(const Filesystem* filesystem,
   auto bitmap = std::unique_ptr<FileBackedBitmap>(
       new FileBackedBitmap(filesystem, file_path, mmap_strategy));
 
-  // TODO(b/144458732): Implement a more robust version of TC_RETURN_IF_ERROR
+  // TODO(b/216487496): Implement a more robust version of TC_RETURN_IF_ERROR
   // that can support error logging.
   libtextclassifier3::Status status = bitmap->Initialize();
   if (!status.ok()) {
@@ -122,7 +122,7 @@ libtextclassifier3::Status FileBackedBitmap::Initialize() {
                << " of size: " << file_size;
   }
 
-  // TODO(b/144458732): Implement a more robust version of TC_RETURN_IF_ERROR
+  // TODO(b/216487496): Implement a more robust version of TC_RETURN_IF_ERROR
   // that can support error logging.
   libtextclassifier3::Status status = mmapper_->Remap(0, file_size);
   if (!status.ok()) {
@@ -198,7 +198,7 @@ int FileBackedBitmap::NumBits() const {
 libtextclassifier3::Status FileBackedBitmap::Set(int bit_index,
                                                  bool bit_value) {
   if (bit_index >= NumBits()) {
-    // TODO(b/144458732): Implement a more robust version of TC_RETURN_IF_ERROR
+    // TODO(b/216487496): Implement a more robust version of TC_RETURN_IF_ERROR
     // that can support error logging.
     libtextclassifier3::Status status = GrowTo(bit_index);
     if (!status.ok()) {
@@ -261,7 +261,7 @@ libtextclassifier3::Status FileBackedBitmap::GrowTo(int new_num_bits) {
         file_path_.c_str(), new_file_size));
   }
 
-  // TODO(b/144458732): Implement a more robust version of TC_RETURN_IF_ERROR
+  // TODO(b/216487496): Implement a more robust version of TC_RETURN_IF_ERROR
   // that can support error logging.
   libtextclassifier3::Status status = mmapper_->Remap(0, new_file_size);
   if (!status.ok()) {
@@ -281,7 +281,7 @@ libtextclassifier3::Status FileBackedBitmap::TruncateTo(int new_num_bits) {
   }
   const size_t new_file_size = FileSizeForBits(new_num_bits);
 
-  // TODO(b/144458732): Implement a more robust version of TC_RETURN_IF_ERROR
+  // TODO(b/216487496): Implement a more robust version of TC_RETURN_IF_ERROR
   // that can support error logging.
   libtextclassifier3::Status status = mmapper_->Remap(0, new_file_size);
   if (!status.ok()) {
diff --git a/icing/file/filesystem.h b/icing/file/filesystem.h
index ca8c4a8..dd2c5d1 100644
--- a/icing/file/filesystem.h
+++ b/icing/file/filesystem.h
@@ -233,6 +233,11 @@ class Filesystem {
   // Increments to_increment by size if size is valid, or sets to_increment
   // to kBadFileSize if either size or to_increment is kBadFileSize.
   static void IncrementByOrSetInvalid(int64_t size, int64_t* to_increment);
+
+  // Return -1 if file_size is invalid. Otherwise, return file_size.
+  static int64_t SanitizeFileSize(int64_t file_size) {
+    return (file_size != kBadFileSize) ? file_size : -1;
+  }
 };
 
 // LINT.ThenChange(//depot/google3/icing/file/mock-filesystem.h)
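The new SanitizeFileSize helper folds a branch that several later hunks delete at their call sites. The transformation, condensed from the Optimize()/GetStorageInfo() hunks further down (set_storage_size is a stand-in for the specific proto setters used there):

    // Before: every caller translated the kBadFileSize sentinel by hand.
    int64_t size = filesystem_->GetDiskUsage(options_.base_dir().c_str());
    if (size != Filesystem::kBadFileSize) {
      stats->set_storage_size(size);
    } else {
      // Set -1 as a sentinel value when failures occur.
      stats->set_storage_size(-1);
    }

    // After: the sentinel translation lives in one place.
    stats->set_storage_size(Filesystem::SanitizeFileSize(size));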
diff --git a/icing/file/portable-file-backed-proto-log.h b/icing/file/portable-file-backed-proto-log.h
index f676dc5..409ab96 100644
--- a/icing/file/portable-file-backed-proto-log.h
+++ b/icing/file/portable-file-backed-proto-log.h
@@ -124,6 +124,8 @@ class PortableFileBackedProtoLog {
  public:
   static constexpr int32_t kMagic = 0xf4c6f67a;
 
+  // We should go directly from 0 to 2 the next time we have to change the
+  // format.
   static constexpr int32_t kFileFormatVersion = 0;
 
   uint32_t CalculateHeaderChecksum() const {
@@ -282,7 +284,7 @@ class PortableFileBackedProtoLog {
     // before updating our checksum.
     bool recalculated_checksum = false;
 
-    bool has_data_loss() {
+    bool has_data_loss() const {
       return data_loss == DataLoss::PARTIAL || data_loss == DataLoss::COMPLETE;
     }
   };
@@ -376,8 +378,7 @@ class PortableFileBackedProtoLog {
   //    }
   class Iterator {
    public:
-    Iterator(const Filesystem& filesystem, const std::string& file_path,
-             int64_t initial_offset);
+    Iterator(const Filesystem& filesystem, int fd, int64_t initial_offset);
 
     // Advances to the position of next proto whether it has been erased or
     // not.
     //
@@ -393,11 +394,12 @@ class PortableFileBackedProtoLog {
    private:
     static constexpr int64_t kInvalidOffset = -1;
     // Used to read proto metadata
-    MemoryMappedFile mmapped_file_;
     // Offset of first proto
+    const Filesystem* const filesystem_;
     int64_t initial_offset_;
     int64_t current_offset_;
     int64_t file_size_;
+    int fd_;
   };
 
   // Returns an iterator of current proto log. The caller needs to keep the
@@ -513,7 +515,7 @@ class PortableFileBackedProtoLog {
       const Filesystem* filesystem, const std::string& file_path,
       Crc32 initial_crc, int64_t start, int64_t end);
 
-  // Reads out the metadata of a proto located at file_offset from the file.
+  // Reads out the metadata of a proto located at file_offset from the fd.
   // Metadata will be returned in host byte order endianness.
   //
   // Returns:
@@ -521,7 +523,8 @@ class PortableFileBackedProtoLog {
   //   OUT_OF_RANGE_ERROR if file_offset exceeds file_size
   //   INTERNAL_ERROR if the metadata is invalid or any IO errors happen
   static libtextclassifier3::StatusOr<int32_t> ReadProtoMetadata(
-      MemoryMappedFile* mmapped_file, int64_t file_offset, int64_t file_size);
+      const Filesystem* const filesystem, int fd, int64_t file_offset,
+      int64_t file_size);
 
   // Writes metadata of a proto to the fd. Takes in a host byte order endianness
   // metadata and converts it into a portable metadata before writing.
@@ -937,35 +940,37 @@ template <typename ProtoT>
 libtextclassifier3::StatusOr<ProtoT>
 PortableFileBackedProtoLog<ProtoT>::ReadProto(int64_t file_offset) const {
   int64_t file_size = filesystem_->GetFileSize(fd_.get());
-  MemoryMappedFile mmapped_file(*filesystem_, file_path_,
-                                MemoryMappedFile::Strategy::READ_ONLY);
-  if (file_offset >= file_size) {
-    // file_size points to the next byte to write at, so subtract one to get
-    // the inclusive, actual size of file.
-    return absl_ports::OutOfRangeError(
-        IcingStringUtil::StringPrintf("Trying to read from a location, %lld, "
-                                      "out of range of the file size, %lld",
-                                      static_cast<long long>(file_offset),
-                                      static_cast<long long>(file_size - 1)));
-  }
-
-  // Read out the metadata
+  if (file_size == Filesystem::kBadFileSize) {
+    return absl_ports::OutOfRangeError("Unable to correctly read size.");
+  }
   ICING_ASSIGN_OR_RETURN(
       int32_t metadata,
-      ReadProtoMetadata(&mmapped_file, file_offset, file_size));
+      ReadProtoMetadata(filesystem_, fd_.get(), file_offset, file_size));
 
   // Copy out however many bytes it says the proto is
   int stored_size = GetProtoSize(metadata);
+  file_offset += sizeof(metadata);
 
-  ICING_RETURN_IF_ERROR(
-      mmapped_file.Remap(file_offset + sizeof(metadata), stored_size));
+  // Read the compressed proto out.
+  if (file_offset + stored_size > file_size) {
+    return absl_ports::OutOfRangeError(
+        IcingStringUtil::StringPrintf("Trying to read from a location, %lld, "
+                                      "out of range of the file size, %lld",
+                                      static_cast<long long>(file_offset),
+                                      static_cast<long long>(file_size - 1)));
+  }
+  auto buf = std::make_unique<char[]>(stored_size);
+  if (!filesystem_->PRead(fd_.get(), buf.get(), stored_size, file_offset)) {
+    return absl_ports::InternalError("");
+  }
 
-  if (IsEmptyBuffer(mmapped_file.region(), mmapped_file.region_size())) {
+  if (IsEmptyBuffer(buf.get(), stored_size)) {
     return absl_ports::NotFoundError("The proto data has been erased.");
   }
 
-  google::protobuf::io::ArrayInputStream proto_stream(
-      mmapped_file.mutable_region(), stored_size);
+  google::protobuf::io::ArrayInputStream proto_stream(buf.get(),
+                                                      stored_size);
 
   // Deserialize proto
   ProtoT proto;
@@ -983,33 +988,29 @@ template <typename ProtoT>
 libtextclassifier3::Status PortableFileBackedProtoLog<ProtoT>::EraseProto(
     int64_t file_offset) {
   int64_t file_size = filesystem_->GetFileSize(fd_.get());
-  if (file_offset >= file_size) {
-    // file_size points to the next byte to write at, so subtract one to get
-    // the inclusive, actual size of file.
-    return absl_ports::OutOfRangeError(IcingStringUtil::StringPrintf(
-        "Trying to erase data at a location, %lld, "
-        "out of range of the file size, %lld",
-        static_cast<long long>(file_offset),
-        static_cast<long long>(file_size - 1)));
+  if (file_size == Filesystem::kBadFileSize) {
+    return absl_ports::OutOfRangeError("Unable to correctly read size.");
   }
-  MemoryMappedFile mmapped_file(
-      *filesystem_, file_path_,
-      MemoryMappedFile::Strategy::READ_WRITE_AUTO_SYNC);
-
-  // Read out the metadata
   ICING_ASSIGN_OR_RETURN(
       int32_t metadata,
-      ReadProtoMetadata(&mmapped_file, file_offset, file_size));
-
-  ICING_RETURN_IF_ERROR(mmapped_file.Remap(file_offset + sizeof(metadata),
-                                           GetProtoSize(metadata)));
+      ReadProtoMetadata(filesystem_, fd_.get(), file_offset, file_size));
+  // Copy out however many bytes it says the proto is
+  int stored_size = GetProtoSize(metadata);
+  file_offset += sizeof(metadata);
+  if (file_offset + stored_size > file_size) {
+    return absl_ports::OutOfRangeError(
+        IcingStringUtil::StringPrintf("Trying to read from a location, %lld, "
+                                      "out of range of the file size, %lld",
+                                      static_cast<long long>(file_offset),
+                                      static_cast<long long>(file_size - 1)));
+  }
+  auto buf = std::make_unique<char[]>(stored_size);
 
   // We need to update the crc checksum if the erased area is before the
   // rewind position.
   int32_t new_crc;
-  int64_t erased_proto_offset = file_offset + sizeof(metadata);
-  if (erased_proto_offset < header_->GetRewindOffset()) {
+  if (file_offset < header_->GetRewindOffset()) {
     // Set to "dirty" before we start writing anything.
     header_->SetDirtyFlag(true);
     header_->SetHeaderChecksum(header_->CalculateHeaderChecksum());
@@ -1022,24 +1023,30 @@ libtextclassifier3::Status PortableFileBackedProtoLog<ProtoT>::EraseProto(
     // We need to calculate [original string xor 0s].
     // The xored string is the same as the original string because 0 xor 0 =
     // 0, 1 xor 0 = 1.
-    const std::string_view xored_str(mmapped_file.region(),
-                                     mmapped_file.region_size());
+    // Read the compressed proto out.
+    if (!filesystem_->PRead(fd_.get(), buf.get(), stored_size, file_offset)) {
+      return absl_ports::InternalError("");
+    }
+    const std::string_view xored_str(buf.get(), stored_size);
 
     Crc32 crc(header_->GetLogChecksum());
     ICING_ASSIGN_OR_RETURN(
-        new_crc, crc.UpdateWithXor(
-                     xored_str,
-                     /*full_data_size=*/header_->GetRewindOffset() -
-                         kHeaderReservedBytes,
-                     /*position=*/erased_proto_offset - kHeaderReservedBytes));
+        new_crc,
+        crc.UpdateWithXor(xored_str,
+                          /*full_data_size=*/header_->GetRewindOffset() -
+                              kHeaderReservedBytes,
+                          /*position=*/file_offset - kHeaderReservedBytes));
   }
 
   // Clear the region.
-  memset(mmapped_file.mutable_region(), '\0', mmapped_file.region_size());
+  memset(buf.get(), '\0', stored_size);
+  if (!filesystem_->PWrite(fd_.get(), file_offset, buf.get(), stored_size)) {
+    return absl_ports::InternalError("");
+  }
 
   // If we cleared something in our checksummed area, we should update our
   // checksum and reset our dirty bit.
-  if (erased_proto_offset < header_->GetRewindOffset()) {
+  if (file_offset < header_->GetRewindOffset()) {
     header_->SetDirtyFlag(false);
     header_->SetLogChecksum(new_crc);
     header_->SetHeaderChecksum(header_->CalculateHeaderChecksum());
@@ -1077,13 +1084,12 @@ PortableFileBackedProtoLog<ProtoT>::GetElementsFileSize() const {
 
 template <typename ProtoT>
 PortableFileBackedProtoLog<ProtoT>::Iterator::Iterator(
-    const Filesystem& filesystem, const std::string& file_path,
-    int64_t initial_offset)
-    : mmapped_file_(filesystem, file_path,
-                    MemoryMappedFile::Strategy::READ_ONLY),
+    const Filesystem& filesystem, int fd, int64_t initial_offset)
+    : filesystem_(&filesystem),
       initial_offset_(initial_offset),
      current_offset_(kInvalidOffset),
-      file_size_(filesystem.GetFileSize(file_path.c_str())) {
+      fd_(fd) {
+  file_size_ = filesystem_->GetFileSize(fd_);
   if (file_size_ == Filesystem::kBadFileSize) {
     // Fails all Advance() calls
     file_size_ = 0;
@@ -1100,7 +1106,7 @@ PortableFileBackedProtoLog<ProtoT>::Iterator::Advance() {
     // Jumps to the next proto position
     ICING_ASSIGN_OR_RETURN(
         int32_t metadata,
-        ReadProtoMetadata(&mmapped_file_, current_offset_, file_size_));
+        ReadProtoMetadata(filesystem_, fd_, current_offset_, file_size_));
     current_offset_ += sizeof(metadata) + GetProtoSize(metadata);
   }
 
@@ -1122,14 +1128,15 @@ int64_t PortableFileBackedProtoLog<ProtoT>::Iterator::GetOffset() {
 template <typename ProtoT>
 typename PortableFileBackedProtoLog<ProtoT>::Iterator
 PortableFileBackedProtoLog<ProtoT>::GetIterator() {
-  return Iterator(*filesystem_, file_path_,
+  return Iterator(*filesystem_, fd_.get(),
                   /*initial_offset=*/kHeaderReservedBytes);
 }
 
 template <typename ProtoT>
 libtextclassifier3::StatusOr<int32_t>
 PortableFileBackedProtoLog<ProtoT>::ReadProtoMetadata(
-    MemoryMappedFile* mmapped_file, int64_t file_offset, int64_t file_size) {
+    const Filesystem* const filesystem, int fd, int64_t file_offset,
+    int64_t file_size) {
   // Checks file_offset
   if (file_offset >= file_size) {
     return absl_ports::OutOfRangeError(IcingStringUtil::StringPrintf(
@@ -1147,9 +1154,9 @@ PortableFileBackedProtoLog<ProtoT>::ReadProtoMetadata(
         static_cast<long long>(file_size)));
   }
 
-  // Reads metadata
-  ICING_RETURN_IF_ERROR(mmapped_file->Remap(file_offset, metadata_size));
-  memcpy(&portable_metadata, mmapped_file->region(), metadata_size);
+  if (!filesystem->PRead(fd, &portable_metadata, metadata_size, file_offset)) {
+    return absl_ports::InternalError("");
+  }
 
   // Need to switch it back to host order endianness after reading from disk.
   int32_t host_order_metadata = GNetworkToHostL(portable_metadata);
diff --git a/icing/file/portable-file-backed-proto-log_benchmark.cc b/icing/file/portable-file-backed-proto-log_benchmark.cc
index f83ccd6..80a8011 100644
--- a/icing/file/portable-file-backed-proto-log_benchmark.cc
+++ b/icing/file/portable-file-backed-proto-log_benchmark.cc
@@ -55,7 +55,7 @@ namespace lib {
 namespace {
 
-static void BM_Write(benchmark::State& state) {
+void BM_Write(benchmark::State& state) {
   const Filesystem filesystem;
   int string_length = state.range(0);
   const std::string file_path = IcingStringUtil::StringPrintf(
@@ -108,7 +108,7 @@ BENCHMARK(BM_Write)
 // 16MiB, and we need some extra space for the
 // rest of the document properties
 
-static void BM_Read(benchmark::State& state) {
+void BM_Read(benchmark::State& state) {
   const Filesystem filesystem;
   int string_length = state.range(0);
   const std::string file_path = IcingStringUtil::StringPrintf(
@@ -164,7 +164,7 @@ BENCHMARK(BM_Read)
 // 16MiB, and we need some extra space for the
 // rest of the document properties
 //
-static void BM_Erase(benchmark::State& state) {
+void BM_Erase(benchmark::State& state) {
   const Filesystem filesystem;
   const std::string file_path = IcingStringUtil::StringPrintf(
       "%s%s", GetTestTempDir().c_str(), "/proto.log");
@@ -204,7 +204,7 @@ static void BM_Erase(benchmark::State& state) {
 }
 BENCHMARK(BM_Erase);
 
-static void BM_ComputeChecksum(benchmark::State& state) {
+void BM_ComputeChecksum(benchmark::State& state) {
   const Filesystem filesystem;
   const std::string file_path = GetTestTempDir() + "/proto.log";
   int max_proto_size = (1 << 24) - 1;  // 16 MiB
@@ -246,7 +246,7 @@ static void BM_ComputeChecksum(benchmark::State& state) {
 }
 BENCHMARK(BM_ComputeChecksum)->Range(1024, 1 << 20);
 
-static void BM_ComputeChecksumWithCachedChecksum(benchmark::State& state) {
+void BM_ComputeChecksumWithCachedChecksum(benchmark::State& state) {
   const Filesystem filesystem;
   const std::string file_path = GetTestTempDir() + "/proto.log";
   int max_proto_size = (1 << 24) - 1;  // 16 MiB
@@ -290,7 +290,7 @@ static void BM_ComputeChecksumWithCachedChecksum(benchmark::State& state) {
 }
 BENCHMARK(BM_ComputeChecksumWithCachedChecksum);
 
-static void BM_ComputeChecksumOnlyForTail(benchmark::State& state) {
+void BM_ComputeChecksumOnlyForTail(benchmark::State& state) {
   const Filesystem filesystem;
   const std::string file_path = GetTestTempDir() + "/proto.log";
   int max_proto_size = (1 << 24) - 1;  // 16 MiB
diff --git a/icing/file/portable-file-backed-proto-log_test.cc b/icing/file/portable-file-backed-proto-log_test.cc
index b5fee4b..795271a 100644
--- a/icing/file/portable-file-backed-proto-log_test.cc
+++ b/icing/file/portable-file-backed-proto-log_test.cc
@@ -851,11 +851,12 @@ TEST_F(PortableFileBackedProtoLogTest, Iterator) {
 
   {
     // Iterator with bad filesystem
+    ScopedFd sfd(filesystem_.OpenForRead(file_path_.c_str()));
     MockFilesystem mock_filesystem;
-    ON_CALL(mock_filesystem, GetFileSize(A<const char*>()))
+    ON_CALL(mock_filesystem, GetFileSize(A<int>()))
         .WillByDefault(Return(Filesystem::kBadFileSize));
     PortableFileBackedProtoLog<DocumentProto>::Iterator bad_iterator(
-        mock_filesystem, file_path_, /*initial_offset=*/0);
+        mock_filesystem, sfd.get(), /*initial_offset=*/0);
     ASSERT_THAT(bad_iterator.Advance(),
                 StatusIs(libtextclassifier3::StatusCode::OUT_OF_RANGE));
   }
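The mmap-to-PRead migration above repeats one pattern throughout: validate the file size, bounds-check the offset, then read into a private heap buffer. A condensed sketch of that pattern, using Filesystem::PRead(fd, buf, size, offset) as the hunks above do (record_size and offset stand in for stored_size and file_offset):

    int64_t file_size = filesystem->GetFileSize(fd);
    if (file_size == Filesystem::kBadFileSize) {
      return absl_ports::OutOfRangeError("Unable to correctly read size.");
    }
    if (offset + record_size > file_size) {
      return absl_ports::OutOfRangeError("Read past the end of the file.");
    }
    auto buf = std::make_unique<char[]>(record_size);
    if (!filesystem->PRead(fd, buf.get(), record_size, offset)) {
      return absl_ports::InternalError("");  // I/O error
    }
    // buf now holds the record; no Remap()/region() lifetime to manage.

Each read is an explicit syscall into a buffer the caller owns, so there is no mapped region whose validity depends on the file staying the same size.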
diff --git a/icing/icing-search-engine.cc b/icing/icing-search-engine.cc
index 6acdd31..952ba21 100644
--- a/icing/icing-search-engine.cc
+++ b/icing/icing-search-engine.cc
@@ -18,6 +18,7 @@
 #include <memory>
 #include <string>
 #include <string_view>
+#include <unordered_map>
 #include <utility>
 #include <vector>
@@ -88,6 +89,23 @@ constexpr std::string_view kOptimizeStatusFilename = "optimize_status";
 // fresh state.
 constexpr int kMaxUnsuccessfulInitAttempts = 5;
 
+// A pair that holds namespace and type.
+struct NamespaceTypePair {
+  std::string namespace_;
+  std::string type;
+
+  bool operator==(const NamespaceTypePair& other) const {
+    return namespace_ == other.namespace_ && type == other.type;
+  }
+};
+
+struct NamespaceTypePairHasher {
+  std::size_t operator()(const NamespaceTypePair& pair) const {
+    return std::hash<std::string>()(pair.namespace_) ^
+           std::hash<std::string>()(pair.type);
+  }
+};
+
 libtextclassifier3::Status ValidateResultSpec(
     const ResultSpecProto& result_spec) {
   if (result_spec.num_per_page() < 0) {
@@ -255,6 +273,28 @@ void TransformStatus(const libtextclassifier3::Status& internal_status,
   status_proto->set_message(internal_status.error_message());
 }
 
+libtextclassifier3::Status RetrieveAndAddDocumentInfo(
+    const DocumentStore* document_store, DeleteByQueryResultProto& result_proto,
+    std::unordered_map<NamespaceTypePair,
+                       DeleteByQueryResultProto::DocumentGroupInfo*,
+                       NamespaceTypePairHasher>& info_map,
+    DocumentId document_id) {
+  ICING_ASSIGN_OR_RETURN(DocumentProto document,
+                         document_store->Get(document_id));
+  NamespaceTypePair key = {document.namespace_(), document.schema()};
+  auto iter = info_map.find(key);
+  if (iter == info_map.end()) {
+    auto entry = result_proto.add_deleted_documents();
+    entry->set_namespace_(std::move(document.namespace_()));
+    entry->set_schema(std::move(document.schema()));
+    entry->add_uris(std::move(document.uri()));
+    info_map[key] = entry;
+  } else {
+    iter->second->add_uris(std::move(document.uri()));
+  }
+  return libtextclassifier3::Status::OK;
+}
+
 }  // namespace
 
 IcingSearchEngine::IcingSearchEngine(const IcingSearchEngineOptions& options,
@@ -443,8 +483,6 @@ libtextclassifier3::Status IcingSearchEngine::InitializeMembers(
     // last tried to set the schema.
     ICING_RETURN_IF_ERROR(InitializeDocumentStore(
         /*force_recovery_and_revalidate_documents=*/true, initialize_stats));
-    initialize_stats->set_document_store_recovery_cause(
-        InitializeStatsProto::SCHEMA_CHANGES_OUT_OF_SYNC);
 
     // We're going to need to build the index from scratch. So just delete its
     // files now.
@@ -934,7 +972,7 @@ DeleteResultProto IcingSearchEngine::Delete(const std::string_view name_space,
   delete_stats->set_delete_type(DeleteStatsProto::DeleteType::SINGLE);
 
   std::unique_ptr<Timer> delete_timer = clock_->GetNewTimer();
-  // TODO(b/144458732): Implement a more robust version of TC_RETURN_IF_ERROR
+  // TODO(b/216487496): Implement a more robust version of TC_RETURN_IF_ERROR
   // that can support error logging.
   libtextclassifier3::Status status = document_store_->Delete(name_space, uri);
   if (!status.ok()) {
@@ -968,7 +1006,7 @@ DeleteByNamespaceResultProto IcingSearchEngine::DeleteByNamespace(
   delete_stats->set_delete_type(DeleteStatsProto::DeleteType::NAMESPACE);
 
   std::unique_ptr<Timer> delete_timer = clock_->GetNewTimer();
-  // TODO(b/144458732): Implement a more robust version of TC_RETURN_IF_ERROR
+  // TODO(b/216487496): Implement a more robust version of TC_RETURN_IF_ERROR
   // that can support error logging.
   DocumentStore::DeleteByGroupResult doc_store_result =
       document_store_->DeleteByNamespace(name_space);
@@ -1002,7 +1040,7 @@ DeleteBySchemaTypeResultProto IcingSearchEngine::DeleteBySchemaType(
   delete_stats->set_delete_type(DeleteStatsProto::DeleteType::SCHEMA_TYPE);
 
   std::unique_ptr<Timer> delete_timer = clock_->GetNewTimer();
-  // TODO(b/144458732): Implement a more robust version of TC_RETURN_IF_ERROR
+  // TODO(b/216487496): Implement a more robust version of TC_RETURN_IF_ERROR
   // that can support error logging.
   DocumentStore::DeleteByGroupResult doc_store_result =
       document_store_->DeleteBySchemaType(schema_type);
@@ -1020,7 +1058,7 @@ DeleteBySchemaTypeResultProto IcingSearchEngine::DeleteBySchemaType(
 }
 
 DeleteByQueryResultProto IcingSearchEngine::DeleteByQuery(
-    const SearchSpecProto& search_spec) {
+    const SearchSpecProto& search_spec, bool return_deleted_document_info) {
   ICING_VLOG(1) << "Deleting documents for query " << search_spec.query()
                 << " from doc store";
@@ -1074,12 +1112,27 @@ DeleteByQueryResultProto IcingSearchEngine::DeleteByQuery(
   ICING_VLOG(2) << "Deleting the docs that matched the query.";
   int num_deleted = 0;
+  // A map used to group deleted documents.
+  // From the (namespace, type) pair to a list of uris.
+  std::unordered_map<NamespaceTypePair,
+                     DeleteByQueryResultProto::DocumentGroupInfo*,
+                     NamespaceTypePairHasher>
+      deleted_info_map;
 
   component_timer = clock_->GetNewTimer();
   while (query_results.root_iterator->Advance().ok()) {
     ICING_VLOG(3) << "Deleting doc "
                   << query_results.root_iterator->doc_hit_info().document_id();
     ++num_deleted;
+    if (return_deleted_document_info) {
+      status = RetrieveAndAddDocumentInfo(
+          document_store_.get(), result_proto, deleted_info_map,
+          query_results.root_iterator->doc_hit_info().document_id());
+      if (!status.ok()) {
+        TransformStatus(status, result_status);
+        return result_proto;
+      }
+    }
     status = document_store_->Delete(
         query_results.root_iterator->doc_hit_info().document_id());
     if (!status.ok()) {
@@ -1148,12 +1201,8 @@ OptimizeResultProto IcingSearchEngine::Optimize() {
   std::unique_ptr<Timer> optimize_timer = clock_->GetNewTimer();
   OptimizeStatsProto* optimize_stats = result_proto.mutable_optimize_stats();
   int64_t before_size = filesystem_->GetDiskUsage(options_.base_dir().c_str());
-  if (before_size != Filesystem::kBadFileSize) {
-    optimize_stats->set_storage_size_before(before_size);
-  } else {
-    // Set -1 as a sentinel value when failures occur.
-    optimize_stats->set_storage_size_before(-1);
-  }
+  optimize_stats->set_storage_size_before(
+      Filesystem::SanitizeFileSize(before_size));
 
   // Flushes data to disk before doing optimization
   auto status = InternalPersistToDisk(PersistType::FULL);
@@ -1230,12 +1279,8 @@ OptimizeResultProto IcingSearchEngine::Optimize() {
   optimize_status_file.Write(std::move(optimize_status));
 
   int64_t after_size = filesystem_->GetDiskUsage(options_.base_dir().c_str());
-  if (after_size != Filesystem::kBadFileSize) {
-    optimize_stats->set_storage_size_after(after_size);
-  } else {
-    // Set -1 as a sentinel value when failures occur.
-    optimize_stats->set_storage_size_after(-1);
-  }
+  optimize_stats->set_storage_size_after(
+      Filesystem::SanitizeFileSize(after_size));
 
   optimize_stats->set_latency_ms(optimize_timer->GetElapsedMilliseconds());
   TransformStatus(optimization_status, result_status);
@@ -1317,11 +1362,8 @@ StorageInfoResultProto IcingSearchEngine::GetStorageInfo() {
   }
 
   int64_t index_size = filesystem_->GetDiskUsage(options_.base_dir().c_str());
-  if (index_size != Filesystem::kBadFileSize) {
-    result.mutable_storage_info()->set_total_storage_size(index_size);
-  } else {
-    result.mutable_storage_info()->set_total_storage_size(-1);
-  }
+  result.mutable_storage_info()->set_total_storage_size(
+      Filesystem::SanitizeFileSize(index_size));
   *result.mutable_storage_info()->mutable_document_storage_info() =
       document_store_->GetStorageInfo();
   *result.mutable_storage_info()->mutable_schema_store_storage_info() =
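RetrieveAndAddDocumentInfo above groups deleted URIs under their (namespace, schema) pair. The shape of the lookup, pulled out of the hunks for readability (all names as defined in this file):

    std::unordered_map<NamespaceTypePair,
                       DeleteByQueryResultProto::DocumentGroupInfo*,
                       NamespaceTypePairHasher>
        deleted_info_map;
    // The first URI seen for a (namespace, schema) pair creates the group in
    // the result proto and caches its pointer; later URIs append to the
    // cached group instead of re-searching the repeated field.

Note that NamespaceTypePairHasher XORs the two string hashes, so ("a", "b") and ("b", "a") collide; that only costs an extra operator== comparison, not correctness.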
diff --git a/icing/icing-search-engine.h b/icing/icing-search-engine.h
index 0a79714..ff9c7fb 100644
--- a/icing/icing-search-engine.h
+++ b/icing/icing-search-engine.h
@@ -280,8 +280,9 @@ class IcingSearchEngine {
   //   NOT_FOUND if the query doesn't match any documents
   //   FAILED_PRECONDITION IcingSearchEngine has not been initialized yet
   //   INTERNAL_ERROR on IO error
-  DeleteByQueryResultProto DeleteByQuery(const SearchSpecProto& search_spec)
-      ICING_LOCKS_EXCLUDED(mutex_);
+  DeleteByQueryResultProto DeleteByQuery(
+      const SearchSpecProto& search_spec,
+      bool return_deleted_document_info = false) ICING_LOCKS_EXCLUDED(mutex_);
 
   // Retrieves, scores, ranks, and returns the results according to the specs.
   // Results can be empty. If there're multiple pages of results,
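The defaulted parameter keeps existing call sites source-compatible; callers opt in to the extra information. A usage sketch, mirroring the new test below:

    SearchSpecProto search_spec;
    search_spec.set_query("message");
    search_spec.set_term_match_type(TermMatchType::EXACT_ONLY);

    DeleteByQueryResultProto result =
        icing.DeleteByQuery(search_spec, /*return_deleted_document_info=*/true);
    for (const auto& group : result.deleted_documents()) {
      // group.namespace_(), group.schema(), and group.uris() identify the
      // deleted documents, grouped by (namespace, schema).
    }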
body1") + .SetCreationTimestampMs(kDefaultCreationTimestampMs) + .Build(); + DocumentProto document2 = + DocumentBuilder() + .SetKey("namespace2", "uri2") + .SetSchema("Message") + .AddStringProperty("body", "message body2") + .SetCreationTimestampMs(kDefaultCreationTimestampMs) + .Build(); + DocumentProto document3 = + DocumentBuilder() + .SetKey("namespace2", "uri3") + .SetSchema("Message") + .AddStringProperty("body", "message body3") + .SetCreationTimestampMs(kDefaultCreationTimestampMs) + .Build(); + + auto fake_clock = std::make_unique<FakeClock>(); + fake_clock->SetTimerElapsedMilliseconds(7); + TestIcingSearchEngine icing(GetDefaultIcingOptions(), + std::make_unique<Filesystem>(), + std::make_unique<IcingFilesystem>(), + std::move(fake_clock), GetTestJniCache()); + ASSERT_THAT(icing.Initialize().status(), ProtoIsOk()); + ASSERT_THAT(icing.SetSchema(CreateMessageSchema()).status(), ProtoIsOk()); + ASSERT_THAT(icing.Put(document1).status(), ProtoIsOk()); + ASSERT_THAT(icing.Put(document2).status(), ProtoIsOk()); + ASSERT_THAT(icing.Put(document3).status(), ProtoIsOk()); + + GetResultProto expected_get_result_proto; + expected_get_result_proto.mutable_status()->set_code(StatusProto::OK); + *expected_get_result_proto.mutable_document() = document1; + EXPECT_THAT( + icing.Get("namespace1", "uri1", GetResultSpecProto::default_instance()), + EqualsProto(expected_get_result_proto)); + + *expected_get_result_proto.mutable_document() = document2; + EXPECT_THAT( + icing.Get("namespace2", "uri2", GetResultSpecProto::default_instance()), + EqualsProto(expected_get_result_proto)); + + *expected_get_result_proto.mutable_document() = document3; + EXPECT_THAT( + icing.Get("namespace2", "uri3", GetResultSpecProto::default_instance()), + EqualsProto(expected_get_result_proto)); + + // Delete all docs to test the information is correctly grouped. + SearchSpecProto search_spec; + search_spec.set_query("message"); + search_spec.set_term_match_type(TermMatchType::EXACT_ONLY); + DeleteByQueryResultProto result_proto = + icing.DeleteByQuery(search_spec, true); + EXPECT_THAT(result_proto.status(), ProtoIsOk()); + DeleteByQueryStatsProto exp_stats; + exp_stats.set_latency_ms(7); + exp_stats.set_num_documents_deleted(3); + exp_stats.set_query_length(search_spec.query().length()); + exp_stats.set_num_terms(1); + exp_stats.set_num_namespaces_filtered(0); + exp_stats.set_num_schema_types_filtered(0); + exp_stats.set_parse_query_latency_ms(7); + exp_stats.set_document_removal_latency_ms(7); + EXPECT_THAT(result_proto.delete_by_query_stats(), EqualsProto(exp_stats)); + + // Check that DeleteByQuery can return information for deleted documents. 
+ DeleteByQueryResultProto::DocumentGroupInfo info1, info2; + info1.set_namespace_("namespace1"); + info1.set_schema("Message"); + info1.add_uris("uri1"); + info2.set_namespace_("namespace2"); + info2.set_schema("Message"); + info2.add_uris("uri3"); + info2.add_uris("uri2"); + EXPECT_THAT(result_proto.deleted_documents(), + UnorderedElementsAre(EqualsProto(info1), EqualsProto(info2))); + + EXPECT_THAT( + icing.Get("namespace1", "uri1", GetResultSpecProto::default_instance()) + .status() + .code(), + Eq(StatusProto::NOT_FOUND)); + EXPECT_THAT( + icing.Get("namespace2", "uri2", GetResultSpecProto::default_instance()) + .status() + .code(), + Eq(StatusProto::NOT_FOUND)); + EXPECT_THAT( + icing.Get("namespace2", "uri3", GetResultSpecProto::default_instance()) + .status() + .code(), + Eq(StatusProto::NOT_FOUND)); +} + TEST_F(IcingSearchEngineTest, DeleteByQueryNotFound) { DocumentProto document1 = DocumentBuilder() @@ -8489,7 +8588,7 @@ TEST_F(IcingSearchEngineTest, MigrateToPortableFileBackedProtoLog) { EXPECT_THAT(init_result.initialize_stats().document_store_data_status(), Eq(InitializeStatsProto::NO_DATA_LOSS)); EXPECT_THAT(init_result.initialize_stats().document_store_recovery_cause(), - Eq(InitializeStatsProto::NONE)); + Eq(InitializeStatsProto::LEGACY_DOCUMENT_LOG_FORMAT)); EXPECT_THAT(init_result.initialize_stats().schema_store_recovery_cause(), Eq(InitializeStatsProto::NONE)); EXPECT_THAT(init_result.initialize_stats().index_restoration_cause(), diff --git a/icing/index/index-processor_benchmark.cc b/icing/index/index-processor_benchmark.cc index 6e072c7..1aad7d0 100644 --- a/icing/index/index-processor_benchmark.cc +++ b/icing/index/index-processor_benchmark.cc @@ -16,7 +16,6 @@ #include "gmock/gmock.h" #include "icing/document-builder.h" #include "icing/file/filesystem.h" -#include "icing/helpers/icu/icu-data-file-helper.h" #include "icing/index/index-processor.h" #include "icing/index/index.h" #include "icing/legacy/core/icing-string-util.h" @@ -24,6 +23,7 @@ #include "icing/schema/schema-util.h" #include "icing/schema/section-manager.h" #include "icing/testing/common-matchers.h" +#include "icing/testing/icu-data-file-helper.h" #include "icing/testing/test-data.h" #include "icing/testing/tmp-directory.h" #include "icing/tokenization/language-segmenter-factory.h" diff --git a/icing/index/index-processor_test.cc b/icing/index/index-processor_test.cc index d21a06f..bd310de 100644 --- a/icing/index/index-processor_test.cc +++ b/icing/index/index-processor_test.cc @@ -30,7 +30,6 @@ #include "icing/absl_ports/str_join.h" #include "icing/document-builder.h" #include "icing/file/filesystem.h" -#include "icing/helpers/icu/icu-data-file-helper.h" #include "icing/index/hit/doc-hit-info.h" #include "icing/index/index.h" #include "icing/index/iterator/doc-hit-info-iterator.h" @@ -49,6 +48,7 @@ #include "icing/store/document-id.h" #include "icing/testing/common-matchers.h" #include "icing/testing/fake-clock.h" +#include "icing/testing/icu-data-file-helper.h" #include "icing/testing/random-string.h" #include "icing/testing/test-data.h" #include "icing/testing/tmp-directory.h" diff --git a/icing/index/index.cc b/icing/index/index.cc index 984dc78..02ba699 100644 --- a/icing/index/index.cc +++ b/icing/index/index.cc @@ -119,7 +119,7 @@ std::vector<TermMetadata> MergeAndRankTermMetadatas( int total_est_hit_count = lite_term_itr->hit_count + main_term_itr->hit_count; PushToTermHeap(TermMetadata(std::move(lite_term_itr->content), - total_est_hit_count), + total_est_hit_count), num_to_return, 
diff --git a/icing/index/index-processor_benchmark.cc b/icing/index/index-processor_benchmark.cc
index 6e072c7..1aad7d0 100644
--- a/icing/index/index-processor_benchmark.cc
+++ b/icing/index/index-processor_benchmark.cc
@@ -16,7 +16,6 @@
 #include "gmock/gmock.h"
 #include "icing/document-builder.h"
 #include "icing/file/filesystem.h"
-#include "icing/helpers/icu/icu-data-file-helper.h"
 #include "icing/index/index-processor.h"
 #include "icing/index/index.h"
 #include "icing/legacy/core/icing-string-util.h"
@@ -24,6 +23,7 @@
 #include "icing/schema/schema-util.h"
 #include "icing/schema/section-manager.h"
 #include "icing/testing/common-matchers.h"
+#include "icing/testing/icu-data-file-helper.h"
 #include "icing/testing/test-data.h"
 #include "icing/testing/tmp-directory.h"
 #include "icing/tokenization/language-segmenter-factory.h"
diff --git a/icing/index/index-processor_test.cc b/icing/index/index-processor_test.cc
index d21a06f..bd310de 100644
--- a/icing/index/index-processor_test.cc
+++ b/icing/index/index-processor_test.cc
@@ -30,7 +30,6 @@
 #include "icing/absl_ports/str_join.h"
 #include "icing/document-builder.h"
 #include "icing/file/filesystem.h"
-#include "icing/helpers/icu/icu-data-file-helper.h"
 #include "icing/index/hit/doc-hit-info.h"
 #include "icing/index/index.h"
 #include "icing/index/iterator/doc-hit-info-iterator.h"
@@ -49,6 +48,7 @@
 #include "icing/store/document-id.h"
 #include "icing/testing/common-matchers.h"
 #include "icing/testing/fake-clock.h"
+#include "icing/testing/icu-data-file-helper.h"
 #include "icing/testing/random-string.h"
 #include "icing/testing/test-data.h"
 #include "icing/testing/tmp-directory.h"
diff --git a/icing/index/index.cc b/icing/index/index.cc
index 984dc78..02ba699 100644
--- a/icing/index/index.cc
+++ b/icing/index/index.cc
@@ -119,7 +119,7 @@ std::vector<TermMetadata> MergeAndRankTermMetadatas(
       int total_est_hit_count =
           lite_term_itr->hit_count + main_term_itr->hit_count;
       PushToTermHeap(TermMetadata(std::move(lite_term_itr->content),
-                                  total_est_hit_count),
+                                   total_est_hit_count),
                      num_to_return, merged_term_metadata_heap);
       ++lite_term_itr;
       ++main_term_itr;
@@ -259,11 +259,7 @@ Index::FindTermsByPrefix(const std::string& prefix, int num_to_return,
 IndexStorageInfoProto Index::GetStorageInfo() const {
   IndexStorageInfoProto storage_info;
   int64_t directory_size = filesystem_->GetDiskUsage(options_.base_dir.c_str());
-  if (directory_size != Filesystem::kBadFileSize) {
-    storage_info.set_index_size(directory_size);
-  } else {
-    storage_info.set_index_size(-1);
-  }
+  storage_info.set_index_size(Filesystem::SanitizeFileSize(directory_size));
   storage_info = lite_index_->GetStorageInfo(std::move(storage_info));
   return main_index_->GetStorageInfo(std::move(storage_info));
 }
diff --git a/icing/index/index.h b/icing/index/index.h
index 4aa9ba3..5c53349 100644
--- a/icing/index/index.h
+++ b/icing/index/index.h
@@ -32,6 +32,7 @@
 #include "icing/index/term-id-codec.h"
 #include "icing/index/term-metadata.h"
 #include "icing/legacy/index/icing-filesystem.h"
+#include "icing/proto/debug.pb.h"
 #include "icing/proto/storage.pb.h"
 #include "icing/proto/term.pb.h"
 #include "icing/schema/section.h"
@@ -143,9 +144,14 @@ class Index {
   //   index.
   // verbosity > 0, more detailed debug information including raw postings
   //   lists.
-  void GetDebugInfo(int verbosity, std::string* out) const {
-    lite_index_->GetDebugInfo(verbosity, out);
-    main_index_->GetDebugInfo(verbosity, out);
+  IndexDebugInfoProto GetDebugInfo(int verbosity) const {
+    IndexDebugInfoProto debug_info;
+    *debug_info.mutable_index_storage_info() = GetStorageInfo();
+    *debug_info.mutable_lite_index_info() =
+        lite_index_->GetDebugInfo(verbosity);
+    *debug_info.mutable_main_index_info() =
+        main_index_->GetDebugInfo(verbosity);
+    return debug_info;
   }
 
   // Returns the byte size of the all the elements held in the index. This
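Index::GetDebugInfo now composes a structured proto from the storage info and the two sub-indexes instead of appending to a string. A usage sketch, with the fields exercised by the reworked test below:

    IndexDebugInfoProto info = index->GetDebugInfo(/*verbosity=*/0);
    // info.index_storage_info()  -- sizes, same data as GetStorageInfo()
    // info.lite_index_info()     -- curr_size, last_added_document_id, ...
    // info.main_index_info()     -- lexicon_info, last_added_document_id
    // With verbosity > 0, main_index_info().flash_index_storage_info() is
    // also populated.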
diff --git a/icing/index/index_test.cc b/icing/index/index_test.cc
index cd64202..8355c01 100644
--- a/icing/index/index_test.cc
+++ b/icing/index/index_test.cc
@@ -31,6 +31,7 @@
 #include "icing/index/iterator/doc-hit-info-iterator.h"
 #include "icing/legacy/index/icing-filesystem.h"
 #include "icing/legacy/index/icing-mock-filesystem.h"
+#include "icing/proto/debug.pb.h"
 #include "icing/proto/storage.pb.h"
 #include "icing/proto/term.pb.h"
 #include "icing/schema/section.h"
@@ -1394,12 +1395,14 @@ TEST_F(IndexTest, GetDebugInfo) {
   EXPECT_THAT(edit.IndexAllBufferedTerms(), IsOk());
   edit = index_->Edit(kDocumentId1, kSectionId3, TermMatchType::PREFIX,
                       /*namespace_id=*/0);
+  index_->set_last_added_document_id(kDocumentId1);
   ASSERT_THAT(edit.BufferTerm("foot"), IsOk());
   EXPECT_THAT(edit.IndexAllBufferedTerms(), IsOk());
 
   ICING_ASSERT_OK(index_->Merge());
 
   edit = index_->Edit(kDocumentId2, kSectionId2, TermMatchType::EXACT_ONLY,
                       /*namespace_id=*/0);
+  index_->set_last_added_document_id(kDocumentId2);
   ASSERT_THAT(edit.BufferTerm("footer"), IsOk());
   EXPECT_THAT(edit.IndexAllBufferedTerms(), IsOk());
   edit = index_->Edit(kDocumentId2, kSectionId3, TermMatchType::PREFIX,
@@ -1407,40 +1410,45 @@ TEST_F(IndexTest, GetDebugInfo) {
   ASSERT_THAT(edit.BufferTerm("foo"), IsOk());
   EXPECT_THAT(edit.IndexAllBufferedTerms(), IsOk());
 
-  std::string out0;
-  index_->GetDebugInfo(/*verbosity=*/0, &out0);
-  EXPECT_THAT(out0, Not(IsEmpty()));
+  IndexDebugInfoProto out0 = index_->GetDebugInfo(/*verbosity=*/0);
+  EXPECT_FALSE(out0.main_index_info().has_flash_index_storage_info());
+  EXPECT_THAT(out0.main_index_info().last_added_document_id(),
+              Eq(kDocumentId1));
+  EXPECT_THAT(out0.lite_index_info().curr_size(), Eq(2));
+  EXPECT_THAT(out0.lite_index_info().last_added_document_id(),
+              Eq(kDocumentId2));
 
-  std::string out1;
-  index_->GetDebugInfo(/*verbosity=*/1, &out1);
-  EXPECT_THAT(out1, SizeIs(Gt(out0.size())));
+  IndexDebugInfoProto out1 = index_->GetDebugInfo(/*verbosity=*/1);
+  EXPECT_THAT(out1.main_index_info().flash_index_storage_info(),
+              Not(IsEmpty()));
 
   // Add one more doc to the lite index. Debug strings should change.
   edit = index_->Edit(kDocumentId3, kSectionId2, TermMatchType::EXACT_ONLY,
                       /*namespace_id=*/0);
+  index_->set_last_added_document_id(kDocumentId3);
   ASSERT_THAT(edit.BufferTerm("far"), IsOk());
   EXPECT_THAT(edit.IndexAllBufferedTerms(), IsOk());
 
-  std::string out2;
-  index_->GetDebugInfo(/*verbosity=*/0, &out2);
-  EXPECT_THAT(out2, Ne(out0));
-
-  std::string out3;
-  index_->GetDebugInfo(/*verbosity=*/1, &out3);
-  EXPECT_THAT(out3, Ne(out1));
+  IndexDebugInfoProto out2 = index_->GetDebugInfo(/*verbosity=*/0);
+  EXPECT_THAT(out2.lite_index_info().curr_size(), Eq(3));
+  EXPECT_THAT(out2.lite_index_info().last_added_document_id(),
+              Eq(kDocumentId3));
 
   // Merge into the main index. Debug strings should change again.
   ICING_ASSERT_OK(index_->Merge());
 
-  std::string out4;
-  index_->GetDebugInfo(/*verbosity=*/0, &out4);
-  EXPECT_THAT(out4, Ne(out0));
-  EXPECT_THAT(out4, Ne(out2));
-
-  std::string out5;
-  index_->GetDebugInfo(/*verbosity=*/1, &out5);
-  EXPECT_THAT(out5, Ne(out1));
-  EXPECT_THAT(out5, Ne(out3));
+  IndexDebugInfoProto out3 = index_->GetDebugInfo(/*verbosity=*/0);
+  EXPECT_TRUE(out3.has_index_storage_info());
+  EXPECT_THAT(out3.main_index_info().lexicon_info(), Not(IsEmpty()));
+  EXPECT_THAT(out3.main_index_info().last_added_document_id(),
+              Eq(kDocumentId3));
+  EXPECT_THAT(out3.lite_index_info().curr_size(), Eq(0));
+  EXPECT_THAT(out3.lite_index_info().hit_buffer_size(), Gt(0));
+  EXPECT_THAT(out3.lite_index_info().last_added_document_id(),
+              Eq(kInvalidDocumentId));
+  EXPECT_THAT(out3.lite_index_info().searchable_end(), Eq(0));
+  EXPECT_THAT(out3.lite_index_info().index_crc(), Gt(0));
+  EXPECT_THAT(out3.lite_index_info().lexicon_info(), Not(IsEmpty()));
 }
 
 TEST_F(IndexTest, BackfillingMultipleTermsSucceeds) {
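The LiteIndex::AppendHits fix below addresses a skip that was not sticky: the old code decided whether to filter a document only on its first hit, but every later hit for that same document still reached hits_out->back().UpdateSection(...), mutating whichever document happened to be last in hits_out. The condensed shape of the fix, taken from the hunks that follow:

    if (document_id != last_document_id) {
      last_document_id = document_id;
      // Remember the decision across all of this document's hits.
      last_document_in_namespace =
          namespace_checker == nullptr ||
          namespace_checker->BelongsToTargetNamespaces(document_id);
      if (!last_document_in_namespace) {
        continue;  // skips only this hit; later hits need the flag below
      }
      ...
    }
    if (hits_out != nullptr && last_document_in_namespace) {
      hits_out->back().UpdateSection(hit.section_id(), hit.term_frequency());
    }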
- out->append("Lexicon stats:\n"); - lexicon_.GetDebugInfo(verbosity, out); +IndexDebugInfoProto::LiteIndexDebugInfoProto LiteIndex::GetDebugInfo( + int verbosity) { + IndexDebugInfoProto::LiteIndexDebugInfoProto res; + res.set_curr_size(header_->cur_size()); + res.set_hit_buffer_size(options_.hit_buffer_size); + res.set_last_added_document_id(header_->last_added_docid()); + res.set_searchable_end(header_->searchable_end()); + res.set_index_crc(ComputeChecksum().Get()); + lexicon_.GetDebugInfo(verbosity, res.mutable_lexicon_info()); + return res; } libtextclassifier3::StatusOr<int64_t> LiteIndex::GetElementsSize() const { @@ -417,12 +421,8 @@ IndexStorageInfoProto LiteIndex::GetStorageInfo( IndexStorageInfoProto storage_info) const { int64_t header_and_hit_buffer_file_size = filesystem_->GetFileSize(hit_buffer_fd_.get()); - if (header_and_hit_buffer_file_size != Filesystem::kBadFileSize) { - storage_info.set_lite_index_hit_buffer_size( - header_and_hit_buffer_file_size); - } else { - storage_info.set_lite_index_hit_buffer_size(-1); - } + storage_info.set_lite_index_hit_buffer_size( + IcingFilesystem::SanitizeFileSize(header_and_hit_buffer_file_size)); int64_t lexicon_disk_usage = lexicon_.GetElementsSize(); if (lexicon_disk_usage != Filesystem::kBadFileSize) { storage_info.set_lite_index_lexicon_size(lexicon_disk_usage); diff --git a/icing/index/lite/lite-index.h b/icing/index/lite/lite-index.h index 890980c..378fc94 100644 --- a/icing/index/lite/lite-index.h +++ b/icing/index/lite/lite-index.h @@ -37,6 +37,7 @@ #include "icing/legacy/index/icing-lite-index-header.h" #include "icing/legacy/index/icing-lite-index-options.h" #include "icing/legacy/index/icing-mmapper.h" +#include "icing/proto/debug.pb.h" #include "icing/proto/storage.pb.h" #include "icing/proto/term.pb.h" #include "icing/schema/section.h" @@ -241,7 +242,7 @@ class LiteIndex { // Returns debug information for the index in out. // verbosity <= 0, simplest debug information - size of lexicon, hit buffer // verbosity > 0, more detailed debug information from the lexicon. - void GetDebugInfo(int verbosity, std::string* out) const; + IndexDebugInfoProto::LiteIndexDebugInfoProto GetDebugInfo(int verbosity); // Returns the byte size of all the elements held in the index. This excludes // the size of any internal metadata of the index, e.g. the index's header. diff --git a/icing/index/lite/lite-index_test.cc b/icing/index/lite/lite-index_test.cc new file mode 100644 index 0000000..825f830 --- /dev/null +++ b/icing/index/lite/lite-index_test.cc @@ -0,0 +1,110 @@ +// Copyright (C) 2019 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "icing/index/lite/lite-index.h" + +#include <vector> + +#include "gmock/gmock.h" +#include "gtest/gtest.h" +#include "icing/index/term-id-codec.h" +#include "icing/legacy/index/icing-mock-filesystem.h" +#include "icing/schema/section.h" +#include "icing/store/namespace-checker.h" +#include "icing/testing/common-matchers.h" +#include "icing/testing/tmp-directory.h" + +namespace icing { +namespace lib { + +namespace { + +using ::testing::Eq; +using ::testing::IsEmpty; +using ::testing::SizeIs; + +class AlwaysFalseNamespaceCheckerImpl : public NamespaceChecker { + public: + bool BelongsToTargetNamespaces(DocumentId document_id) const override { + return false; + } +}; + +class LiteIndexTest : public testing::Test { + protected: + void SetUp() override { + index_dir_ = GetTestTempDir() + "/test_dir"; + ASSERT_TRUE(filesystem_.CreateDirectoryRecursively(index_dir_.c_str())); + + std::string lite_index_file_name = index_dir_ + "/test_file.lite-idx.index"; + LiteIndex::Options options(lite_index_file_name, + /*hit_buffer_want_merge_bytes=*/1024 * 1024); + ICING_ASSERT_OK_AND_ASSIGN(lite_index_, + LiteIndex::Create(options, &icing_filesystem_)); + + ICING_ASSERT_OK_AND_ASSIGN( + term_id_codec_, + TermIdCodec::Create( + IcingDynamicTrie::max_value_index(IcingDynamicTrie::Options()), + IcingDynamicTrie::max_value_index(options.lexicon_options))); + } + + void TearDown() override { + ASSERT_TRUE(filesystem_.DeleteDirectoryRecursively(index_dir_.c_str())); + } + + std::string index_dir_; + Filesystem filesystem_; + IcingFilesystem icing_filesystem_; + std::unique_ptr<LiteIndex> lite_index_; + std::unique_ptr<TermIdCodec> term_id_codec_; +}; + +constexpr NamespaceId kNamespace0 = 0; + +TEST_F(LiteIndexTest, LiteIndexAppendHits) { + ICING_ASSERT_OK_AND_ASSIGN( + uint32_t tvi, + lite_index_->InsertTerm("foo", TermMatchType::PREFIX, kNamespace0)); + ICING_ASSERT_OK_AND_ASSIGN(uint32_t foo_term_id, + term_id_codec_->EncodeTvi(tvi, TviType::LITE)); + Hit doc_hit0(/*section_id=*/0, /*document_id=*/0, Hit::kDefaultTermFrequency, + /*is_in_prefix_section=*/false); + Hit doc_hit1(/*section_id=*/1, /*document_id=*/0, Hit::kDefaultTermFrequency, + /*is_in_prefix_section=*/false); + ICING_ASSERT_OK(lite_index_->AddHit(foo_term_id, doc_hit0)); + ICING_ASSERT_OK(lite_index_->AddHit(foo_term_id, doc_hit1)); + + std::vector<DocHitInfo> hits1; + lite_index_->AppendHits(foo_term_id, kSectionIdMaskAll, + /*only_from_prefix_sections=*/false, + /*namespace_checker=*/nullptr, &hits1); + EXPECT_THAT(hits1, SizeIs(1)); + EXPECT_THAT(hits1.back().document_id(), Eq(0)); + // Check that the hits are coming from section 0 and section 1. + EXPECT_THAT(hits1.back().hit_section_ids_mask(), Eq(0b11)); + + std::vector<DocHitInfo> hits2; + AlwaysFalseNamespaceCheckerImpl always_false_namespace_checker; + lite_index_->AppendHits(foo_term_id, kSectionIdMaskAll, + /*only_from_prefix_sections=*/false, + &always_false_namespace_checker, &hits2); + // Check that no hits are returned because they get skipped by the namespace + // checker. + EXPECT_THAT(hits2, IsEmpty()); +} + +} // namespace +} // namespace lib +} // namespace icing diff --git a/icing/index/main/flash-index-storage.h b/icing/index/main/flash-index-storage.h index 8d5b50b..6c6fbb8 100644 --- a/icing/index/main/flash-index-storage.h +++ b/icing/index/main/flash-index-storage.h @@ -159,6 +159,7 @@ class FlashIndexStorage { libtextclassifier3::Status Reset(); + // TODO(b/222349894) Convert the string output to a protocol buffer instead. 
diff --git a/icing/index/main/flash-index-storage.h b/icing/index/main/flash-index-storage.h
index 8d5b50b..6c6fbb8 100644
--- a/icing/index/main/flash-index-storage.h
+++ b/icing/index/main/flash-index-storage.h
@@ -159,6 +159,7 @@ class FlashIndexStorage {
 
   libtextclassifier3::Status Reset();
 
+  // TODO(b/222349894) Convert the string output to a protocol buffer instead.
   void GetDebugInfo(int verbosity, std::string* out) const;
 
  private:
diff --git a/icing/index/main/main-index.cc b/icing/index/main/main-index.cc
index c104dd3..2d6007b 100644
--- a/icing/index/main/main-index.cc
+++ b/icing/index/main/main-index.cc
@@ -133,18 +133,10 @@ libtextclassifier3::StatusOr<int64_t> MainIndex::GetElementsSize() const {
 
 IndexStorageInfoProto MainIndex::GetStorageInfo(
     IndexStorageInfoProto storage_info) const {
-  int64_t lexicon_elt_size = main_lexicon_->GetElementsSize();
-  if (lexicon_elt_size != IcingFilesystem::kBadFileSize) {
-    storage_info.set_main_index_lexicon_size(lexicon_elt_size);
-  } else {
-    storage_info.set_main_index_lexicon_size(-1);
-  }
-  int64_t index_elt_size = flash_index_storage_->GetElementsSize();
-  if (lexicon_elt_size != IcingFilesystem::kBadFileSize) {
-    storage_info.set_main_index_storage_size(index_elt_size);
-  } else {
-    storage_info.set_main_index_storage_size(-1);
-  }
+  storage_info.set_main_index_lexicon_size(
+      IcingFilesystem::SanitizeFileSize(main_lexicon_->GetElementsSize()));
+  storage_info.set_main_index_storage_size(
+      Filesystem::SanitizeFileSize(flash_index_storage_->GetElementsSize()));
   storage_info.set_main_index_block_size(flash_index_storage_->block_size());
   storage_info.set_num_blocks(flash_index_storage_->num_blocks());
   storage_info.set_min_free_fraction(flash_index_storage_->min_free_fraction());
@@ -615,16 +607,22 @@ libtextclassifier3::Status MainIndex::AddPrefixBackfillHits(
   return libtextclassifier3::Status::OK;
 }
 
-void MainIndex::GetDebugInfo(int verbosity, std::string* out) const {
+IndexDebugInfoProto::MainIndexDebugInfoProto MainIndex::GetDebugInfo(
+    int verbosity) const {
+  IndexDebugInfoProto::MainIndexDebugInfoProto res;
+
   // Lexicon.
-  out->append("Main Lexicon stats:\n");
-  main_lexicon_->GetDebugInfo(verbosity, out);
+  main_lexicon_->GetDebugInfo(verbosity, res.mutable_lexicon_info());
+
+  res.set_last_added_document_id(last_added_document_id());
 
   if (verbosity <= 0) {
-    return;
+    return res;
   }
 
-  flash_index_storage_->GetDebugInfo(verbosity, out);
+  flash_index_storage_->GetDebugInfo(verbosity,
+                                     res.mutable_flash_index_storage_info());
+  return res;
 }
 
 }  // namespace lib
diff --git a/icing/index/main/main-index.h b/icing/index/main/main-index.h
index e4073cb..abb0418 100644
--- a/icing/index/main/main-index.h
+++ b/icing/index/main/main-index.h
@@ -27,6 +27,7 @@
 #include "icing/index/term-metadata.h"
 #include "icing/legacy/index/icing-dynamic-trie.h"
 #include "icing/legacy/index/icing-filesystem.h"
+#include "icing/proto/debug.pb.h"
 #include "icing/proto/storage.pb.h"
 #include "icing/store/namespace-checker.h"
 #include "icing/store/namespace-id.h"
@@ -185,7 +186,8 @@ class MainIndex {
   // verbosity <= 0, simplest debug information - just the lexicon
   // verbosity > 0, more detailed debug information including raw postings
   //   lists.
-  void GetDebugInfo(int verbosity, std::string* out) const;
+  IndexDebugInfoProto::MainIndexDebugInfoProto GetDebugInfo(
+      int verbosity) const;
 
  private:
   libtextclassifier3::Status Init(const std::string& index_directory,
diff --git a/icing/legacy/index/icing-dynamic-trie.cc b/icing/legacy/index/icing-dynamic-trie.cc
index baa043a..77876c4 100644
--- a/icing/legacy/index/icing-dynamic-trie.cc
+++ b/icing/legacy/index/icing-dynamic-trie.cc
@@ -70,6 +70,7 @@
 #include <algorithm>
 #include <cerrno>
 #include <cinttypes>
+#include <cstdint>
 #include <cstring>
 #include <memory>
 #include <utility>
@@ -397,6 +398,8 @@ class IcingDynamicTrie::IcingDynamicTrieStorage {
   // storage.
   IcingScopedFd array_fds_[NUM_ARRAY_TYPES];
   std::vector<IcingArrayStorage> array_storage_;
+
+  // Legacy file system. Switch to use the new Filesystem class instead.
   const IcingFilesystem *filesystem_;
 };
@@ -1364,10 +1367,12 @@ uint32_t IcingDynamicTrie::size() const {
   return storage_->hdr().num_keys();
 }
 
-void IcingDynamicTrie::CollectStatsRecursive(const Node &node,
-                                             Stats *stats) const {
+void IcingDynamicTrie::CollectStatsRecursive(const Node &node, Stats *stats,
+                                             uint32_t depth) const {
   if (node.is_leaf()) {
     stats->num_leaves++;
+    stats->sum_depth += depth;
+    stats->max_depth = max(stats->max_depth, depth);
     const char *suffix = storage_->GetSuffix(node.next_index());
     stats->suffixes_used += strlen(suffix) + 1 + value_size();
     if (!suffix[0]) {
@@ -1379,13 +1384,16 @@ void IcingDynamicTrie::CollectStatsRecursive(const Node &node,
     for (; i < (1U << node.log2_num_children()); i++) {
       const Next &next = *storage_->GetNext(node.next_index(), i);
       if (next.node_index() == kInvalidNodeIndex) break;
-      CollectStatsRecursive(*storage_->GetNode(next.node_index()), stats);
+      CollectStatsRecursive(*storage_->GetNode(next.node_index()), stats,
+                            depth + 1);
     }
 
     // At least one valid node in each next array
     if (i == 0) {
       ICING_LOG(FATAL) << "No valid node in 'next' array";
     }
+    stats->sum_children += i;
+    stats->max_children = max(stats->max_children, i);
 
     stats->child_counts[i - 1]++;
 
     stats->wasted[node.log2_num_children()] +=
@@ -1467,9 +1475,12 @@ std::string IcingDynamicTrie::Stats::DumpStats(int verbosity) const {
       "Wasted total: %u\n"
       "Num intermediates %u num leaves %u "
       "suffixes used %u null %u\n"
+      "avg and max children for intermediates: %.3f, %u\n"
+      "avg and max depth for leaves: %.3f, %u\n"
       "Total next frag: %.3f%%\n",
       total_wasted, num_intermediates, num_leaves, suffixes_used,
-      null_suffixes,
+      null_suffixes, 1. * sum_children / num_intermediates, max_children,
+      1. * sum_depth / num_leaves, max_depth,
       100. * math_util::SafeDivide((total_free + total_wasted), num_nexts));
   }
   IcingStringUtil::SStringAppendF(
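The two averages DumpStats now prints come straight from the counters CollectStatsRecursive accumulates, with the `1. *` promotion keeping the division in floating point to match the %.3f format specifiers. A worked example: a root with three children that are all leaves at depth 1:

    // num_intermediates = 1, sum_children = 3, max_children = 3
    // num_leaves = 3, sum_depth = 3, max_depth = 1
    double avg_children = 1. * stats.sum_children / stats.num_intermediates;  // 3.000
    double avg_depth = 1. * stats.sum_depth / stats.num_leaves;               // 1.000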
diff --git a/icing/legacy/index/icing-dynamic-trie.h b/icing/legacy/index/icing-dynamic-trie.h
index 8821799..013b926 100644
--- a/icing/legacy/index/icing-dynamic-trie.h
+++ b/icing/legacy/index/icing-dynamic-trie.h
@@ -152,8 +152,13 @@ class IcingDynamicTrie : public IIcingStorage {
     uint32_t max_nodes;
 
     // Count of intermediate nodes.
     uint32_t num_intermediates;
+    // Total and maximum number of children of intermediate nodes.
+    uint32_t sum_children, max_children;
+
     // Count of leaf nodes.
     uint32_t num_leaves;
+    // Total and maximum depth of leaf nodes.
+    uint32_t sum_depth, max_depth;
 
     // Next stats
@@ -186,6 +191,7 @@ class IcingDynamicTrie : public IIcingStorage {
     uint32_t dirty_pages_nexts;
     uint32_t dirty_pages_suffixes;
 
+    // TODO(b/222349894) Convert the string output to a protocol buffer instead.
     std::string DumpStats(int verbosity) const;
   };
 
@@ -601,7 +607,8 @@ class IcingDynamicTrie : public IIcingStorage {
   static const uint32_t kInvalidSuffixIndex;
 
   // Stats helpers.
-  void CollectStatsRecursive(const Node &node, Stats *stats) const;
+  void CollectStatsRecursive(const Node &node, Stats *stats,
+                             uint32_t depth = 0) const;
 
   // Helpers for Find and Insert.
   const Next *GetNextByChar(const Node *node, uint8_t key_char) const;
diff --git a/icing/legacy/index/icing-filesystem.h b/icing/legacy/index/icing-filesystem.h
index f645632..ce75a82 100644
--- a/icing/legacy/index/icing-filesystem.h
+++ b/icing/legacy/index/icing-filesystem.h
@@ -224,6 +224,11 @@ class IcingFilesystem {
   // Increments to_increment by size if size is valid, or sets to_increment
   // to kBadFileSize if either size or to_increment is kBadFileSize.
   static void IncrementByOrSetInvalid(uint64_t size, uint64_t *to_increment);
+
+  // Return -1 if file_size is invalid. Otherwise, return file_size.
+  static int64_t SanitizeFileSize(int64_t file_size) {
+    return (file_size != kBadFileSize) ? file_size : -1;
+  }
 };
 
 }  // namespace lib
diff --git a/icing/legacy/index/icing-flash-bitmap.h b/icing/legacy/index/icing-flash-bitmap.h
index e3ba0e2..6bb9591 100644
--- a/icing/legacy/index/icing-flash-bitmap.h
+++ b/icing/legacy/index/icing-flash-bitmap.h
@@ -138,6 +138,7 @@ class IcingFlashBitmap {
   // Upgrade for version 18.
   bool UpgradeTo18();
 
+  // Legacy file system. Switch to use the new Filesystem class instead.
   const IcingFilesystem *const filesystem_;
   std::string filename_;
   OpenType open_type_;
diff --git a/icing/query/query-processor_benchmark.cc b/icing/query/query-processor_benchmark.cc
index bdd40aa..e48fe78 100644
--- a/icing/query/query-processor_benchmark.cc
+++ b/icing/query/query-processor_benchmark.cc
@@ -16,7 +16,6 @@
 #include "gmock/gmock.h"
 #include "third_party/absl/flags/flag.h"
 #include "icing/document-builder.h"
-#include "icing/helpers/icu/icu-data-file-helper.h"
 #include "icing/index/index.h"
 #include "icing/proto/term.pb.h"
 #include "icing/query/query-processor.h"
@@ -24,6 +23,7 @@
 #include "icing/schema/section.h"
 #include "icing/store/document-id.h"
 #include "icing/testing/common-matchers.h"
+#include "icing/testing/icu-data-file-helper.h"
 #include "icing/testing/test-data.h"
 #include "icing/testing/tmp-directory.h"
 #include "icing/tokenization/language-segmenter-factory.h"
diff --git a/icing/query/query-processor_test.cc b/icing/query/query-processor_test.cc
index ef12b10..950f739 100644
--- a/icing/query/query-processor_test.cc
+++ b/icing/query/query-processor_test.cc
@@ -23,7 +23,6 @@
 #include "gtest/gtest.h"
 #include "icing/document-builder.h"
 #include "icing/file/filesystem.h"
-#include "icing/helpers/icu/icu-data-file-helper.h"
 #include "icing/index/hit/doc-hit-info.h"
 #include "icing/index/index.h"
 #include "icing/index/iterator/doc-hit-info-iterator-test-util.h"
@@ -40,6 +39,7 @@
 #include "icing/store/document-store.h"
 #include "icing/testing/common-matchers.h"
 #include "icing/testing/fake-clock.h"
+#include "icing/testing/icu-data-file-helper.h"
 #include "icing/testing/jni-test-helpers.h"
 #include "icing/testing/test-data.h"
 #include "icing/testing/tmp-directory.h"
diff --git a/icing/query/suggestion-processor_test.cc b/icing/query/suggestion-processor_test.cc
index 2f22a47..ba4c90a 100644
--- a/icing/query/suggestion-processor_test.cc
+++ b/icing/query/suggestion-processor_test.cc
@@ -15,11 +15,11 @@
 #include "icing/query/suggestion-processor.h"
 
 #include "gmock/gmock.h"
-#include "icing/helpers/icu/icu-data-file-helper.h"
 #include "icing/store/document-store.h"
 #include "icing/testing/always-true-namespace-checker-impl.h"
 #include "icing/testing/common-matchers.h"
 #include "icing/testing/fake-clock.h"
+#include "icing/testing/icu-data-file-helper.h"
 #include "icing/testing/jni-test-helpers.h"
 #include "icing/testing/test-data.h"
 #include "icing/testing/tmp-directory.h"
"icing/testing/tmp-directory.h" diff --git a/icing/result/result-retriever_test.cc b/icing/result/result-retriever_test.cc index 2695433..0d812e4 100644 --- a/icing/result/result-retriever_test.cc +++ b/icing/result/result-retriever_test.cc @@ -22,7 +22,6 @@ #include "gtest/gtest.h" #include "icing/document-builder.h" #include "icing/file/mock-filesystem.h" -#include "icing/helpers/icu/icu-data-file-helper.h" #include "icing/portable/equals-proto.h" #include "icing/portable/platform.h" #include "icing/proto/document.pb.h" @@ -36,6 +35,7 @@ #include "icing/store/document-id.h" #include "icing/testing/common-matchers.h" #include "icing/testing/fake-clock.h" +#include "icing/testing/icu-data-file-helper.h" #include "icing/testing/snippet-helpers.h" #include "icing/testing/test-data.h" #include "icing/testing/tmp-directory.h" @@ -362,8 +362,8 @@ TEST_F(ResultRetrieverTest, NotIgnoreErrors) { TEST_F(ResultRetrieverTest, IOErrorShouldReturnInternalError) { MockFilesystem mock_filesystem; - ON_CALL(mock_filesystem, OpenForRead(_)).WillByDefault(Return(false)); - + ON_CALL(mock_filesystem, PRead(A<int>(), A<void*>(), A<size_t>(), A<off_t>())) + .WillByDefault(Return(false)); ICING_ASSERT_OK_AND_ASSIGN( DocumentStore::CreateResult create_result, DocumentStore::Create(&mock_filesystem, test_dir_, &fake_clock_, diff --git a/icing/result/snippet-retriever.cc b/icing/result/snippet-retriever.cc index 6af7017..bd1524e 100644 --- a/icing/result/snippet-retriever.cc +++ b/icing/result/snippet-retriever.cc @@ -78,26 +78,25 @@ inline std::string AddIndexToPath(int values_size, int index, // Returns a string of the normalized text of the input Token. Normalization // is applied based on the Token's type. -std::string NormalizeToken(const Normalizer& normalizer, - const Token& token) { +std::string NormalizeToken(const Normalizer& normalizer, const Token& token) { switch (token.type) { - case Token::REGULAR: + case Token::Type::REGULAR: return normalizer.NormalizeTerm(token.text); - case Token::VERBATIM: + case Token::Type::VERBATIM: return std::string(token.text); - case Token::QUERY_EXCLUSION: + case Token::Type::QUERY_EXCLUSION: [[fallthrough]]; - case Token::QUERY_LEFT_PARENTHESES: + case Token::Type::QUERY_LEFT_PARENTHESES: [[fallthrough]]; - case Token::QUERY_RIGHT_PARENTHESES: + case Token::Type::QUERY_RIGHT_PARENTHESES: [[fallthrough]]; - case Token::QUERY_OR: + case Token::Type::QUERY_OR: [[fallthrough]]; - case Token::QUERY_PROPERTY: + case Token::Type::QUERY_PROPERTY: [[fallthrough]]; - case Token::INVALID: + case Token::Type::INVALID: ICING_LOG(WARNING) << "Unable to normalize token of type: " - << token.type; + << static_cast<int>(token.type); return std::string(token.text); } } @@ -107,7 +106,7 @@ std::string NormalizeToken(const Normalizer& normalizer, CharacterIterator FindMatchEnd(const Normalizer& normalizer, const Token& token, const std::string& match_query_term) { switch (token.type) { - case Token::VERBATIM: { + case Token::Type::VERBATIM: { // VERBATIM tokens are not normalized. This means the non-normalized // matched query term must be either equal to or a prefix of the token's // text. 
Therefore, the match must end at the end of the matched query @@ -117,22 +116,22 @@ CharacterIterator FindMatchEnd(const Normalizer& normalizer, const Token& token, verbatim_match_end.AdvanceToUtf8(match_query_term.length()); return verbatim_match_end; } - case Token::QUERY_EXCLUSION: + case Token::Type::QUERY_EXCLUSION: [[fallthrough]]; - case Token::QUERY_LEFT_PARENTHESES: + case Token::Type::QUERY_LEFT_PARENTHESES: [[fallthrough]]; - case Token::QUERY_RIGHT_PARENTHESES: + case Token::Type::QUERY_RIGHT_PARENTHESES: [[fallthrough]]; - case Token::QUERY_OR: + case Token::Type::QUERY_OR: [[fallthrough]]; - case Token::QUERY_PROPERTY: + case Token::Type::QUERY_PROPERTY: [[fallthrough]]; - case Token::INVALID: + case Token::Type::INVALID: ICING_LOG(WARNING) - << "Unexpected Token type " << token.type + << "Unexpected Token type " << static_cast<int>(token.type) << " found when finding match end of query term and token."; [[fallthrough]]; - case Token::REGULAR: + case Token::Type::REGULAR: return normalizer.FindNormalizedMatchEndPosition(token.text, match_query_term); } diff --git a/icing/result/snippet-retriever_test.cc b/icing/result/snippet-retriever_test.cc index 365eba2..0de2295 100644 --- a/icing/result/snippet-retriever_test.cc +++ b/icing/result/snippet-retriever_test.cc @@ -22,7 +22,6 @@ #include "gtest/gtest.h" #include "icing/document-builder.h" #include "icing/file/mock-filesystem.h" -#include "icing/helpers/icu/icu-data-file-helper.h" #include "icing/portable/equals-proto.h" #include "icing/portable/platform.h" #include "icing/proto/document.pb.h" @@ -37,6 +36,7 @@ #include "icing/store/key-mapper.h" #include "icing/testing/common-matchers.h" #include "icing/testing/fake-clock.h" +#include "icing/testing/icu-data-file-helper.h" #include "icing/testing/jni-test-helpers.h" #include "icing/testing/snippet-helpers.h" #include "icing/testing/test-data.h" diff --git a/icing/schema/schema-store.cc b/icing/schema/schema-store.cc index 67528ab..b5c976f 100644 --- a/icing/schema/schema-store.cc +++ b/icing/schema/schema-store.cc @@ -268,7 +268,7 @@ libtextclassifier3::Status SchemaStore::UpdateHeader(const Crc32& checksum) { libtextclassifier3::Status SchemaStore::ResetSchemaTypeMapper() { // TODO(b/139734457): Replace ptr.reset()->Delete->Create flow with Reset(). schema_type_mapper_.reset(); - // TODO(b/144458732): Implement a more robust version of TC_RETURN_IF_ERROR + // TODO(b/216487496): Implement a more robust version of TC_RETURN_IF_ERROR // that can support error logging. 
libtextclassifier3::Status status = KeyMapper<SchemaTypeId>::Delete( filesystem_, MakeSchemaTypeMapperFilename(base_dir_)); @@ -464,11 +464,8 @@ libtextclassifier3::Status SchemaStore::PersistToDisk() { SchemaStoreStorageInfoProto SchemaStore::GetStorageInfo() const { SchemaStoreStorageInfoProto storage_info; int64_t directory_size = filesystem_.GetDiskUsage(base_dir_.c_str()); - if (directory_size != Filesystem::kBadFileSize) { - storage_info.set_schema_store_size(directory_size); - } else { - storage_info.set_schema_store_size(-1); - } + storage_info.set_schema_store_size( + Filesystem::SanitizeFileSize(directory_size)); ICING_ASSIGN_OR_RETURN(const SchemaProto* schema, GetSchema(), storage_info); storage_info.set_num_schema_types(schema->types_size()); int total_sections = 0; diff --git a/icing/store/document-log-creator.cc b/icing/store/document-log-creator.cc index 5e0426e..5e23a8e 100644 --- a/icing/store/document-log-creator.cc +++ b/icing/store/document-log-creator.cc @@ -72,19 +72,20 @@ DocumentLogCreator::Create(const Filesystem* filesystem, bool v1_exists = filesystem->FileExists(MakeDocumentLogFilenameV1(base_dir).c_str()); - bool regen_derived_files = false; + bool new_file = false; + int preexisting_file_version = kCurrentVersion; if (v0_exists && !v1_exists) { ICING_RETURN_IF_ERROR(MigrateFromV0ToV1(filesystem, base_dir)); // Need to regenerate derived files since documents may be written to a // different file offset in the log. - regen_derived_files = true; + preexisting_file_version = 0; } else if (!v1_exists) { // First time initializing a v1 log. There are no existing derived files at // this point, so we should generate some. "regenerate" here also means // "generate for the first time", i.e. we shouldn't expect there to be any // existing derived files. - regen_derived_files = true; + new_file = true; } ICING_ASSIGN_OR_RETURN( @@ -96,7 +97,7 @@ DocumentLogCreator::Create(const Filesystem* filesystem, /*compress_in=*/true))); CreateResult create_result = {std::move(log_create_result), - regen_derived_files}; + preexisting_file_version, new_file}; return create_result; } diff --git a/icing/store/document-log-creator.h b/icing/store/document-log-creator.h index 51cf497..be8feed 100644 --- a/icing/store/document-log-creator.h +++ b/icing/store/document-log-creator.h @@ -30,14 +30,20 @@ namespace lib { // be necessary. class DocumentLogCreator { public: + // Version 0 refers to FileBackedProtoLog + // Version 1 refers to PortableFileBackedProtoLog with kFileFormatVersion = 0 + static constexpr int32_t kCurrentVersion = 1; struct CreateResult { // The create result passed up from the PortableFileBackedProtoLog::Create. // Contains the document log. PortableFileBackedProtoLog<DocumentWrapper>::CreateResult log_create_result; - // Whether the caller needs to also regenerate/generate any derived files - // based off of the initialized document log. - bool regen_derived_files; + // The version number of the pre-existing document log file. + // If there is no document log file, it will be set to kCurrentVersion. + int preexisting_file_version; + + // Whether the created file is new. + bool new_file; }; // Creates the document log in the base_dir. 
Will create one if it doesn't diff --git a/icing/store/document-store.cc b/icing/store/document-store.cc index 226a96b..a2ae5f9 100644 --- a/icing/store/document-store.cc +++ b/icing/store/document-store.cc @@ -164,6 +164,32 @@ int64_t CalculateExpirationTimestampMs(int64_t creation_timestamp_ms, return expiration_timestamp_ms; } +InitializeStatsProto::RecoveryCause GetRecoveryCause( + const DocumentLogCreator::CreateResult& create_result, + bool force_recovery_and_revalidate_documents) { + if (force_recovery_and_revalidate_documents) { + return InitializeStatsProto::SCHEMA_CHANGES_OUT_OF_SYNC; + } else if (create_result.log_create_result.has_data_loss()) { + return InitializeStatsProto::DATA_LOSS; + } else if (create_result.preexisting_file_version != + DocumentLogCreator::kCurrentVersion) { + return InitializeStatsProto::LEGACY_DOCUMENT_LOG_FORMAT; + } + return InitializeStatsProto::NONE; +} + +InitializeStatsProto::DocumentStoreDataStatus GetDataStatus( + DataLoss data_loss) { + switch (data_loss) { + case DataLoss::PARTIAL: + return InitializeStatsProto::PARTIAL_LOSS; + case DataLoss::COMPLETE: + return InitializeStatsProto::COMPLETE_LOSS; + case DataLoss::NONE: + return InitializeStatsProto::NO_DATA_LOSS; + } +} + } // namespace DocumentStore::DocumentStore(const Filesystem* filesystem, @@ -236,44 +262,34 @@ libtextclassifier3::StatusOr<DataLoss> DocumentStore::Initialize( std::move(create_result_or).ValueOrDie(); document_log_ = std::move(create_result.log_create_result.proto_log); - - if (create_result.regen_derived_files || - force_recovery_and_revalidate_documents || - create_result.log_create_result.has_data_loss()) { + InitializeStatsProto::RecoveryCause recovery_cause = + GetRecoveryCause(create_result, force_recovery_and_revalidate_documents); + + if (recovery_cause != InitializeStatsProto::NONE || create_result.new_file) { + ICING_LOG(WARNING) << "Starting Document Store Recovery with cause=" + << recovery_cause << ", and create result { new_file=" + << create_result.new_file << ", preexisting_file_version=" + << create_result.preexisting_file_version << ", data_loss=" + << create_result.log_create_result.data_loss << "} and kCurrentVersion=" + << DocumentLogCreator::kCurrentVersion; // We can't rely on any existing derived files. Recreate them from scratch. // Currently happens if: // 1) This is a new log and we don't have derived files yet // 2) Client wanted us to force a regeneration. // 3) Log has some data loss, can't rely on existing derived data. - if (create_result.log_create_result.has_data_loss() && - initialize_stats != nullptr) { - ICING_LOG(WARNING) - << "Data loss in document log, regenerating derived files."; - initialize_stats->set_document_store_recovery_cause( - InitializeStatsProto::DATA_LOSS); - - if (create_result.log_create_result.data_loss == DataLoss::PARTIAL) { - // Ground truth is partially lost. - initialize_stats->set_document_store_data_status( - InitializeStatsProto::PARTIAL_LOSS); - } else { - // Ground truth is completely lost.
- initialize_stats->set_document_store_data_status( - InitializeStatsProto::COMPLETE_LOSS); - } - } - std::unique_ptr<Timer> document_recovery_timer = clock_.GetNewTimer(); libtextclassifier3::Status status = RegenerateDerivedFiles(force_recovery_and_revalidate_documents); if (initialize_stats != nullptr && - (force_recovery_and_revalidate_documents || - create_result.log_create_result.has_data_loss())) { + recovery_cause != InitializeStatsProto::NONE) { // Only consider it a recovery if the client forced a recovery or there // was data loss. Otherwise, this could just be the first time we're // initializing and generating derived files. initialize_stats->set_document_store_recovery_latency_ms( document_recovery_timer->GetElapsedMilliseconds()); + initialize_stats->set_document_store_recovery_cause(recovery_cause); + initialize_stats->set_document_store_data_status( + GetDataStatus(create_result.log_create_result.data_loss)); } if (!status.ok()) { ICING_LOG(ERROR) @@ -282,13 +298,13 @@ libtextclassifier3::StatusOr<DataLoss> DocumentStore::Initialize( } } else { if (!InitializeExistingDerivedFiles().ok()) { - ICING_VLOG(1) + ICING_LOG(WARNING) << "Couldn't find derived files or failed to initialize them, " "regenerating derived files for DocumentStore."; std::unique_ptr<Timer> document_recovery_timer = clock_.GetNewTimer(); libtextclassifier3::Status status = RegenerateDerivedFiles( - /*force_recovery_and_revalidate_documents*/ false); - if (initialize_stats != nullptr && num_documents() > 0) { + /*force_recovery_and_revalidate_documents=*/false); + if (initialize_stats != nullptr) { initialize_stats->set_document_store_recovery_cause( InitializeStatsProto::IO_ERROR); initialize_stats->set_document_store_recovery_latency_ms( @@ -530,7 +546,7 @@ libtextclassifier3::Status DocumentStore::RegenerateDerivedFiles( libtextclassifier3::Status DocumentStore::ResetDocumentKeyMapper() { // TODO(b/139734457): Replace ptr.reset()->Delete->Create flow with Reset(). document_key_mapper_.reset(); - // TODO(b/144458732): Implement a more robust version of TC_RETURN_IF_ERROR + // TODO(b/216487496): Implement a more robust version of TC_RETURN_IF_ERROR // that can support error logging. libtextclassifier3::Status status = KeyMapper<DocumentId>::Delete(*filesystem_, base_dir_); @@ -540,7 +556,7 @@ libtextclassifier3::Status DocumentStore::ResetDocumentKeyMapper() { return status; } - // TODO(b/144458732): Implement a more robust version of TC_ASSIGN_OR_RETURN + // TODO(b/216487496): Implement a more robust version of TC_ASSIGN_OR_RETURN // that can support error logging. auto document_key_mapper_or = KeyMapper<DocumentId>::Create(*filesystem_, base_dir_, kUriMapperMaxSize); @@ -556,7 +572,7 @@ libtextclassifier3::Status DocumentStore::ResetDocumentKeyMapper() { libtextclassifier3::Status DocumentStore::ResetDocumentIdMapper() { // TODO(b/139734457): Replace ptr.reset()->Delete->Create flow with Reset(). document_id_mapper_.reset(); - // TODO(b/144458732): Implement a more robust version of TC_RETURN_IF_ERROR + // TODO(b/216487496): Implement a more robust version of TC_RETURN_IF_ERROR // that can support error logging. 
libtextclassifier3::Status status = FileBackedVector<int64_t>::Delete( *filesystem_, MakeDocumentIdMapperFilename(base_dir_)); @@ -565,7 +581,7 @@ libtextclassifier3::Status DocumentStore::ResetDocumentIdMapper() { << "Failed to delete old document_id mapper"; return status; } - // TODO(b/144458732): Implement a more robust version of TC_ASSIGN_OR_RETURN + // TODO(b/216487496): Implement a more robust version of TC_ASSIGN_OR_RETURN // that can support error logging. auto document_id_mapper_or = FileBackedVector<int64_t>::Create( *filesystem_, MakeDocumentIdMapperFilename(base_dir_), @@ -618,7 +634,7 @@ libtextclassifier3::Status DocumentStore::ResetFilterCache() { libtextclassifier3::Status DocumentStore::ResetNamespaceMapper() { // TODO(b/139734457): Replace ptr.reset()->Delete->Create flow with Reset(). namespace_mapper_.reset(); - // TODO(b/144458732): Implement a more robust version of TC_RETURN_IF_ERROR + // TODO(b/216487496): Implement a more robust version of TC_RETURN_IF_ERROR // that can support error logging. libtextclassifier3::Status status = KeyMapper<NamespaceId>::Delete( *filesystem_, MakeNamespaceMapperFilename(base_dir_)); @@ -638,7 +654,7 @@ libtextclassifier3::Status DocumentStore::ResetNamespaceMapper() { libtextclassifier3::Status DocumentStore::ResetCorpusMapper() { // TODO(b/139734457): Replace ptr.reset()->Delete->Create flow with Reset(). corpus_mapper_.reset(); - // TODO(b/144458732): Implement a more robust version of TC_RETURN_IF_ERROR + // TODO(b/216487496): Implement a more robust version of TC_RETURN_IF_ERROR // that can support error logging. libtextclassifier3::Status status = KeyMapper<CorpusId>::Delete( *filesystem_, MakeCorpusMapperFilename(base_dir_)); @@ -1749,5 +1765,63 @@ libtextclassifier3::Status DocumentStore::SetUsageScores( return usage_store_->SetUsageScores(document_id, usage_scores); } +libtextclassifier3::StatusOr< + google::protobuf::RepeatedPtrField<DocumentDebugInfoProto::CorpusInfo>> +DocumentStore::CollectCorpusInfo() const { + google::protobuf::RepeatedPtrField<DocumentDebugInfoProto::CorpusInfo> + corpus_info; + libtextclassifier3::StatusOr<const SchemaProto*> schema_proto_or = + schema_store_->GetSchema(); + if (!schema_proto_or.ok()) { + return corpus_info; + } + // Maps from CorpusId to the corresponding protocol buffer in the result. 
+ std::unordered_map<CorpusId, DocumentDebugInfoProto::CorpusInfo*> info_map; + std::unordered_map<NamespaceId, std::string> namespace_id_to_namespace = + namespace_mapper_->GetValuesToKeys(); + const SchemaProto* schema_proto = schema_proto_or.ValueOrDie(); + for (DocumentId document_id = 0; document_id < filter_cache_->num_elements(); + ++document_id) { + if (!InternalDoesDocumentExist(document_id)) { + continue; + } + ICING_ASSIGN_OR_RETURN(const DocumentFilterData* filter_data, + filter_cache_->Get(document_id)); + ICING_ASSIGN_OR_RETURN(const DocumentAssociatedScoreData* score_data, + score_cache_->Get(document_id)); + const std::string& name_space = + namespace_id_to_namespace[filter_data->namespace_id()]; + const std::string& schema = + schema_proto->types()[filter_data->schema_type_id()].schema_type(); + auto iter = info_map.find(score_data->corpus_id()); + if (iter == info_map.end()) { + DocumentDebugInfoProto::CorpusInfo* entry = corpus_info.Add(); + entry->set_namespace_(name_space); + entry->set_schema(schema); + iter = info_map.insert({score_data->corpus_id(), entry}).first; + } + iter->second->set_total_documents(iter->second->total_documents() + 1); + iter->second->set_total_token(iter->second->total_token() + + score_data->length_in_tokens()); + } + return corpus_info; +} + +libtextclassifier3::StatusOr<DocumentDebugInfoProto> +DocumentStore::GetDebugInfo(int verbosity) const { + DocumentDebugInfoProto debug_info; + *debug_info.mutable_document_storage_info() = GetStorageInfo(); + ICING_ASSIGN_OR_RETURN(Crc32 crc, ComputeChecksum()); + debug_info.set_crc(crc.Get()); + if (verbosity > 0) { + ICING_ASSIGN_OR_RETURN(google::protobuf::RepeatedPtrField< + DocumentDebugInfoProto::CorpusInfo> + corpus_info, + CollectCorpusInfo()); + *debug_info.mutable_corpus_info() = std::move(corpus_info); + } + return debug_info; +} + } // namespace lib } // namespace icing diff --git a/icing/store/document-store.h b/icing/store/document-store.h index c85c989..e6d2e5c 100644 --- a/icing/store/document-store.h +++ b/icing/store/document-store.h @@ -27,6 +27,7 @@ #include "icing/file/file-backed-vector.h" #include "icing/file/filesystem.h" #include "icing/file/portable-file-backed-proto-log.h" +#include "icing/proto/debug.pb.h" #include "icing/proto/document.pb.h" #include "icing/proto/document_wrapper.pb.h" #include "icing/proto/logging.pb.h" @@ -422,6 +423,17 @@ class DocumentStore { // INTERNAL_ERROR on compute error libtextclassifier3::StatusOr<Crc32> ComputeChecksum() const; + // Get debug information for the document store. + // verbosity <= 0, simplest debug information + // verbosity > 0, also return the total number of documents and tokens in each + // (namespace, schema type) pair. + // + // Returns: + // DocumentDebugInfoProto on success + // INTERNAL_ERROR on IO errors, crc compute error + libtextclassifier3::StatusOr<DocumentDebugInfoProto> GetDebugInfo( + int verbosity) const; + private: // Use DocumentStore::Create() to instantiate. DocumentStore(const Filesystem* filesystem, std::string_view base_dir, @@ -696,6 +708,13 @@ class DocumentStore { // the document_id_mapper somehow became larger than the filter cache. DocumentStorageInfoProto CalculateDocumentStatusCounts( DocumentStorageInfoProto storage_info) const; + + // Returns: + // - on success, a RepeatedPtrField for CorpusInfo collected. + // - OUT_OF_RANGE, this should never happen. 
+ libtextclassifier3::StatusOr<google::protobuf::RepeatedPtrField< + DocumentDebugInfoProto::CorpusInfo>> + CollectCorpusInfo() const; }; } // namespace lib diff --git a/icing/store/document-store_test.cc b/icing/store/document-store_test.cc index 78d2f9c..96d11bf 100644 --- a/icing/store/document-store_test.cc +++ b/icing/store/document-store_test.cc @@ -29,7 +29,6 @@ #include "icing/file/filesystem.h" #include "icing/file/memory-mapped-file.h" #include "icing/file/mock-filesystem.h" -#include "icing/helpers/icu/icu-data-file-helper.h" #include "icing/portable/equals-proto.h" #include "icing/portable/platform.h" #include "icing/proto/document.pb.h" @@ -45,6 +44,7 @@ #include "icing/store/namespace-id.h" #include "icing/testing/common-matchers.h" #include "icing/testing/fake-clock.h" +#include "icing/testing/icu-data-file-helper.h" #include "icing/testing/test-data.h" #include "icing/testing/tmp-directory.h" #include "icing/tokenization/language-segmenter-factory.h" @@ -3170,15 +3170,6 @@ TEST_F(DocumentStoreTest, DetectCompleteDataLoss) { ASSERT_THAT(create_result.data_loss, Eq(DataLoss::COMPLETE)); } -// TODO(b/185845269) Re-enable this test by copying over a full valid set of -// document store files. Right now this test only includes the score_cache and -// the document store header. -// -// This causes a problem now because this cl changes behavior to not consider an -// InitializeExistingDerivedFiles failure to be a recovery if there is nothing -// to recover because the doocument store is empty. -#define DISABLE_BACKWARDS_COMPAT_TEST -#ifndef DISABLE_BACKWARDS_COMPAT_TEST TEST_F(DocumentStoreTest, LoadScoreCacheAndInitializeSuccessfully) { // The directory testdata/score_cache_without_length_in_tokens/document_store // contains only the scoring_cache and the document_store_header (holding the @@ -3194,29 +3185,26 @@ TEST_F(DocumentStoreTest, LoadScoreCacheAndInitializeSuccessfully) { // Get src files std::string document_store_without_length_in_tokens; - if (IsAndroidPlatform() || IsIosPlatform()) { + if (IsAndroidArm() || IsIosPlatform()) { document_store_without_length_in_tokens = GetTestFilePath( "icing/testdata/score_cache_without_length_in_tokens/" "document_store_android_ios_compatible"); + } else if (IsAndroidX86()) { + document_store_without_length_in_tokens = GetTestFilePath( + "icing/testdata/score_cache_without_length_in_tokens/" + "document_store_android_x86"); } else { document_store_without_length_in_tokens = GetTestFilePath( "icing/testdata/score_cache_without_length_in_tokens/" "document_store"); } - std::vector<std::string> document_store_files; Filesystem filesystem; - filesystem.ListDirectory(document_store_without_length_in_tokens.c_str(), - &document_store_files); - - ICING_LOG(INFO) << "Copying files " << document_store_without_length_in_tokens - << ' ' << document_store_files.size(); - for (size_t i = 0; i != document_store_files.size(); i++) { - std::string src = absl_ports::StrCat( - document_store_without_length_in_tokens, "/", document_store_files[i]); - std::string dst = - absl_ports::StrCat(document_store_dir_, "/", document_store_files[i]); - ASSERT_THAT(filesystem_.CopyFile(src.c_str(), dst.c_str()), true); - } + ICING_LOG(INFO) << "Copying files " + << document_store_without_length_in_tokens; + ASSERT_THAT( + filesystem.CopyDirectory(document_store_without_length_in_tokens.c_str(), + document_store_dir_.c_str(), /*recursive=*/true), + true); InitializeStatsProto initialize_stats; ICING_ASSERT_OK_AND_ASSIGN( @@ -3227,12 +3215,11 @@ 
TEST_F(DocumentStoreTest, LoadScoreCacheAndInitializeSuccessfully) { &initialize_stats)); std::unique_ptr<DocumentStore> doc_store = std::move(create_result.document_store); - // The store_cache trigger regeneration because its element size is - // inconsistent: expected 20 (current new size), actual 12 (as per the v0 - // score_cache). - EXPECT_TRUE(initialize_stats.has_document_store_recovery_cause()); + // The document log is using the legacy v0 format, so a migration is + // needed, which will also trigger regeneration. + EXPECT_EQ(initialize_stats.document_store_recovery_cause(), + InitializeStatsProto::LEGACY_DOCUMENT_LOG_FORMAT); } -#endif // DISABLE_BACKWARDS_COMPAT_TEST TEST_F(DocumentStoreTest, DocumentStoreStorageInfo) { ICING_ASSERT_OK_AND_ASSIGN( @@ -3422,18 +3409,22 @@ TEST_F(DocumentStoreTest, InitializeForceRecoveryUpdatesTypeIds) { { // Create the document store the second time and force recovery + InitializeStatsProto initialize_stats; ICING_ASSERT_OK_AND_ASSIGN( DocumentStore::CreateResult create_result, - DocumentStore::Create( - &filesystem_, document_store_dir_, &fake_clock_, schema_store.get(), - /*force_recovery_and_revalidate_documents=*/true)); + DocumentStore::Create(&filesystem_, document_store_dir_, &fake_clock_, + schema_store.get(), + /*force_recovery_and_revalidate_documents=*/true, + &initialize_stats)); std::unique_ptr<DocumentStore> doc_store = std::move(create_result.document_store); // Ensure that the type id of the email document has been correctly updated. ICING_ASSERT_OK_AND_ASSIGN(DocumentFilterData filter_data, doc_store->GetDocumentFilterData(docid)); - ASSERT_THAT(filter_data.schema_type_id(), Eq(1)); + EXPECT_THAT(filter_data.schema_type_id(), Eq(1)); + EXPECT_THAT(initialize_stats.document_store_recovery_cause(), + Eq(InitializeStatsProto::SCHEMA_CHANGES_OUT_OF_SYNC)); } } @@ -3840,7 +3831,8 @@ TEST_F(DocumentStoreTest, MigrateToPortableFileBackedProtoLog) { // Check that we didn't lose anything. A migration also doesn't technically // count as a recovery. EXPECT_THAT(create_result.data_loss, Eq(DataLoss::NONE)); - EXPECT_FALSE(initialize_stats.has_document_store_recovery_cause()); + EXPECT_EQ(initialize_stats.document_store_recovery_cause(), + InitializeStatsProto::LEGACY_DOCUMENT_LOG_FORMAT); // Document 1 and 3 were put normally, and document 2 was deleted in our // testdata files.
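Taken together, GetRecoveryCause and the stats plumbing above let callers tell a legacy-format migration apart from genuine data loss. A hedged sketch of reading the stats after creation, in the style of the tests above; the handling in each case is illustrative, not prescribed by this change:

InitializeStatsProto initialize_stats;
ICING_ASSERT_OK_AND_ASSIGN(
    DocumentStore::CreateResult create_result,
    DocumentStore::Create(&filesystem_, document_store_dir_, &fake_clock_,
                          schema_store.get(),
                          /*force_recovery_and_revalidate_documents=*/false,
                          &initialize_stats));
switch (initialize_stats.document_store_recovery_cause()) {
  case InitializeStatsProto::LEGACY_DOCUMENT_LOG_FORMAT:
    // A v0 log was migrated to the portable format; derived files were
    // regenerated, but no documents were lost.
    break;
  case InitializeStatsProto::DATA_LOSS:
    // document_store_data_status() distinguishes PARTIAL_LOSS from
    // COMPLETE_LOSS.
    break;
  default:
    break;
}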
@@ -3862,6 +3854,164 @@ TEST_F(DocumentStoreTest, MigrateToPortableFileBackedProtoLog) { IsOkAndHolds(EqualsProto(document3))); } +TEST_F(DocumentStoreTest, GetDebugInfo) { + SchemaProto schema = + SchemaBuilder() + .AddType(SchemaTypeConfigBuilder() + .SetType("email") + .AddProperty( + PropertyConfigBuilder() + .SetName("subject") + .SetDataTypeString(MATCH_EXACT, TOKENIZER_PLAIN) + .SetCardinality(CARDINALITY_OPTIONAL)) + .AddProperty( + PropertyConfigBuilder() + .SetName("body") + .SetDataTypeString(MATCH_EXACT, TOKENIZER_PLAIN) + .SetCardinality(CARDINALITY_OPTIONAL))) + .AddType(SchemaTypeConfigBuilder().SetType("person").AddProperty( + PropertyConfigBuilder() + .SetName("name") + .SetDataTypeString(MATCH_EXACT, TOKENIZER_PLAIN) + .SetCardinality(CARDINALITY_OPTIONAL))) + .Build(); + std::string schema_store_dir = schema_store_dir_ + "_custom"; + filesystem_.DeleteDirectoryRecursively(schema_store_dir.c_str()); + filesystem_.CreateDirectoryRecursively(schema_store_dir.c_str()); + ICING_ASSERT_OK_AND_ASSIGN( + std::unique_ptr<SchemaStore> schema_store, + SchemaStore::Create(&filesystem_, schema_store_dir, &fake_clock_)); + + ICING_ASSERT_OK(schema_store->SetSchema(schema)); + + ICING_ASSERT_OK_AND_ASSIGN( + DocumentStore::CreateResult create_result, + DocumentStore::Create(&filesystem_, document_store_dir_, &fake_clock_, + schema_store.get())); + std::unique_ptr<DocumentStore> document_store = + std::move(create_result.document_store); + + DocumentProto document1 = DocumentBuilder() + .SetKey("namespace1", "email/1") + .SetSchema("email") + .AddStringProperty("subject", "aa bb cc") + .AddStringProperty("body", "dd ee") + .SetCreationTimestampMs(1) + .Build(); + ICING_ASSERT_OK(document_store->Put(document1, 5)); + + DocumentProto document2 = DocumentBuilder() + .SetKey("namespace2", "email/2") + .SetSchema("email") + .AddStringProperty("subject", "aa bb") + .AddStringProperty("body", "cc") + .SetCreationTimestampMs(1) + .Build(); + ICING_ASSERT_OK(document_store->Put(document2, 3)); + + DocumentProto document3 = DocumentBuilder() + .SetKey("namespace2", "email/3") + .SetSchema("email") + .AddStringProperty("subject", "aa") + .AddStringProperty("body", "") + .SetCreationTimestampMs(1) + .Build(); + ICING_ASSERT_OK(document_store->Put(document3, 1)); + + DocumentProto document4 = DocumentBuilder() + .SetKey("namespace1", "person/1") + .SetSchema("person") + .AddStringProperty("name", "test test") + .SetCreationTimestampMs(1) + .Build(); + ICING_ASSERT_OK(document_store->Put(document4, 2)); + + ICING_ASSERT_OK_AND_ASSIGN(DocumentDebugInfoProto out1, + document_store->GetDebugInfo(/*verbosity=*/1)); + EXPECT_THAT(out1.crc(), Gt(0)); + EXPECT_THAT(out1.document_storage_info().num_alive_documents(), Eq(4)); + EXPECT_THAT(out1.document_storage_info().num_deleted_documents(), Eq(0)); + EXPECT_THAT(out1.document_storage_info().num_expired_documents(), Eq(0)); + + DocumentDebugInfoProto::CorpusInfo info1, info2, info3; + info1.set_namespace_("namespace1"); + info1.set_schema("email"); + info1.set_total_documents(1); // document1 + info1.set_total_token(5); + + info2.set_namespace_("namespace2"); + info2.set_schema("email"); + info2.set_total_documents(2); // document2 and document3 + info2.set_total_token(4); // 3 + 1 + + info3.set_namespace_("namespace1"); + info3.set_schema("person"); + info3.set_total_documents(1); // document4 + info3.set_total_token(2); + + EXPECT_THAT(out1.corpus_info(), + UnorderedElementsAre(EqualsProto(info1), EqualsProto(info2), + EqualsProto(info3))); + + // Delete 
document3. + ICING_ASSERT_OK(document_store->Delete("namespace2", "email/3")); + ICING_ASSERT_OK_AND_ASSIGN(DocumentDebugInfoProto out2, + document_store->GetDebugInfo(/*verbosity=*/1)); + EXPECT_THAT(out2.crc(), Gt(0)); + EXPECT_THAT(out2.crc(), Not(Eq(out1.crc()))); + EXPECT_THAT(out2.document_storage_info().num_alive_documents(), Eq(3)); + EXPECT_THAT(out2.document_storage_info().num_deleted_documents(), Eq(1)); + EXPECT_THAT(out2.document_storage_info().num_expired_documents(), Eq(0)); + info2.set_total_documents(1); // document2 + info2.set_total_token(3); + EXPECT_THAT(out2.corpus_info(), + UnorderedElementsAre(EqualsProto(info1), EqualsProto(info2), + EqualsProto(info3))); + + ICING_ASSERT_OK_AND_ASSIGN(DocumentDebugInfoProto out3, + document_store->GetDebugInfo(/*verbosity=*/0)); + EXPECT_THAT(out3.corpus_info(), IsEmpty()); +} + +TEST_F(DocumentStoreTest, GetDebugInfoWithoutSchema) { + std::string schema_store_dir = schema_store_dir_ + "_custom"; + filesystem_.DeleteDirectoryRecursively(schema_store_dir.c_str()); + filesystem_.CreateDirectoryRecursively(schema_store_dir.c_str()); + ICING_ASSERT_OK_AND_ASSIGN( + std::unique_ptr<SchemaStore> schema_store, + SchemaStore::Create(&filesystem_, schema_store_dir, &fake_clock_)); + + ICING_ASSERT_OK_AND_ASSIGN( + DocumentStore::CreateResult create_result, + DocumentStore::Create(&filesystem_, document_store_dir_, &fake_clock_, + schema_store.get())); + std::unique_ptr<DocumentStore> document_store = + std::move(create_result.document_store); + ICING_ASSERT_OK_AND_ASSIGN(DocumentDebugInfoProto out, + document_store->GetDebugInfo(/*verbosity=*/1)); + EXPECT_THAT(out.crc(), Gt(0)); + EXPECT_THAT(out.document_storage_info().num_alive_documents(), Eq(0)); + EXPECT_THAT(out.document_storage_info().num_deleted_documents(), Eq(0)); + EXPECT_THAT(out.document_storage_info().num_expired_documents(), Eq(0)); + EXPECT_THAT(out.corpus_info(), IsEmpty()); +} + +TEST_F(DocumentStoreTest, GetDebugInfoForEmptyDocumentStore) { + ICING_ASSERT_OK_AND_ASSIGN( + DocumentStore::CreateResult create_result, + DocumentStore::Create(&filesystem_, document_store_dir_, &fake_clock_, + schema_store_.get())); + std::unique_ptr<DocumentStore> document_store = + std::move(create_result.document_store); + ICING_ASSERT_OK_AND_ASSIGN(DocumentDebugInfoProto out, + document_store->GetDebugInfo(/*verbosity=*/1)); + EXPECT_THAT(out.crc(), Gt(0)); + EXPECT_THAT(out.document_storage_info().num_alive_documents(), Eq(0)); + EXPECT_THAT(out.document_storage_info().num_deleted_documents(), Eq(0)); + EXPECT_THAT(out.document_storage_info().num_expired_documents(), Eq(0)); + EXPECT_THAT(out.corpus_info(), IsEmpty()); +} + } // namespace } // namespace lib diff --git a/icing/helpers/icu/icu-data-file-helper.cc b/icing/testing/icu-data-file-helper.cc index 6607c40..aaeb738 100644 --- a/icing/helpers/icu/icu-data-file-helper.cc +++ b/icing/testing/icu-data-file-helper.cc @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "icing/helpers/icu/icu-data-file-helper.h" +#include "icing/testing/icu-data-file-helper.h" #include <sys/mman.h> diff --git a/icing/helpers/icu/icu-data-file-helper.h b/icing/testing/icu-data-file-helper.h index 90f5bc7..d0276e7 100644 --- a/icing/helpers/icu/icu-data-file-helper.h +++ b/icing/testing/icu-data-file-helper.h @@ -12,8 +12,8 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#ifndef ICING_HELPERS_ICU_ICU_DATA_FILE_HELPER -#define ICING_HELPERS_ICU_ICU_DATA_FILE_HELPER +#ifndef ICING_TESTING_ICU_DATA_FILE_HELPER +#define ICING_TESTING_ICU_DATA_FILE_HELPER #include "icing/text_classifier/lib3/utils/base/status.h" @@ -40,4 +40,4 @@ libtextclassifier3::Status SetUpICUDataFile( } // namespace lib } // namespace icing -#endif // ICING_HELPERS_ICU_ICU_DATA_FILE_HELPER +#endif // ICING_TESTING_ICU_DATA_FILE_HELPER diff --git a/icing/testing/random-string.h b/icing/testing/random-string.h index 3165bf6..fd8d87b 100644 --- a/icing/testing/random-string.h +++ b/icing/testing/random-string.h @@ -15,6 +15,7 @@ #ifndef ICING_TESTING_RANDOM_STRING_H_ #define ICING_TESTING_RANDOM_STRING_H_ +#include <algorithm> #include <random> #include <string> diff --git a/icing/tokenization/icu/icu-language-segmenter.cc b/icing/tokenization/icu/icu-language-segmenter.cc index 598ede7..8e0f789 100644 --- a/icing/tokenization/icu/icu-language-segmenter.cc +++ b/icing/tokenization/icu/icu-language-segmenter.cc @@ -59,34 +59,35 @@ class IcuLanguageSegmenterIterator : public LanguageSegmenter::Iterator { ~IcuLanguageSegmenterIterator() { ubrk_close(break_iterator_); - utext_close(&u_text_); + utext_close(u_text_); } // Advances to the next term. Returns false if it has reached the end. bool Advance() override { - // Prerequisite check - if (term_end_index_exclusive_ == UBRK_DONE) { - return false; - } + while (true) { + // Prerequisite check + if (term_end_index_exclusive_ == UBRK_DONE) { + return false; + } - if (term_end_index_exclusive_ == 0) { - // First Advance() call - term_start_index_ = ubrk_first(break_iterator_); - } else { - term_start_index_ = term_end_index_exclusive_; - } - term_end_index_exclusive_ = ubrk_next(break_iterator_); + if (term_end_index_exclusive_ == 0) { + // First Advance() call + term_start_index_ = ubrk_first(break_iterator_); + } else { + term_start_index_ = term_end_index_exclusive_; + } + term_end_index_exclusive_ = ubrk_next(break_iterator_); - // Reached the end - if (term_end_index_exclusive_ == UBRK_DONE) { - MarkAsDone(); - return false; - } + // Reached the end + if (term_end_index_exclusive_ == UBRK_DONE) { + MarkAsDone(); + return false; + } - if (!IsValidSegment()) { - return Advance(); + if (IsValidSegment()) { + return true; + } } - return true; } // Returns the current term. It can be called only when Advance() returns @@ -253,7 +254,7 @@ class IcuLanguageSegmenterIterator : public LanguageSegmenter::Iterator { : break_iterator_(nullptr), text_(text), locale_(locale), - u_text_(UTEXT_INITIALIZER), + u_text_(nullptr), offset_iterator_(text), term_start_index_(0), term_end_index_exclusive_(0) {} @@ -261,10 +262,13 @@ class IcuLanguageSegmenterIterator : public LanguageSegmenter::Iterator { // Returns true on success bool Initialize() { UErrorCode status = U_ZERO_ERROR; - utext_openUTF8(&u_text_, text_.data(), text_.length(), &status); + u_text_ = utext_openUTF8(nullptr, text_.data(), text_.length(), &status); + if (u_text_ == nullptr) { + return false; + } break_iterator_ = ubrk_open(UBRK_WORD, locale_.data(), /*text=*/nullptr, /*textLength=*/0, &status); - ubrk_setUText(break_iterator_, &u_text_, &status); + ubrk_setUText(break_iterator_, u_text_, &status); return !U_FAILURE(status); } @@ -322,8 +326,8 @@ class IcuLanguageSegmenterIterator : public LanguageSegmenter::Iterator { std::string_view locale_; // A thin wrapper around the input UTF8 text, needed by break_iterator_. - // utext_close() must be called after using. 
- UText u_text_; + // Allocated by calling utext_openUTF8() and freed by calling utext_close(). + UText* u_text_; // Offset iterator. This iterator is not guaranteed to point to any particular // character, but is guaranteed to point to a valid UTF character sequence. diff --git a/icing/tokenization/icu/icu-language-segmenter_test.cc b/icing/tokenization/icu/icu-language-segmenter_test.cc index 3090087..fe0b96e 100644 --- a/icing/tokenization/icu/icu-language-segmenter_test.cc +++ b/icing/tokenization/icu/icu-language-segmenter_test.cc @@ -21,8 +21,8 @@ #include "gmock/gmock.h" #include "gtest/gtest.h" #include "icing/absl_ports/str_cat.h" -#include "icing/helpers/icu/icu-data-file-helper.h" #include "icing/testing/common-matchers.h" +#include "icing/testing/icu-data-file-helper.h" #include "icing/testing/icu-i18n-test-utils.h" #include "icing/testing/jni-test-helpers.h" #include "icing/testing/test-data.h" diff --git a/icing/tokenization/language-segmenter-iterator_test.cc b/icing/tokenization/language-segmenter-iterator_test.cc index d293581..3aff45c 100644 --- a/icing/tokenization/language-segmenter-iterator_test.cc +++ b/icing/tokenization/language-segmenter-iterator_test.cc @@ -15,9 +15,9 @@ #include "gmock/gmock.h" #include "gtest/gtest.h" #include "icing/absl_ports/str_cat.h" -#include "icing/helpers/icu/icu-data-file-helper.h" #include "icing/portable/platform.h" #include "icing/testing/common-matchers.h" +#include "icing/testing/icu-data-file-helper.h" #include "icing/testing/jni-test-helpers.h" #include "icing/testing/test-data.h" #include "icing/tokenization/language-segmenter-factory.h" diff --git a/icing/tokenization/language-segmenter_benchmark.cc b/icing/tokenization/language-segmenter_benchmark.cc index bd86169..6f7d4df 100644 --- a/icing/tokenization/language-segmenter_benchmark.cc +++ b/icing/tokenization/language-segmenter_benchmark.cc @@ -14,8 +14,8 @@ #include "testing/base/public/benchmark.h" #include "gmock/gmock.h" -#include "icing/helpers/icu/icu-data-file-helper.h" #include "icing/testing/common-matchers.h" +#include "icing/testing/icu-data-file-helper.h" #include "icing/testing/test-data.h" #include "icing/tokenization/language-segmenter-factory.h" #include "icing/tokenization/language-segmenter.h" diff --git a/icing/tokenization/plain-tokenizer.cc b/icing/tokenization/plain-tokenizer.cc index df98bee..7a1949f 100644 --- a/icing/tokenization/plain-tokenizer.cc +++ b/icing/tokenization/plain-tokenizer.cc @@ -66,9 +66,9 @@ class PlainTokenIterator : public Tokenizer::Iterator { Token GetToken() const override { if (current_term_.empty()) { - return Token(Token::INVALID); + return Token(Token::Type::INVALID); } - return Token(Token::REGULAR, current_term_); + return Token(Token::Type::REGULAR, current_term_); } libtextclassifier3::StatusOr<CharacterIterator> CalculateTokenStart() diff --git a/icing/tokenization/plain-tokenizer_test.cc b/icing/tokenization/plain-tokenizer_test.cc index f76ba9f..c48b51e 100644 --- a/icing/tokenization/plain-tokenizer_test.cc +++ b/icing/tokenization/plain-tokenizer_test.cc @@ -18,9 +18,9 @@ #include "gmock/gmock.h" #include "icing/absl_ports/str_cat.h" -#include "icing/helpers/icu/icu-data-file-helper.h" #include "icing/portable/platform.h" #include "icing/testing/common-matchers.h" +#include "icing/testing/icu-data-file-helper.h" #include "icing/testing/icu-i18n-test-utils.h" #include "icing/testing/jni-test-helpers.h" #include "icing/testing/test-data.h" @@ -68,26 +68,27 @@ TEST_F(PlainTokenizerTest, Simple) {
EXPECT_THAT(plain_tokenizer->TokenizeAll(""), IsOkAndHolds(IsEmpty())); - EXPECT_THAT(plain_tokenizer->TokenizeAll("Hello World"), - IsOkAndHolds(ElementsAre(EqualsToken(Token::REGULAR, "Hello"), - EqualsToken(Token::REGULAR, "World")))); + EXPECT_THAT( + plain_tokenizer->TokenizeAll("Hello World"), + IsOkAndHolds(ElementsAre(EqualsToken(Token::Type::REGULAR, "Hello"), + EqualsToken(Token::Type::REGULAR, "World")))); EXPECT_THAT( plain_tokenizer->TokenizeAll( "Lorem ipsum dolor sit amet, consectetur adipiscing elit. " "Duis efficitur iaculis auctor."), - IsOkAndHolds(ElementsAre(EqualsToken(Token::REGULAR, "Lorem"), - EqualsToken(Token::REGULAR, "ipsum"), - EqualsToken(Token::REGULAR, "dolor"), - EqualsToken(Token::REGULAR, "sit"), - EqualsToken(Token::REGULAR, "amet"), - EqualsToken(Token::REGULAR, "consectetur"), - EqualsToken(Token::REGULAR, "adipiscing"), - EqualsToken(Token::REGULAR, "elit"), - EqualsToken(Token::REGULAR, "Duis"), - EqualsToken(Token::REGULAR, "efficitur"), - EqualsToken(Token::REGULAR, "iaculis"), - EqualsToken(Token::REGULAR, "auctor")))); + IsOkAndHolds(ElementsAre(EqualsToken(Token::Type::REGULAR, "Lorem"), + EqualsToken(Token::Type::REGULAR, "ipsum"), + EqualsToken(Token::Type::REGULAR, "dolor"), + EqualsToken(Token::Type::REGULAR, "sit"), + EqualsToken(Token::Type::REGULAR, "amet"), + EqualsToken(Token::Type::REGULAR, "consectetur"), + EqualsToken(Token::Type::REGULAR, "adipiscing"), + EqualsToken(Token::Type::REGULAR, "elit"), + EqualsToken(Token::Type::REGULAR, "Duis"), + EqualsToken(Token::Type::REGULAR, "efficitur"), + EqualsToken(Token::Type::REGULAR, "iaculis"), + EqualsToken(Token::Type::REGULAR, "auctor")))); } TEST_F(PlainTokenizerTest, Whitespace) { @@ -107,16 +108,18 @@ TEST_F(PlainTokenizerTest, Whitespace) { // 0x0009 is horizontal tab, considered as a whitespace std::string text_with_horizontal_tab = absl_ports::StrCat("Hello", UCharToString(0x0009), "World"); - EXPECT_THAT(plain_tokenizer->TokenizeAll(text_with_horizontal_tab), - IsOkAndHolds(ElementsAre(EqualsToken(Token::REGULAR, "Hello"), - EqualsToken(Token::REGULAR, "World")))); + EXPECT_THAT( + plain_tokenizer->TokenizeAll(text_with_horizontal_tab), + IsOkAndHolds(ElementsAre(EqualsToken(Token::Type::REGULAR, "Hello"), + EqualsToken(Token::Type::REGULAR, "World")))); // 0x000B is vertical tab, considered as a whitespace std::string text_with_vertical_tab = absl_ports::StrCat("Hello", UCharToString(0x000B), "World"); - EXPECT_THAT(plain_tokenizer->TokenizeAll(text_with_vertical_tab), - IsOkAndHolds(ElementsAre(EqualsToken(Token::REGULAR, "Hello"), - EqualsToken(Token::REGULAR, "World")))); + EXPECT_THAT( + plain_tokenizer->TokenizeAll(text_with_vertical_tab), + IsOkAndHolds(ElementsAre(EqualsToken(Token::Type::REGULAR, "Hello"), + EqualsToken(Token::Type::REGULAR, "World")))); } TEST_F(PlainTokenizerTest, Punctuation) { @@ -131,38 +134,39 @@ TEST_F(PlainTokenizerTest, Punctuation) { language_segmenter.get())); // Half-width punctuation marks are filtered out. - EXPECT_THAT(plain_tokenizer->TokenizeAll( - "Hello, World! Hello: World. \"Hello\" World?"), - IsOkAndHolds(ElementsAre(EqualsToken(Token::REGULAR, "Hello"), - EqualsToken(Token::REGULAR, "World"), - EqualsToken(Token::REGULAR, "Hello"), - EqualsToken(Token::REGULAR, "World"), - EqualsToken(Token::REGULAR, "Hello"), - EqualsToken(Token::REGULAR, "World")))); + EXPECT_THAT( + plain_tokenizer->TokenizeAll( + "Hello, World! Hello: World. 
\"Hello\" World?"), + IsOkAndHolds(ElementsAre(EqualsToken(Token::Type::REGULAR, "Hello"), + EqualsToken(Token::Type::REGULAR, "World"), + EqualsToken(Token::Type::REGULAR, "Hello"), + EqualsToken(Token::Type::REGULAR, "World"), + EqualsToken(Token::Type::REGULAR, "Hello"), + EqualsToken(Token::Type::REGULAR, "World")))); // Full-width punctuation marks are filtered out. std::vector<std::string_view> exp_tokens; if (IsCfStringTokenization()) { EXPECT_THAT( plain_tokenizer->TokenizeAll("你好,世界!你好:世界。“你好”世界?"), - IsOkAndHolds(ElementsAre(EqualsToken(Token::REGULAR, "你"), - EqualsToken(Token::REGULAR, "好"), - EqualsToken(Token::REGULAR, "世界"), - EqualsToken(Token::REGULAR, "你"), - EqualsToken(Token::REGULAR, "好"), - EqualsToken(Token::REGULAR, "世界"), - EqualsToken(Token::REGULAR, "你"), - EqualsToken(Token::REGULAR, "好"), - EqualsToken(Token::REGULAR, "世界")))); + IsOkAndHolds(ElementsAre(EqualsToken(Token::Type::REGULAR, "你"), + EqualsToken(Token::Type::REGULAR, "好"), + EqualsToken(Token::Type::REGULAR, "世界"), + EqualsToken(Token::Type::REGULAR, "你"), + EqualsToken(Token::Type::REGULAR, "好"), + EqualsToken(Token::Type::REGULAR, "世界"), + EqualsToken(Token::Type::REGULAR, "你"), + EqualsToken(Token::Type::REGULAR, "好"), + EqualsToken(Token::Type::REGULAR, "世界")))); } else { EXPECT_THAT( plain_tokenizer->TokenizeAll("你好,世界!你好:世界。“你好”世界?"), - IsOkAndHolds(ElementsAre(EqualsToken(Token::REGULAR, "你好"), - EqualsToken(Token::REGULAR, "世界"), - EqualsToken(Token::REGULAR, "你好"), - EqualsToken(Token::REGULAR, "世界"), - EqualsToken(Token::REGULAR, "你好"), - EqualsToken(Token::REGULAR, "世界")))); + IsOkAndHolds(ElementsAre(EqualsToken(Token::Type::REGULAR, "你好"), + EqualsToken(Token::Type::REGULAR, "世界"), + EqualsToken(Token::Type::REGULAR, "你好"), + EqualsToken(Token::Type::REGULAR, "世界"), + EqualsToken(Token::Type::REGULAR, "你好"), + EqualsToken(Token::Type::REGULAR, "世界")))); } } @@ -180,14 +184,16 @@ TEST_F(PlainTokenizerTest, SpecialCharacters) { // Right now we don't have special logic for these characters, just output // them as tokens. 
- EXPECT_THAT(plain_tokenizer->TokenizeAll("1+1"), - IsOkAndHolds(ElementsAre(EqualsToken(Token::REGULAR, "1"), - EqualsToken(Token::REGULAR, "+"), - EqualsToken(Token::REGULAR, "1")))); + EXPECT_THAT( + plain_tokenizer->TokenizeAll("1+1"), + IsOkAndHolds(ElementsAre(EqualsToken(Token::Type::REGULAR, "1"), + EqualsToken(Token::Type::REGULAR, "+"), + EqualsToken(Token::Type::REGULAR, "1")))); - EXPECT_THAT(plain_tokenizer->TokenizeAll("$50"), - IsOkAndHolds(ElementsAre(EqualsToken(Token::REGULAR, "$"), - EqualsToken(Token::REGULAR, "50")))); + EXPECT_THAT( + plain_tokenizer->TokenizeAll("$50"), + IsOkAndHolds(ElementsAre(EqualsToken(Token::Type::REGULAR, "$"), + EqualsToken(Token::Type::REGULAR, "50")))); } TEST_F(PlainTokenizerTest, CJKT) { @@ -203,12 +209,13 @@ TEST_F(PlainTokenizerTest, CJKT) { tokenizer_factory::CreateIndexingTokenizer( StringIndexingConfig::TokenizerType::PLAIN, language_segmenter.get())); - EXPECT_THAT(plain_tokenizer->TokenizeAll("我每天走路去上班。"), - IsOkAndHolds(ElementsAre(EqualsToken(Token::REGULAR, "我"), - EqualsToken(Token::REGULAR, "每天"), - EqualsToken(Token::REGULAR, "走路"), - EqualsToken(Token::REGULAR, "去"), - EqualsToken(Token::REGULAR, "上班")))); + EXPECT_THAT( + plain_tokenizer->TokenizeAll("我每天走路去上班。"), + IsOkAndHolds(ElementsAre(EqualsToken(Token::Type::REGULAR, "我"), + EqualsToken(Token::Type::REGULAR, "每天"), + EqualsToken(Token::Type::REGULAR, "走路"), + EqualsToken(Token::Type::REGULAR, "去"), + EqualsToken(Token::Type::REGULAR, "上班")))); // Japanese options = language_segmenter_factory::SegmenterOptions(ULOC_JAPANESE, jni_cache_.get()); @@ -220,41 +227,44 @@ TEST_F(PlainTokenizerTest, CJKT) { StringIndexingConfig::TokenizerType::PLAIN, language_segmenter.get())); if (IsCfStringTokenization()) { - EXPECT_THAT(plain_tokenizer->TokenizeAll("私は毎日仕事に歩いています。"), - IsOkAndHolds(ElementsAre(EqualsToken(Token::REGULAR, "私"), - EqualsToken(Token::REGULAR, "は"), - EqualsToken(Token::REGULAR, "毎日"), - EqualsToken(Token::REGULAR, "仕事"), - EqualsToken(Token::REGULAR, "に"), - EqualsToken(Token::REGULAR, "歩い"), - EqualsToken(Token::REGULAR, "て"), - EqualsToken(Token::REGULAR, "い"), - EqualsToken(Token::REGULAR, "ます")))); + EXPECT_THAT( + plain_tokenizer->TokenizeAll("私は毎日仕事に歩いています。"), + IsOkAndHolds(ElementsAre(EqualsToken(Token::Type::REGULAR, "私"), + EqualsToken(Token::Type::REGULAR, "は"), + EqualsToken(Token::Type::REGULAR, "毎日"), + EqualsToken(Token::Type::REGULAR, "仕事"), + EqualsToken(Token::Type::REGULAR, "に"), + EqualsToken(Token::Type::REGULAR, "歩い"), + EqualsToken(Token::Type::REGULAR, "て"), + EqualsToken(Token::Type::REGULAR, "い"), + EqualsToken(Token::Type::REGULAR, "ます")))); } else { - EXPECT_THAT(plain_tokenizer->TokenizeAll("私は毎日仕事に歩いています。"), - IsOkAndHolds(ElementsAre(EqualsToken(Token::REGULAR, "私"), - EqualsToken(Token::REGULAR, "は"), - EqualsToken(Token::REGULAR, "毎日"), - EqualsToken(Token::REGULAR, "仕事"), - EqualsToken(Token::REGULAR, "に"), - EqualsToken(Token::REGULAR, "歩"), - EqualsToken(Token::REGULAR, "い"), - EqualsToken(Token::REGULAR, "てい"), - EqualsToken(Token::REGULAR, "ます")))); + EXPECT_THAT( + plain_tokenizer->TokenizeAll("私は毎日仕事に歩いています。"), + IsOkAndHolds(ElementsAre(EqualsToken(Token::Type::REGULAR, "私"), + EqualsToken(Token::Type::REGULAR, "は"), + EqualsToken(Token::Type::REGULAR, "毎日"), + EqualsToken(Token::Type::REGULAR, "仕事"), + EqualsToken(Token::Type::REGULAR, "に"), + EqualsToken(Token::Type::REGULAR, "歩"), + EqualsToken(Token::Type::REGULAR, "い"), + EqualsToken(Token::Type::REGULAR, "てい"), + EqualsToken(Token::Type::REGULAR, "ます")))); } // 
Khmer - EXPECT_THAT(plain_tokenizer->TokenizeAll("ញុំដើរទៅធ្វើការរាល់ថ្ងៃ។"), - IsOkAndHolds(ElementsAre(EqualsToken(Token::REGULAR, "ញុំ"), - EqualsToken(Token::REGULAR, "ដើរទៅ"), - EqualsToken(Token::REGULAR, "ធ្វើការ"), - EqualsToken(Token::REGULAR, "រាល់ថ្ងៃ")))); - // Korean EXPECT_THAT( - plain_tokenizer->TokenizeAll("나는 매일 출근합니다."), - IsOkAndHolds(ElementsAre(EqualsToken(Token::REGULAR, "나는"), - EqualsToken(Token::REGULAR, "매일"), - EqualsToken(Token::REGULAR, "출근합니다")))); + plain_tokenizer->TokenizeAll("ញុំដើរទៅធ្វើការរាល់ថ្ងៃ។"), + IsOkAndHolds(ElementsAre(EqualsToken(Token::Type::REGULAR, "ញុំ"), + EqualsToken(Token::Type::REGULAR, "ដើរទៅ"), + EqualsToken(Token::Type::REGULAR, "ធ្វើការ"), + EqualsToken(Token::Type::REGULAR, "រាល់ថ្ងៃ")))); + // Korean + EXPECT_THAT(plain_tokenizer->TokenizeAll("나는 매일 출근합니다."), + IsOkAndHolds(ElementsAre( + EqualsToken(Token::Type::REGULAR, "나는"), + EqualsToken(Token::Type::REGULAR, "매일"), + EqualsToken(Token::Type::REGULAR, "출근합니다")))); // Thai // DIFFERENCE!! Disagreement over how to segment "ทุกวัน" (iOS groups). @@ -264,19 +274,20 @@ TEST_F(PlainTokenizerTest, CJKT) { std::vector<Token> tokens, plain_tokenizer->TokenizeAll("ฉันเดินไปทำงานทุกวัน")); - EXPECT_THAT(tokens, ElementsAre(EqualsToken(Token::REGULAR, "ฉัน"), - EqualsToken(Token::REGULAR, "เดิน"), - EqualsToken(Token::REGULAR, "ไป"), - EqualsToken(Token::REGULAR, "ทำงาน"), - EqualsToken(Token::REGULAR, "ทุกวัน"))); + EXPECT_THAT(tokens, ElementsAre(EqualsToken(Token::Type::REGULAR, "ฉัน"), + EqualsToken(Token::Type::REGULAR, "เดิน"), + EqualsToken(Token::Type::REGULAR, "ไป"), + EqualsToken(Token::Type::REGULAR, "ทำงาน"), + EqualsToken(Token::Type::REGULAR, "ทุกวัน"))); } else { - EXPECT_THAT(plain_tokenizer->TokenizeAll("ฉันเดินไปทำงานทุกวัน"), - IsOkAndHolds(ElementsAre(EqualsToken(Token::REGULAR, "ฉัน"), - EqualsToken(Token::REGULAR, "เดิน"), - EqualsToken(Token::REGULAR, "ไป"), - EqualsToken(Token::REGULAR, "ทำงาน"), - EqualsToken(Token::REGULAR, "ทุก"), - EqualsToken(Token::REGULAR, "วัน")))); + EXPECT_THAT( + plain_tokenizer->TokenizeAll("ฉันเดินไปทำงานทุกวัน"), + IsOkAndHolds(ElementsAre(EqualsToken(Token::Type::REGULAR, "ฉัน"), + EqualsToken(Token::Type::REGULAR, "เดิน"), + EqualsToken(Token::Type::REGULAR, "ไป"), + EqualsToken(Token::Type::REGULAR, "ทำงาน"), + EqualsToken(Token::Type::REGULAR, "ทุก"), + EqualsToken(Token::Type::REGULAR, "วัน")))); } } @@ -295,7 +306,7 @@ TEST_F(PlainTokenizerTest, ResetToTokenStartingAfterSimple) { auto iterator = plain_tokenizer->Tokenize(kText).ValueOrDie(); EXPECT_TRUE(iterator->ResetToTokenStartingAfter(0)); - EXPECT_THAT(iterator->GetToken(), EqualsToken(Token::REGULAR, "b")); + EXPECT_THAT(iterator->GetToken(), EqualsToken(Token::Type::REGULAR, "b")); EXPECT_FALSE(iterator->ResetToTokenStartingAfter(2)); } @@ -315,7 +326,7 @@ TEST_F(PlainTokenizerTest, ResetToTokenEndingBeforeSimple) { auto iterator = plain_tokenizer->Tokenize(kText).ValueOrDie(); EXPECT_TRUE(iterator->ResetToTokenEndingBefore(2)); - EXPECT_THAT(iterator->GetToken(), EqualsToken(Token::REGULAR, "f")); + EXPECT_THAT(iterator->GetToken(), EqualsToken(Token::Type::REGULAR, "f")); EXPECT_FALSE(iterator->ResetToTokenEndingBefore(0)); } @@ -332,11 +343,12 @@ TEST_F(PlainTokenizerTest, ResetToTokenStartingAfter) { language_segmenter.get())); constexpr std::string_view kText = " foo . bar baz.. 
bat "; - EXPECT_THAT(plain_tokenizer->TokenizeAll(kText), - IsOkAndHolds(ElementsAre(EqualsToken(Token::REGULAR, "foo"), - EqualsToken(Token::REGULAR, "bar"), - EqualsToken(Token::REGULAR, "baz"), - EqualsToken(Token::REGULAR, "bat")))); + EXPECT_THAT( + plain_tokenizer->TokenizeAll(kText), + IsOkAndHolds(ElementsAre(EqualsToken(Token::Type::REGULAR, "foo"), + EqualsToken(Token::Type::REGULAR, "bar"), + EqualsToken(Token::Type::REGULAR, "baz"), + EqualsToken(Token::Type::REGULAR, "bat")))); std::vector<std::string> expected_text = { "foo", // 0: " foo . bar" "bar", // 1: "foo . bar " @@ -359,12 +371,12 @@ TEST_F(PlainTokenizerTest, ResetToTokenStartingAfter) { auto iterator = plain_tokenizer->Tokenize(kText).ValueOrDie(); EXPECT_TRUE(iterator->Advance()); - EXPECT_THAT(iterator->GetToken(), EqualsToken(Token::REGULAR, "foo")); + EXPECT_THAT(iterator->GetToken(), EqualsToken(Token::Type::REGULAR, "foo")); for (int i = 0; i < kText.length(); ++i) { if (i < expected_text.size()) { EXPECT_TRUE(iterator->ResetToTokenStartingAfter(i)); EXPECT_THAT(iterator->GetToken(), - EqualsToken(Token::REGULAR, expected_text[i])); + EqualsToken(Token::Type::REGULAR, expected_text[i])); } else { EXPECT_FALSE(iterator->ResetToTokenStartingAfter(i)); } @@ -383,11 +395,12 @@ TEST_F(PlainTokenizerTest, ResetToTokenEndingBefore) { language_segmenter.get())); constexpr std::string_view kText = " foo . bar baz.. bat "; - EXPECT_THAT(plain_tokenizer->TokenizeAll(kText), - IsOkAndHolds(ElementsAre(EqualsToken(Token::REGULAR, "foo"), - EqualsToken(Token::REGULAR, "bar"), - EqualsToken(Token::REGULAR, "baz"), - EqualsToken(Token::REGULAR, "bat")))); + EXPECT_THAT( + plain_tokenizer->TokenizeAll(kText), + IsOkAndHolds(ElementsAre(EqualsToken(Token::Type::REGULAR, "foo"), + EqualsToken(Token::Type::REGULAR, "bar"), + EqualsToken(Token::Type::REGULAR, "baz"), + EqualsToken(Token::Type::REGULAR, "bat")))); std::vector<std::string> expected_text = { "bat", // 20: "baz.. bat " "baz", // 19: " baz.. bat" @@ -410,13 +423,14 @@ TEST_F(PlainTokenizerTest, ResetToTokenEndingBefore) { auto iterator = plain_tokenizer->Tokenize(kText).ValueOrDie(); EXPECT_TRUE(iterator->Advance()); - EXPECT_THAT(iterator->GetToken(), EqualsToken(Token::REGULAR, "foo")); + EXPECT_THAT(iterator->GetToken(), EqualsToken(Token::Type::REGULAR, "foo")); for (int i = kText.length() - 1; i >= 0; --i) { int expected_index = kText.length() - 1 - i; if (expected_index < expected_text.size()) { EXPECT_TRUE(iterator->ResetToTokenEndingBefore(i)); - EXPECT_THAT(iterator->GetToken(), - EqualsToken(Token::REGULAR, expected_text[expected_index])); + EXPECT_THAT( + iterator->GetToken(), + EqualsToken(Token::Type::REGULAR, expected_text[expected_index])); } else { EXPECT_FALSE(iterator->ResetToTokenEndingBefore(i)); } diff --git a/icing/tokenization/raw-query-tokenizer.cc b/icing/tokenization/raw-query-tokenizer.cc index 2d461ee..8a27103 100644 --- a/icing/tokenization/raw-query-tokenizer.cc +++ b/icing/tokenization/raw-query-tokenizer.cc @@ -422,7 +422,7 @@ std::pair<TermType, std::string_view> GetTerm(std::string_view text, // and [(cat OR)]. This helps assert extra rule 3: "OR" is ignored if there's no // valid token on its right. 
void RemoveLastTokenIfOrOperator(std::vector<Token>* tokens) { - if (!tokens->empty() && tokens->back().type == Token::QUERY_OR) { + if (!tokens->empty() && tokens->back().type == Token::Type::QUERY_OR) { tokens->pop_back(); } } @@ -436,11 +436,11 @@ libtextclassifier3::Status OutputOrOperatorToken(std::vector<Token>* tokens) { } Token::Type last_token_type = tokens->back().type; switch (last_token_type) { - case Token::REGULAR: - case Token::QUERY_RIGHT_PARENTHESES: - tokens->emplace_back(Token::QUERY_OR); + case Token::Type::REGULAR: + case Token::Type::QUERY_RIGHT_PARENTHESES: + tokens->emplace_back(Token::Type::QUERY_OR); break; - case Token::QUERY_OR: + case Token::Type::QUERY_OR: // Ignores "OR" because there's already an "OR", e.g. "term1 OR OR term2" break; default: @@ -481,21 +481,21 @@ libtextclassifier3::Status OutputToken(State new_state, GetErrorMessage(ERROR_NON_ASCII_AS_PROPERTY_NAME)); } } - tokens->emplace_back(Token::QUERY_PROPERTY, current_term); + tokens->emplace_back(Token::Type::QUERY_PROPERTY, current_term); } else { - tokens->emplace_back(Token::REGULAR, current_term); + tokens->emplace_back(Token::Type::REGULAR, current_term); } break; case LEFT_PARENTHESES: - tokens->emplace_back(Token::QUERY_LEFT_PARENTHESES); + tokens->emplace_back(Token::Type::QUERY_LEFT_PARENTHESES); break; case RIGHT_PARENTHESES: // Ignores "OR" if it's followed by right parentheses. RemoveLastTokenIfOrOperator(tokens); - tokens->emplace_back(Token::QUERY_RIGHT_PARENTHESES); + tokens->emplace_back(Token::Type::QUERY_RIGHT_PARENTHESES); break; case EXCLUSION_OPERATOR: - tokens->emplace_back(Token::QUERY_EXCLUSION); + tokens->emplace_back(Token::Type::QUERY_EXCLUSION); break; case OR_OPERATOR: return OutputOrOperatorToken(tokens); @@ -648,7 +648,7 @@ class RawQueryTokenIterator : public Tokenizer::Iterator { Token GetToken() const override { if (current_ < 0 || current_ >= tokens_.size()) { - return Token(Token::INVALID); + return Token(Token::Type::INVALID); } return tokens_.at(current_); } diff --git a/icing/tokenization/raw-query-tokenizer_test.cc b/icing/tokenization/raw-query-tokenizer_test.cc index 500efa0..c6d981d 100644 --- a/icing/tokenization/raw-query-tokenizer_test.cc +++ b/icing/tokenization/raw-query-tokenizer_test.cc @@ -16,9 +16,9 @@ #include "gmock/gmock.h" #include "gtest/gtest.h" -#include "icing/helpers/icu/icu-data-file-helper.h" #include "icing/portable/platform.h" #include "icing/testing/common-matchers.h" +#include "icing/testing/icu-data-file-helper.h" #include "icing/testing/test-data.h" #include "icing/tokenization/language-segmenter-factory.h" #include "icing/tokenization/tokenizer-factory.h" @@ -59,13 +59,15 @@ TEST_F(RawQueryTokenizerTest, Simple) { tokenizer_factory::CreateQueryTokenizer(tokenizer_factory::RAW_QUERY, language_segmenter.get())); - EXPECT_THAT(raw_query_tokenizer->TokenizeAll("Hello World!"), - IsOkAndHolds(ElementsAre(EqualsToken(Token::REGULAR, "Hello"), - EqualsToken(Token::REGULAR, "World")))); + EXPECT_THAT( + raw_query_tokenizer->TokenizeAll("Hello World!"), + IsOkAndHolds(ElementsAre(EqualsToken(Token::Type::REGULAR, "Hello"), + EqualsToken(Token::Type::REGULAR, "World")))); - EXPECT_THAT(raw_query_tokenizer->TokenizeAll("hElLo WORLD"), - IsOkAndHolds(ElementsAre(EqualsToken(Token::REGULAR, "hElLo"), - EqualsToken(Token::REGULAR, "WORLD")))); + EXPECT_THAT( + raw_query_tokenizer->TokenizeAll("hElLo WORLD"), + IsOkAndHolds(ElementsAre(EqualsToken(Token::Type::REGULAR, "hElLo"), + EqualsToken(Token::Type::REGULAR, "WORLD")))); } 
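A note on the Token::Type migration running through these hunks: because Token::Type is now an enum class, enumerators must be fully qualified and no longer convert to int implicitly. A minimal sketch of the resulting call-site pattern (illustrative only, not code from this change):

// Construct a token with the scoped enum, as plain-tokenizer.cc now does.
Token token(Token::Type::REGULAR, "hello");
if (token.type != Token::Type::REGULAR) {
  // Streaming the type to a log now needs an explicit cast, since enum
  // class values have no implicit conversion to int.
  ICING_LOG(WARNING) << "Unexpected token type: "
                     << static_cast<int>(token.type);
}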
 
 TEST_F(RawQueryTokenizerTest, Parentheses) {
@@ -80,82 +82,82 @@ TEST_F(RawQueryTokenizerTest, Parentheses) {
 
   EXPECT_THAT(raw_query_tokenizer->TokenizeAll("()"),
               IsOkAndHolds(ElementsAre(
-                  EqualsToken(Token::QUERY_LEFT_PARENTHESES, ""),
-                  EqualsToken(Token::QUERY_RIGHT_PARENTHESES, ""))));
+                  EqualsToken(Token::Type::QUERY_LEFT_PARENTHESES, ""),
+                  EqualsToken(Token::Type::QUERY_RIGHT_PARENTHESES, ""))));
 
   EXPECT_THAT(raw_query_tokenizer->TokenizeAll("( )"),
               IsOkAndHolds(ElementsAre(
-                  EqualsToken(Token::QUERY_LEFT_PARENTHESES, ""),
-                  EqualsToken(Token::QUERY_RIGHT_PARENTHESES, ""))));
+                  EqualsToken(Token::Type::QUERY_LEFT_PARENTHESES, ""),
+                  EqualsToken(Token::Type::QUERY_RIGHT_PARENTHESES, ""))));
 
   EXPECT_THAT(raw_query_tokenizer->TokenizeAll("(term1 term2)"),
               IsOkAndHolds(ElementsAre(
-                  EqualsToken(Token::QUERY_LEFT_PARENTHESES, ""),
-                  EqualsToken(Token::REGULAR, "term1"),
-                  EqualsToken(Token::REGULAR, "term2"),
-                  EqualsToken(Token::QUERY_RIGHT_PARENTHESES, ""))));
+                  EqualsToken(Token::Type::QUERY_LEFT_PARENTHESES, ""),
+                  EqualsToken(Token::Type::REGULAR, "term1"),
+                  EqualsToken(Token::Type::REGULAR, "term2"),
+                  EqualsToken(Token::Type::QUERY_RIGHT_PARENTHESES, ""))));
 
   EXPECT_THAT(raw_query_tokenizer->TokenizeAll("((term1 term2) (term3 term4))"),
               IsOkAndHolds(ElementsAre(
-                  EqualsToken(Token::QUERY_LEFT_PARENTHESES, ""),
-                  EqualsToken(Token::QUERY_LEFT_PARENTHESES, ""),
-                  EqualsToken(Token::REGULAR, "term1"),
-                  EqualsToken(Token::REGULAR, "term2"),
-                  EqualsToken(Token::QUERY_RIGHT_PARENTHESES, ""),
-                  EqualsToken(Token::QUERY_LEFT_PARENTHESES, ""),
-                  EqualsToken(Token::REGULAR, "term3"),
-                  EqualsToken(Token::REGULAR, "term4"),
-                  EqualsToken(Token::QUERY_RIGHT_PARENTHESES, ""),
-                  EqualsToken(Token::QUERY_RIGHT_PARENTHESES, ""))));
+                  EqualsToken(Token::Type::QUERY_LEFT_PARENTHESES, ""),
+                  EqualsToken(Token::Type::QUERY_LEFT_PARENTHESES, ""),
+                  EqualsToken(Token::Type::REGULAR, "term1"),
+                  EqualsToken(Token::Type::REGULAR, "term2"),
+                  EqualsToken(Token::Type::QUERY_RIGHT_PARENTHESES, ""),
+                  EqualsToken(Token::Type::QUERY_LEFT_PARENTHESES, ""),
+                  EqualsToken(Token::Type::REGULAR, "term3"),
+                  EqualsToken(Token::Type::REGULAR, "term4"),
+                  EqualsToken(Token::Type::QUERY_RIGHT_PARENTHESES, ""),
+                  EqualsToken(Token::Type::QUERY_RIGHT_PARENTHESES, ""))));
 
   EXPECT_THAT(raw_query_tokenizer->TokenizeAll("term1(term2)"),
               IsOkAndHolds(ElementsAre(
-                  EqualsToken(Token::REGULAR, "term1"),
-                  EqualsToken(Token::QUERY_LEFT_PARENTHESES, ""),
-                  EqualsToken(Token::REGULAR, "term2"),
-                  EqualsToken(Token::QUERY_RIGHT_PARENTHESES, ""))));
+                  EqualsToken(Token::Type::REGULAR, "term1"),
+                  EqualsToken(Token::Type::QUERY_LEFT_PARENTHESES, ""),
+                  EqualsToken(Token::Type::REGULAR, "term2"),
+                  EqualsToken(Token::Type::QUERY_RIGHT_PARENTHESES, ""))));
 
-  EXPECT_THAT(
-      raw_query_tokenizer->TokenizeAll("(term1)term2"),
-      IsOkAndHolds(ElementsAre(EqualsToken(Token::QUERY_LEFT_PARENTHESES, ""),
-                               EqualsToken(Token::REGULAR, "term1"),
-                               EqualsToken(Token::QUERY_RIGHT_PARENTHESES, ""),
-                               EqualsToken(Token::REGULAR, "term2"))));
+  EXPECT_THAT(raw_query_tokenizer->TokenizeAll("(term1)term2"),
+              IsOkAndHolds(ElementsAre(
+                  EqualsToken(Token::Type::QUERY_LEFT_PARENTHESES, ""),
+                  EqualsToken(Token::Type::REGULAR, "term1"),
+                  EqualsToken(Token::Type::QUERY_RIGHT_PARENTHESES, ""),
+                  EqualsToken(Token::Type::REGULAR, "term2"))));
 
   EXPECT_THAT(raw_query_tokenizer->TokenizeAll("(term1)(term2)"),
               IsOkAndHolds(ElementsAre(
-                  EqualsToken(Token::QUERY_LEFT_PARENTHESES, ""),
-                  EqualsToken(Token::REGULAR, "term1"),
-                  EqualsToken(Token::QUERY_RIGHT_PARENTHESES, ""),
-                  EqualsToken(Token::QUERY_LEFT_PARENTHESES, ""),
-                  EqualsToken(Token::REGULAR, "term2"),
-                  EqualsToken(Token::QUERY_RIGHT_PARENTHESES, ""))));
-
-  EXPECT_THAT(
-      raw_query_tokenizer->TokenizeAll("(term1)-term2"),
-      IsOkAndHolds(ElementsAre(EqualsToken(Token::QUERY_LEFT_PARENTHESES, ""),
-                               EqualsToken(Token::REGULAR, "term1"),
-                               EqualsToken(Token::QUERY_RIGHT_PARENTHESES, ""),
-                               EqualsToken(Token::QUERY_EXCLUSION, ""),
-                               EqualsToken(Token::REGULAR, "term2"))));
+                  EqualsToken(Token::Type::QUERY_LEFT_PARENTHESES, ""),
+                  EqualsToken(Token::Type::REGULAR, "term1"),
+                  EqualsToken(Token::Type::QUERY_RIGHT_PARENTHESES, ""),
+                  EqualsToken(Token::Type::QUERY_LEFT_PARENTHESES, ""),
+                  EqualsToken(Token::Type::REGULAR, "term2"),
+                  EqualsToken(Token::Type::QUERY_RIGHT_PARENTHESES, ""))));
+
+  EXPECT_THAT(raw_query_tokenizer->TokenizeAll("(term1)-term2"),
+              IsOkAndHolds(ElementsAre(
+                  EqualsToken(Token::Type::QUERY_LEFT_PARENTHESES, ""),
+                  EqualsToken(Token::Type::REGULAR, "term1"),
+                  EqualsToken(Token::Type::QUERY_RIGHT_PARENTHESES, ""),
+                  EqualsToken(Token::Type::QUERY_EXCLUSION, ""),
+                  EqualsToken(Token::Type::REGULAR, "term2"))));
 
-  EXPECT_THAT(
-      raw_query_tokenizer->TokenizeAll("(term1)OR term2"),
-      IsOkAndHolds(ElementsAre(EqualsToken(Token::QUERY_LEFT_PARENTHESES, ""),
-                               EqualsToken(Token::REGULAR, "term1"),
-                               EqualsToken(Token::QUERY_RIGHT_PARENTHESES, ""),
-                               EqualsToken(Token::QUERY_OR, ""),
-                               EqualsToken(Token::REGULAR, "term2"))));
+  EXPECT_THAT(raw_query_tokenizer->TokenizeAll("(term1)OR term2"),
+              IsOkAndHolds(ElementsAre(
+                  EqualsToken(Token::Type::QUERY_LEFT_PARENTHESES, ""),
+                  EqualsToken(Token::Type::REGULAR, "term1"),
+                  EqualsToken(Token::Type::QUERY_RIGHT_PARENTHESES, ""),
+                  EqualsToken(Token::Type::QUERY_OR, ""),
+                  EqualsToken(Token::Type::REGULAR, "term2"))));
 
   EXPECT_THAT(raw_query_tokenizer->TokenizeAll("(term1)OR(term2)"),
               IsOkAndHolds(ElementsAre(
-                  EqualsToken(Token::QUERY_LEFT_PARENTHESES, ""),
-                  EqualsToken(Token::REGULAR, "term1"),
-                  EqualsToken(Token::QUERY_RIGHT_PARENTHESES, ""),
-                  EqualsToken(Token::QUERY_OR, ""),
-                  EqualsToken(Token::QUERY_LEFT_PARENTHESES, ""),
-                  EqualsToken(Token::REGULAR, "term2"),
-                  EqualsToken(Token::QUERY_RIGHT_PARENTHESES, ""))));
+                  EqualsToken(Token::Type::QUERY_LEFT_PARENTHESES, ""),
+                  EqualsToken(Token::Type::REGULAR, "term1"),
+                  EqualsToken(Token::Type::QUERY_RIGHT_PARENTHESES, ""),
+                  EqualsToken(Token::Type::QUERY_OR, ""),
+                  EqualsToken(Token::Type::QUERY_LEFT_PARENTHESES, ""),
+                  EqualsToken(Token::Type::REGULAR, "term2"),
+                  EqualsToken(Token::Type::QUERY_RIGHT_PARENTHESES, ""))));
 
   EXPECT_THAT(raw_query_tokenizer->TokenizeAll("(term1):term2"),
               StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT,
@@ -180,44 +182,49 @@ TEST_F(RawQueryTokenizerTest, Exclustion) {
       tokenizer_factory::CreateQueryTokenizer(tokenizer_factory::RAW_QUERY,
                                               language_segmenter.get()));
 
-  EXPECT_THAT(raw_query_tokenizer->TokenizeAll("-term1"),
-              IsOkAndHolds(ElementsAre(EqualsToken(Token::QUERY_EXCLUSION, ""),
-                                       EqualsToken(Token::REGULAR, "term1"))));
+  EXPECT_THAT(
+      raw_query_tokenizer->TokenizeAll("-term1"),
+      IsOkAndHolds(ElementsAre(EqualsToken(Token::Type::QUERY_EXCLUSION, ""),
+                               EqualsToken(Token::Type::REGULAR, "term1"))));
 
   EXPECT_THAT(raw_query_tokenizer->TokenizeAll("(-term1)"),
               IsOkAndHolds(ElementsAre(
-                  EqualsToken(Token::QUERY_LEFT_PARENTHESES, ""),
-                  EqualsToken(Token::QUERY_EXCLUSION, ""),
-                  EqualsToken(Token::REGULAR, "term1"),
-                  EqualsToken(Token::QUERY_RIGHT_PARENTHESES, ""))));
+                  EqualsToken(Token::Type::QUERY_LEFT_PARENTHESES, ""),
+                  EqualsToken(Token::Type::QUERY_EXCLUSION, ""),
+                  EqualsToken(Token::Type::REGULAR, "term1"),
+                  EqualsToken(Token::Type::QUERY_RIGHT_PARENTHESES, ""))));
 
   // Exclusion operator is ignored
-  EXPECT_THAT(raw_query_tokenizer->TokenizeAll("- term1"),
-              IsOkAndHolds(ElementsAre(EqualsToken(Token::REGULAR, "term1"))));
+  EXPECT_THAT(
+      raw_query_tokenizer->TokenizeAll("- term1"),
+      IsOkAndHolds(ElementsAre(EqualsToken(Token::Type::REGULAR, "term1"))));
 
   // Exclusion operator is ignored
-  EXPECT_THAT(raw_query_tokenizer->TokenizeAll("term1- term2"),
-              IsOkAndHolds(ElementsAre(EqualsToken(Token::REGULAR, "term1"),
-                                       EqualsToken(Token::REGULAR, "term2"))));
+  EXPECT_THAT(
+      raw_query_tokenizer->TokenizeAll("term1- term2"),
+      IsOkAndHolds(ElementsAre(EqualsToken(Token::Type::REGULAR, "term1"),
+                               EqualsToken(Token::Type::REGULAR, "term2"))));
 
   // Exclusion operator is ignored
   EXPECT_THAT(raw_query_tokenizer->TokenizeAll("(term1 -)"),
               IsOkAndHolds(ElementsAre(
-                  EqualsToken(Token::QUERY_LEFT_PARENTHESES, ""),
-                  EqualsToken(Token::REGULAR, "term1"),
-                  EqualsToken(Token::QUERY_RIGHT_PARENTHESES, ""))));
+                  EqualsToken(Token::Type::QUERY_LEFT_PARENTHESES, ""),
+                  EqualsToken(Token::Type::REGULAR, "term1"),
+                  EqualsToken(Token::Type::QUERY_RIGHT_PARENTHESES, ""))));
 
   // First exclusion operator is ignored
-  EXPECT_THAT(raw_query_tokenizer->TokenizeAll("--term1"),
-              IsOkAndHolds(ElementsAre(EqualsToken(Token::QUERY_EXCLUSION, ""),
-                                       EqualsToken(Token::REGULAR, "term1"))));
+  EXPECT_THAT(
+      raw_query_tokenizer->TokenizeAll("--term1"),
+      IsOkAndHolds(ElementsAre(EqualsToken(Token::Type::QUERY_EXCLUSION, ""),
+                               EqualsToken(Token::Type::REGULAR, "term1"))));
 
   // First "-" is exclusion operator, second is not and will be discarded.
   // In other words, exclusion only applies to the term right after it.
- EXPECT_THAT(raw_query_tokenizer->TokenizeAll("-term1-term2"), - IsOkAndHolds(ElementsAre(EqualsToken(Token::QUERY_EXCLUSION, ""), - EqualsToken(Token::REGULAR, "term1"), - EqualsToken(Token::REGULAR, "term2")))); + EXPECT_THAT( + raw_query_tokenizer->TokenizeAll("-term1-term2"), + IsOkAndHolds(ElementsAre(EqualsToken(Token::Type::QUERY_EXCLUSION, ""), + EqualsToken(Token::Type::REGULAR, "term1"), + EqualsToken(Token::Type::REGULAR, "term2")))); EXPECT_THAT(raw_query_tokenizer->TokenizeAll("-(term1)"), StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT, @@ -249,73 +256,75 @@ TEST_F(RawQueryTokenizerTest, PropertyRestriction) { tokenizer_factory::CreateQueryTokenizer(tokenizer_factory::RAW_QUERY, language_segmenter.get())); - EXPECT_THAT( - raw_query_tokenizer->TokenizeAll("property1:term1"), - IsOkAndHolds(ElementsAre(EqualsToken(Token::QUERY_PROPERTY, "property1"), - EqualsToken(Token::REGULAR, "term1")))); + EXPECT_THAT(raw_query_tokenizer->TokenizeAll("property1:term1"), + IsOkAndHolds(ElementsAre( + EqualsToken(Token::Type::QUERY_PROPERTY, "property1"), + EqualsToken(Token::Type::REGULAR, "term1")))); EXPECT_THAT(raw_query_tokenizer->TokenizeAll("(property1:term1)"), IsOkAndHolds(ElementsAre( - EqualsToken(Token::QUERY_LEFT_PARENTHESES, ""), - EqualsToken(Token::QUERY_PROPERTY, "property1"), - EqualsToken(Token::REGULAR, "term1"), - EqualsToken(Token::QUERY_RIGHT_PARENTHESES, "")))); + EqualsToken(Token::Type::QUERY_LEFT_PARENTHESES, ""), + EqualsToken(Token::Type::QUERY_PROPERTY, "property1"), + EqualsToken(Token::Type::REGULAR, "term1"), + EqualsToken(Token::Type::QUERY_RIGHT_PARENTHESES, "")))); // Colon is ignored - EXPECT_THAT(raw_query_tokenizer->TokenizeAll(":term1"), - IsOkAndHolds(ElementsAre(EqualsToken(Token::REGULAR, "term1")))); + EXPECT_THAT( + raw_query_tokenizer->TokenizeAll(":term1"), + IsOkAndHolds(ElementsAre(EqualsToken(Token::Type::REGULAR, "term1")))); // Colon is ignored EXPECT_THAT(raw_query_tokenizer->TokenizeAll("(:term1)"), IsOkAndHolds(ElementsAre( - EqualsToken(Token::QUERY_LEFT_PARENTHESES, ""), - EqualsToken(Token::REGULAR, "term1"), - EqualsToken(Token::QUERY_RIGHT_PARENTHESES, "")))); + EqualsToken(Token::Type::QUERY_LEFT_PARENTHESES, ""), + EqualsToken(Token::Type::REGULAR, "term1"), + EqualsToken(Token::Type::QUERY_RIGHT_PARENTHESES, "")))); // Colon is ignored - EXPECT_THAT(raw_query_tokenizer->TokenizeAll("term1:"), - IsOkAndHolds(ElementsAre(EqualsToken(Token::REGULAR, "term1")))); + EXPECT_THAT( + raw_query_tokenizer->TokenizeAll("term1:"), + IsOkAndHolds(ElementsAre(EqualsToken(Token::Type::REGULAR, "term1")))); // property name can be a path EXPECT_THAT(raw_query_tokenizer->TokenizeAll("email.title:hello"), - IsOkAndHolds( - ElementsAre(EqualsToken(Token::QUERY_PROPERTY, "email.title"), - EqualsToken(Token::REGULAR, "hello")))); + IsOkAndHolds(ElementsAre( + EqualsToken(Token::Type::QUERY_PROPERTY, "email.title"), + EqualsToken(Token::Type::REGULAR, "hello")))); // The first colon ":" triggers property restriction, the second colon is used // as a word connector per ICU's rule // (https://unicode.org/reports/tr29/#Word_Boundaries). 
- EXPECT_THAT( - raw_query_tokenizer->TokenizeAll("property:foo:bar"), - IsOkAndHolds(ElementsAre(EqualsToken(Token::QUERY_PROPERTY, "property"), - EqualsToken(Token::REGULAR, "foo:bar")))); + EXPECT_THAT(raw_query_tokenizer->TokenizeAll("property:foo:bar"), + IsOkAndHolds(ElementsAre( + EqualsToken(Token::Type::QUERY_PROPERTY, "property"), + EqualsToken(Token::Type::REGULAR, "foo:bar")))); // Property restriction only applies to the term right after it. // Note: "term1:term2" is not a term but 2 terms because word connectors // don't apply to numbers and alphabets. - EXPECT_THAT( - raw_query_tokenizer->TokenizeAll("property1:term1:term2"), - IsOkAndHolds(ElementsAre(EqualsToken(Token::QUERY_PROPERTY, "property1"), - EqualsToken(Token::REGULAR, "term1"), - EqualsToken(Token::REGULAR, "term2")))); + EXPECT_THAT(raw_query_tokenizer->TokenizeAll("property1:term1:term2"), + IsOkAndHolds(ElementsAre( + EqualsToken(Token::Type::QUERY_PROPERTY, "property1"), + EqualsToken(Token::Type::REGULAR, "term1"), + EqualsToken(Token::Type::REGULAR, "term2")))); - EXPECT_THAT( - raw_query_tokenizer->TokenizeAll("property1:今天:天气"), - IsOkAndHolds(ElementsAre(EqualsToken(Token::QUERY_PROPERTY, "property1"), - EqualsToken(Token::REGULAR, "今天"), - EqualsToken(Token::REGULAR, "天气")))); + EXPECT_THAT(raw_query_tokenizer->TokenizeAll("property1:今天:天气"), + IsOkAndHolds(ElementsAre( + EqualsToken(Token::Type::QUERY_PROPERTY, "property1"), + EqualsToken(Token::Type::REGULAR, "今天"), + EqualsToken(Token::Type::REGULAR, "天气")))); - EXPECT_THAT( - raw_query_tokenizer->TokenizeAll("property1:term1-"), - IsOkAndHolds(ElementsAre(EqualsToken(Token::QUERY_PROPERTY, "property1"), - EqualsToken(Token::REGULAR, "term1")))); + EXPECT_THAT(raw_query_tokenizer->TokenizeAll("property1:term1-"), + IsOkAndHolds(ElementsAre( + EqualsToken(Token::Type::QUERY_PROPERTY, "property1"), + EqualsToken(Token::Type::REGULAR, "term1")))); // Multiple continuous colons will still be recognized as a property // restriction operator - EXPECT_THAT( - raw_query_tokenizer->TokenizeAll("property1::term1"), - IsOkAndHolds(ElementsAre(EqualsToken(Token::QUERY_PROPERTY, "property1"), - EqualsToken(Token::REGULAR, "term1")))); + EXPECT_THAT(raw_query_tokenizer->TokenizeAll("property1::term1"), + IsOkAndHolds(ElementsAre( + EqualsToken(Token::Type::QUERY_PROPERTY, "property1"), + EqualsToken(Token::Type::REGULAR, "term1")))); EXPECT_THAT( raw_query_tokenizer->TokenizeAll("property1:(term1)"), @@ -345,105 +354,109 @@ TEST_F(RawQueryTokenizerTest, OR) { tokenizer_factory::CreateQueryTokenizer(tokenizer_factory::RAW_QUERY, language_segmenter.get())); - EXPECT_THAT(raw_query_tokenizer->TokenizeAll("term1 OR term2"), - IsOkAndHolds(ElementsAre(EqualsToken(Token::REGULAR, "term1"), - EqualsToken(Token::QUERY_OR, ""), - EqualsToken(Token::REGULAR, "term2")))); + EXPECT_THAT( + raw_query_tokenizer->TokenizeAll("term1 OR term2"), + IsOkAndHolds(ElementsAre(EqualsToken(Token::Type::REGULAR, "term1"), + EqualsToken(Token::Type::QUERY_OR, ""), + EqualsToken(Token::Type::REGULAR, "term2")))); // Two continuous "OR"s are treated as one - EXPECT_THAT(raw_query_tokenizer->TokenizeAll("term1 OR OR term2"), - IsOkAndHolds(ElementsAre(EqualsToken(Token::REGULAR, "term1"), - EqualsToken(Token::QUERY_OR, ""), - EqualsToken(Token::REGULAR, "term2")))); - EXPECT_THAT( - raw_query_tokenizer->TokenizeAll("(term1) OR term2"), - IsOkAndHolds(ElementsAre(EqualsToken(Token::QUERY_LEFT_PARENTHESES, ""), - EqualsToken(Token::REGULAR, "term1"), - 
-                               EqualsToken(Token::QUERY_RIGHT_PARENTHESES, ""),
-                               EqualsToken(Token::QUERY_OR, ""),
-                               EqualsToken(Token::REGULAR, "term2"))));
+      raw_query_tokenizer->TokenizeAll("term1 OR OR term2"),
+      IsOkAndHolds(ElementsAre(EqualsToken(Token::Type::REGULAR, "term1"),
+                               EqualsToken(Token::Type::QUERY_OR, ""),
+                               EqualsToken(Token::Type::REGULAR, "term2"))));
+
+  EXPECT_THAT(raw_query_tokenizer->TokenizeAll("(term1) OR term2"),
+              IsOkAndHolds(ElementsAre(
+                  EqualsToken(Token::Type::QUERY_LEFT_PARENTHESES, ""),
+                  EqualsToken(Token::Type::REGULAR, "term1"),
+                  EqualsToken(Token::Type::QUERY_RIGHT_PARENTHESES, ""),
+                  EqualsToken(Token::Type::QUERY_OR, ""),
+                  EqualsToken(Token::Type::REGULAR, "term2"))));
 
   EXPECT_THAT(raw_query_tokenizer->TokenizeAll("term1 OR (term2)"),
               IsOkAndHolds(ElementsAre(
-                  EqualsToken(Token::REGULAR, "term1"),
-                  EqualsToken(Token::QUERY_OR, ""),
-                  EqualsToken(Token::QUERY_LEFT_PARENTHESES, ""),
-                  EqualsToken(Token::REGULAR, "term2"),
-                  EqualsToken(Token::QUERY_RIGHT_PARENTHESES, ""))));
+                  EqualsToken(Token::Type::REGULAR, "term1"),
+                  EqualsToken(Token::Type::QUERY_OR, ""),
+                  EqualsToken(Token::Type::QUERY_LEFT_PARENTHESES, ""),
+                  EqualsToken(Token::Type::REGULAR, "term2"),
+                  EqualsToken(Token::Type::QUERY_RIGHT_PARENTHESES, ""))));
 
   EXPECT_THAT(raw_query_tokenizer->TokenizeAll("((term1) OR (term2))"),
               IsOkAndHolds(ElementsAre(
-                  EqualsToken(Token::QUERY_LEFT_PARENTHESES, ""),
-                  EqualsToken(Token::QUERY_LEFT_PARENTHESES, ""),
-                  EqualsToken(Token::REGULAR, "term1"),
-                  EqualsToken(Token::QUERY_RIGHT_PARENTHESES, ""),
-                  EqualsToken(Token::QUERY_OR, ""),
-                  EqualsToken(Token::QUERY_LEFT_PARENTHESES, ""),
-                  EqualsToken(Token::REGULAR, "term2"),
-                  EqualsToken(Token::QUERY_RIGHT_PARENTHESES, ""),
-                  EqualsToken(Token::QUERY_RIGHT_PARENTHESES, ""))));
+                  EqualsToken(Token::Type::QUERY_LEFT_PARENTHESES, ""),
+                  EqualsToken(Token::Type::QUERY_LEFT_PARENTHESES, ""),
+                  EqualsToken(Token::Type::REGULAR, "term1"),
+                  EqualsToken(Token::Type::QUERY_RIGHT_PARENTHESES, ""),
+                  EqualsToken(Token::Type::QUERY_OR, ""),
+                  EqualsToken(Token::Type::QUERY_LEFT_PARENTHESES, ""),
+                  EqualsToken(Token::Type::REGULAR, "term2"),
+                  EqualsToken(Token::Type::QUERY_RIGHT_PARENTHESES, ""),
+                  EqualsToken(Token::Type::QUERY_RIGHT_PARENTHESES, ""))));
 
   // Only "OR" (all in uppercase) is the operator
   EXPECT_THAT(
       raw_query_tokenizer->TokenizeAll("term1 or term2 Or term3 oR term4"),
-      IsOkAndHolds(ElementsAre(EqualsToken(Token::REGULAR, "term1"),
-                               EqualsToken(Token::REGULAR, "or"),
-                               EqualsToken(Token::REGULAR, "term2"),
-                               EqualsToken(Token::REGULAR, "Or"),
-                               EqualsToken(Token::REGULAR, "term3"),
-                               EqualsToken(Token::REGULAR, "oR"),
-                               EqualsToken(Token::REGULAR, "term4"))));
+      IsOkAndHolds(ElementsAre(EqualsToken(Token::Type::REGULAR, "term1"),
+                               EqualsToken(Token::Type::REGULAR, "or"),
+                               EqualsToken(Token::Type::REGULAR, "term2"),
+                               EqualsToken(Token::Type::REGULAR, "Or"),
+                               EqualsToken(Token::Type::REGULAR, "term3"),
+                               EqualsToken(Token::Type::REGULAR, "oR"),
+                               EqualsToken(Token::Type::REGULAR, "term4"))));
 
   // "OR" is ignored
-  EXPECT_THAT(raw_query_tokenizer->TokenizeAll("OR term1"),
-              IsOkAndHolds(ElementsAre(EqualsToken(Token::REGULAR, "term1"))));
+  EXPECT_THAT(
+      raw_query_tokenizer->TokenizeAll("OR term1"),
+      IsOkAndHolds(ElementsAre(EqualsToken(Token::Type::REGULAR, "term1"))));
 
   // "OR" is ignored
-  EXPECT_THAT(raw_query_tokenizer->TokenizeAll("term1 OR"),
-              IsOkAndHolds(ElementsAre(EqualsToken(Token::REGULAR, "term1"))));
+  EXPECT_THAT(
+      raw_query_tokenizer->TokenizeAll("term1 OR"),
IsOkAndHolds(ElementsAre(EqualsToken(Token::Type::REGULAR, "term1")))); // "OR" is ignored EXPECT_THAT(raw_query_tokenizer->TokenizeAll("(OR term1)"), IsOkAndHolds(ElementsAre( - EqualsToken(Token::QUERY_LEFT_PARENTHESES, ""), - EqualsToken(Token::REGULAR, "term1"), - EqualsToken(Token::QUERY_RIGHT_PARENTHESES, "")))); + EqualsToken(Token::Type::QUERY_LEFT_PARENTHESES, ""), + EqualsToken(Token::Type::REGULAR, "term1"), + EqualsToken(Token::Type::QUERY_RIGHT_PARENTHESES, "")))); // "OR" is ignored EXPECT_THAT(raw_query_tokenizer->TokenizeAll("( OR term1)"), IsOkAndHolds(ElementsAre( - EqualsToken(Token::QUERY_LEFT_PARENTHESES, ""), - EqualsToken(Token::REGULAR, "term1"), - EqualsToken(Token::QUERY_RIGHT_PARENTHESES, "")))); + EqualsToken(Token::Type::QUERY_LEFT_PARENTHESES, ""), + EqualsToken(Token::Type::REGULAR, "term1"), + EqualsToken(Token::Type::QUERY_RIGHT_PARENTHESES, "")))); // "OR" is ignored EXPECT_THAT(raw_query_tokenizer->TokenizeAll("(term1 OR)"), IsOkAndHolds(ElementsAre( - EqualsToken(Token::QUERY_LEFT_PARENTHESES, ""), - EqualsToken(Token::REGULAR, "term1"), - EqualsToken(Token::QUERY_RIGHT_PARENTHESES, "")))); + EqualsToken(Token::Type::QUERY_LEFT_PARENTHESES, ""), + EqualsToken(Token::Type::REGULAR, "term1"), + EqualsToken(Token::Type::QUERY_RIGHT_PARENTHESES, "")))); // "OR" is ignored EXPECT_THAT(raw_query_tokenizer->TokenizeAll("(term1 OR )"), IsOkAndHolds(ElementsAre( - EqualsToken(Token::QUERY_LEFT_PARENTHESES, ""), - EqualsToken(Token::REGULAR, "term1"), - EqualsToken(Token::QUERY_RIGHT_PARENTHESES, "")))); + EqualsToken(Token::Type::QUERY_LEFT_PARENTHESES, ""), + EqualsToken(Token::Type::REGULAR, "term1"), + EqualsToken(Token::Type::QUERY_RIGHT_PARENTHESES, "")))); // "OR" is ignored EXPECT_THAT(raw_query_tokenizer->TokenizeAll("( OR )"), IsOkAndHolds(ElementsAre( - EqualsToken(Token::QUERY_LEFT_PARENTHESES, ""), - EqualsToken(Token::QUERY_RIGHT_PARENTHESES, "")))); + EqualsToken(Token::Type::QUERY_LEFT_PARENTHESES, ""), + EqualsToken(Token::Type::QUERY_RIGHT_PARENTHESES, "")))); EXPECT_THAT(raw_query_tokenizer->TokenizeAll("term1 OR(term2)"), IsOkAndHolds(ElementsAre( - EqualsToken(Token::REGULAR, "term1"), - EqualsToken(Token::QUERY_OR, ""), - EqualsToken(Token::QUERY_LEFT_PARENTHESES, ""), - EqualsToken(Token::REGULAR, "term2"), - EqualsToken(Token::QUERY_RIGHT_PARENTHESES, "")))); + EqualsToken(Token::Type::REGULAR, "term1"), + EqualsToken(Token::Type::QUERY_OR, ""), + EqualsToken(Token::Type::QUERY_LEFT_PARENTHESES, ""), + EqualsToken(Token::Type::REGULAR, "term2"), + EqualsToken(Token::Type::QUERY_RIGHT_PARENTHESES, "")))); EXPECT_THAT( raw_query_tokenizer->TokenizeAll("term1 OR-term2"), @@ -472,31 +485,31 @@ TEST_F(RawQueryTokenizerTest, CJKT) { if (IsCfStringTokenization()) { EXPECT_THAT( raw_query_tokenizer->TokenizeAll("-今天天气很好"), - IsOkAndHolds(ElementsAre(EqualsToken(Token::QUERY_EXCLUSION, ""), - EqualsToken(Token::REGULAR, "今天"), - EqualsToken(Token::REGULAR, "天气"), - EqualsToken(Token::REGULAR, "很"), - EqualsToken(Token::REGULAR, "好")))); + IsOkAndHolds(ElementsAre(EqualsToken(Token::Type::QUERY_EXCLUSION, ""), + EqualsToken(Token::Type::REGULAR, "今天"), + EqualsToken(Token::Type::REGULAR, "天气"), + EqualsToken(Token::Type::REGULAR, "很"), + EqualsToken(Token::Type::REGULAR, "好")))); } else { EXPECT_THAT( raw_query_tokenizer->TokenizeAll("-今天天气很好"), - IsOkAndHolds(ElementsAre(EqualsToken(Token::QUERY_EXCLUSION, ""), - EqualsToken(Token::REGULAR, "今天"), - EqualsToken(Token::REGULAR, "天气"), - EqualsToken(Token::REGULAR, "很好")))); + 
+        IsOkAndHolds(ElementsAre(EqualsToken(Token::Type::QUERY_EXCLUSION, ""),
+                                 EqualsToken(Token::Type::REGULAR, "今天"),
+                                 EqualsToken(Token::Type::REGULAR, "天气"),
+                                 EqualsToken(Token::Type::REGULAR, "很好"))));
   }
 
   if (IsCfStringTokenization()) {
     EXPECT_THAT(raw_query_tokenizer->TokenizeAll("property1:你好"),
-                IsOkAndHolds(
-                    ElementsAre(EqualsToken(Token::QUERY_PROPERTY, "property1"),
-                                EqualsToken(Token::REGULAR, "你"),
-                                EqualsToken(Token::REGULAR, "好"))));
+                IsOkAndHolds(ElementsAre(
+                    EqualsToken(Token::Type::QUERY_PROPERTY, "property1"),
+                    EqualsToken(Token::Type::REGULAR, "你"),
+                    EqualsToken(Token::Type::REGULAR, "好"))));
   } else {
     EXPECT_THAT(raw_query_tokenizer->TokenizeAll("property1:你好"),
-                IsOkAndHolds(
-                    ElementsAre(EqualsToken(Token::QUERY_PROPERTY, "property1"),
-                                EqualsToken(Token::REGULAR, "你好"))));
+                IsOkAndHolds(ElementsAre(
+                    EqualsToken(Token::Type::QUERY_PROPERTY, "property1"),
+                    EqualsToken(Token::Type::REGULAR, "你好"))));
   }
 
   EXPECT_THAT(
@@ -504,10 +517,11 @@ TEST_F(RawQueryTokenizerTest, CJKT) {
       StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT,
                HasSubstr("Characters in property name must all be ASCII")));
 
-  EXPECT_THAT(raw_query_tokenizer->TokenizeAll("cat OR ねこ"),
-              IsOkAndHolds(ElementsAre(EqualsToken(Token::REGULAR, "cat"),
-                                       EqualsToken(Token::QUERY_OR, ""),
-                                       EqualsToken(Token::REGULAR, "ねこ"))));
+  EXPECT_THAT(
+      raw_query_tokenizer->TokenizeAll("cat OR ねこ"),
+      IsOkAndHolds(ElementsAre(EqualsToken(Token::Type::REGULAR, "cat"),
+                               EqualsToken(Token::Type::QUERY_OR, ""),
+                               EqualsToken(Token::Type::REGULAR, "ねこ"))));
 
   EXPECT_THAT(
       raw_query_tokenizer->TokenizeAll("cat ORねこ"),
@@ -543,40 +557,45 @@ TEST_F(RawQueryTokenizerTest, OtherChars) {
                                               language_segmenter.get()));
 
   // Comma is ignored
-  EXPECT_THAT(raw_query_tokenizer->TokenizeAll(",term1, ,"),
-              IsOkAndHolds(ElementsAre(EqualsToken(Token::REGULAR, "term1"))));
+  EXPECT_THAT(
+      raw_query_tokenizer->TokenizeAll(",term1, ,"),
+      IsOkAndHolds(ElementsAre(EqualsToken(Token::Type::REGULAR, "term1"))));
 
   EXPECT_THAT(raw_query_tokenizer->TokenizeAll("(,term1),"),
               IsOkAndHolds(ElementsAre(
-                  EqualsToken(Token::QUERY_LEFT_PARENTHESES, ""),
-                  EqualsToken(Token::REGULAR, "term1"),
-                  EqualsToken(Token::QUERY_RIGHT_PARENTHESES, ""))));
+                  EqualsToken(Token::Type::QUERY_LEFT_PARENTHESES, ""),
+                  EqualsToken(Token::Type::REGULAR, "term1"),
+                  EqualsToken(Token::Type::QUERY_RIGHT_PARENTHESES, ""))));
 
   // Exclusion operator and comma are ignored
-  EXPECT_THAT(raw_query_tokenizer->TokenizeAll("-,term1"),
-              IsOkAndHolds(ElementsAre(EqualsToken(Token::REGULAR, "term1"))));
+  EXPECT_THAT(
+      raw_query_tokenizer->TokenizeAll("-,term1"),
+      IsOkAndHolds(ElementsAre(EqualsToken(Token::Type::REGULAR, "term1"))));
 
-  EXPECT_THAT(raw_query_tokenizer->TokenizeAll("-term1,"),
-              IsOkAndHolds(ElementsAre(EqualsToken(Token::QUERY_EXCLUSION, ""),
-                                       EqualsToken(Token::REGULAR, "term1"))));
+  EXPECT_THAT(
+      raw_query_tokenizer->TokenizeAll("-term1,"),
+      IsOkAndHolds(ElementsAre(EqualsToken(Token::Type::QUERY_EXCLUSION, ""),
+                               EqualsToken(Token::Type::REGULAR, "term1"))));
 
   // Colon and comma are ignored
-  EXPECT_THAT(raw_query_tokenizer->TokenizeAll("property1:,term1"),
-              IsOkAndHolds(ElementsAre(EqualsToken(Token::REGULAR, "property1"),
-                                       EqualsToken(Token::REGULAR, "term1"))));
-  EXPECT_THAT(
-      raw_query_tokenizer->TokenizeAll("property1:term1,term2"),
-      IsOkAndHolds(ElementsAre(EqualsToken(Token::QUERY_PROPERTY, "property1"),
-                               EqualsToken(Token::REGULAR, "term1"),
-                               EqualsToken(Token::REGULAR, "term2"))));
+  EXPECT_THAT(
raw_query_tokenizer->TokenizeAll("property1:,term1"), + IsOkAndHolds(ElementsAre(EqualsToken(Token::Type::REGULAR, "property1"), + EqualsToken(Token::Type::REGULAR, "term1")))); + + EXPECT_THAT(raw_query_tokenizer->TokenizeAll("property1:term1,term2"), + IsOkAndHolds(ElementsAre( + EqualsToken(Token::Type::QUERY_PROPERTY, "property1"), + EqualsToken(Token::Type::REGULAR, "term1"), + EqualsToken(Token::Type::REGULAR, "term2")))); // This is a special case for OR, unknown chars are treated the same as // whitespaces before and after OR. - EXPECT_THAT(raw_query_tokenizer->TokenizeAll("term1,OR,term2"), - IsOkAndHolds(ElementsAre(EqualsToken(Token::REGULAR, "term1"), - EqualsToken(Token::QUERY_OR, ""), - EqualsToken(Token::REGULAR, "term2")))); + EXPECT_THAT( + raw_query_tokenizer->TokenizeAll("term1,OR,term2"), + IsOkAndHolds(ElementsAre(EqualsToken(Token::Type::REGULAR, "term1"), + EqualsToken(Token::Type::QUERY_OR, ""), + EqualsToken(Token::Type::REGULAR, "term2")))); } TEST_F(RawQueryTokenizerTest, Mix) { @@ -593,37 +612,38 @@ TEST_F(RawQueryTokenizerTest, Mix) { EXPECT_THAT(raw_query_tokenizer->TokenizeAll( "こんにちはgood afternoon, title:今天 OR (ในวันนี้ -B12)"), IsOkAndHolds(ElementsAre( - EqualsToken(Token::REGULAR, "こんにちは"), - EqualsToken(Token::REGULAR, "good"), - EqualsToken(Token::REGULAR, "afternoon"), - EqualsToken(Token::QUERY_PROPERTY, "title"), - EqualsToken(Token::REGULAR, "今天"), - EqualsToken(Token::QUERY_OR, ""), - EqualsToken(Token::QUERY_LEFT_PARENTHESES, ""), - EqualsToken(Token::REGULAR, "ใน"), - EqualsToken(Token::REGULAR, "วันนี้"), - EqualsToken(Token::QUERY_EXCLUSION, ""), - EqualsToken(Token::REGULAR, "B12"), - EqualsToken(Token::QUERY_RIGHT_PARENTHESES, "")))); + EqualsToken(Token::Type::REGULAR, "こんにちは"), + EqualsToken(Token::Type::REGULAR, "good"), + EqualsToken(Token::Type::REGULAR, "afternoon"), + EqualsToken(Token::Type::QUERY_PROPERTY, "title"), + EqualsToken(Token::Type::REGULAR, "今天"), + EqualsToken(Token::Type::QUERY_OR, ""), + EqualsToken(Token::Type::QUERY_LEFT_PARENTHESES, ""), + EqualsToken(Token::Type::REGULAR, "ใน"), + EqualsToken(Token::Type::REGULAR, "วันนี้"), + EqualsToken(Token::Type::QUERY_EXCLUSION, ""), + EqualsToken(Token::Type::REGULAR, "B12"), + EqualsToken(Token::Type::QUERY_RIGHT_PARENTHESES, "")))); } else { ICING_ASSERT_OK_AND_ASSIGN( std::vector<Token> tokens, raw_query_tokenizer->TokenizeAll( "こんにちはgood afternoon, title:今天 OR (ในวันนี้ -B12)")); - EXPECT_THAT(tokens, - ElementsAre(EqualsToken(Token::REGULAR, "こんにちは"), - EqualsToken(Token::REGULAR, "good"), - EqualsToken(Token::REGULAR, "afternoon"), - EqualsToken(Token::QUERY_PROPERTY, "title"), - EqualsToken(Token::REGULAR, "今天"), - EqualsToken(Token::QUERY_OR, ""), - EqualsToken(Token::QUERY_LEFT_PARENTHESES, ""), - EqualsToken(Token::REGULAR, "ใน"), - EqualsToken(Token::REGULAR, "วัน"), - EqualsToken(Token::REGULAR, "นี้"), - EqualsToken(Token::QUERY_EXCLUSION, ""), - EqualsToken(Token::REGULAR, "B12"), - EqualsToken(Token::QUERY_RIGHT_PARENTHESES, ""))); + EXPECT_THAT( + tokens, + ElementsAre(EqualsToken(Token::Type::REGULAR, "こんにちは"), + EqualsToken(Token::Type::REGULAR, "good"), + EqualsToken(Token::Type::REGULAR, "afternoon"), + EqualsToken(Token::Type::QUERY_PROPERTY, "title"), + EqualsToken(Token::Type::REGULAR, "今天"), + EqualsToken(Token::Type::QUERY_OR, ""), + EqualsToken(Token::Type::QUERY_LEFT_PARENTHESES, ""), + EqualsToken(Token::Type::REGULAR, "ใน"), + EqualsToken(Token::Type::REGULAR, "วัน"), + EqualsToken(Token::Type::REGULAR, "นี้"), + 
+                    EqualsToken(Token::Type::QUERY_EXCLUSION, ""),
+                    EqualsToken(Token::Type::REGULAR, "B12"),
+                    EqualsToken(Token::Type::QUERY_RIGHT_PARENTHESES, "")));
  }
 }
diff --git a/icing/tokenization/reverse_jni/reverse-jni-language-segmenter.cc b/icing/tokenization/reverse_jni/reverse-jni-language-segmenter.cc
index b936f2b..cb474c6 100644
--- a/icing/tokenization/reverse_jni/reverse-jni-language-segmenter.cc
+++ b/icing/tokenization/reverse_jni/reverse-jni-language-segmenter.cc
@@ -43,45 +43,46 @@ class ReverseJniLanguageSegmenterIterator : public LanguageSegmenter::Iterator {
 
   // Advances to the next term. Returns false if it has reached the end.
   bool Advance() override {
-    // Prerequisite check
-    if (IsDone()) {
-      return false;
-    }
+    while (true) {
+      // Prerequisite check
+      if (IsDone()) {
+        return false;
+      }
 
-    if (term_end_exclusive_.utf16_index() == 0) {
-      int first = break_iterator_->First();
-      if (!term_start_.MoveToUtf16(first)) {
-        // First is guaranteed to succeed and return a position within bonds. So
-        // the only possible failure could be an invalid sequence. Mark as DONE
-        // and return.
+      if (term_end_exclusive_.utf16_index() == 0) {
+        int first = break_iterator_->First();
+        if (!term_start_.MoveToUtf16(first)) {
+          // First is guaranteed to succeed and return a position within
+          // bounds. So the only possible failure could be an invalid sequence.
+          // Mark as DONE and return.
+          MarkAsDone();
+          return false;
+        }
+      } else {
+        term_start_ = term_end_exclusive_;
+      }
+
+      int next_utf16_index_exclusive = break_iterator_->Next();
+      // Reached the end
+      if (next_utf16_index_exclusive == ReverseJniBreakIterator::kDone) {
+        MarkAsDone();
+        return false;
+      }
+      if (!term_end_exclusive_.MoveToUtf16(next_utf16_index_exclusive)) {
+        // next_utf16_index_exclusive is guaranteed to be within bounds thanks
+        // to the check for kDone above. So the only possible failure could be
+        // an invalid sequence. Mark as DONE and return.
         MarkAsDone();
         return false;
       }
-    } else {
-      term_start_ = term_end_exclusive_;
-    }
-
-    int next_utf16_index_exclusive = break_iterator_->Next();
-    // Reached the end
-    if (next_utf16_index_exclusive == ReverseJniBreakIterator::kDone) {
-      MarkAsDone();
-      return false;
-    }
-    if (!term_end_exclusive_.MoveToUtf16(next_utf16_index_exclusive)) {
-      // next_utf16_index_exclusive is guaranteed to be within bonds thanks to
-      // the check for kDone above. So the only possible failure could be an
-      // invalid sequence. Mark as DONE and return.
-      MarkAsDone();
-      return false;
-    }
-    // Check if the current term is valid. We consider any term valid if its
-    // first character is valid. If it's not valid, then we need to advance to
-    // the next term.
-    if (IsValidTerm()) {
-      return true;
+      // Check if the current term is valid. We consider any term valid if its
+      // first character is valid. If it's not valid, then we need to advance
+      // to the next term.
+      if (IsValidTerm()) {
+        return true;
+      }
     }
-    return Advance();
   }
 
   // Returns the current term. It can be called only when Advance() returns
diff --git a/icing/tokenization/token.h b/icing/tokenization/token.h
index b97ee0b..0c268be 100644
--- a/icing/tokenization/token.h
+++ b/icing/tokenization/token.h
@@ -21,7 +21,7 @@ namespace icing {
 namespace lib {
 
 struct Token {
-  enum Type {
+  enum class Type {
     // Common types
     REGULAR,  // A token without special meanings, the value of it will be
               // indexed or searched directly
diff --git a/icing/tokenization/verbatim-tokenizer.cc b/icing/tokenization/verbatim-tokenizer.cc
index 173486a..dc54696 100644
--- a/icing/tokenization/verbatim-tokenizer.cc
+++ b/icing/tokenization/verbatim-tokenizer.cc
@@ -37,10 +37,10 @@ class VerbatimTokenIterator : public Tokenizer::Iterator {
 
   Token GetToken() const override {
     if (term_.empty() || !has_advanced_to_end_) {
-      return Token(Token::INVALID);
+      return Token(Token::Type::INVALID);
     }
 
-    return Token(Token::VERBATIM, term_);
+    return Token(Token::Type::VERBATIM, term_);
   }
 
   libtextclassifier3::StatusOr<CharacterIterator> CalculateTokenStart()
diff --git a/icing/tokenization/verbatim-tokenizer_test.cc b/icing/tokenization/verbatim-tokenizer_test.cc
index b6bc5fa..e38c7aa 100644
--- a/icing/tokenization/verbatim-tokenizer_test.cc
+++ b/icing/tokenization/verbatim-tokenizer_test.cc
@@ -15,9 +15,9 @@
 #include <string_view>
 
 #include "gmock/gmock.h"
-#include "icing/helpers/icu/icu-data-file-helper.h"
 #include "icing/portable/platform.h"
 #include "icing/testing/common-matchers.h"
+#include "icing/testing/icu-data-file-helper.h"
 #include "icing/testing/jni-test-helpers.h"
 #include "icing/testing/test-data.h"
 #include "icing/tokenization/language-segmenter-factory.h"
@@ -71,7 +71,7 @@ TEST_F(VerbatimTokenizerTest, Simple) {
 
   EXPECT_THAT(
       verbatim_tokenizer->TokenizeAll("foo bar"),
-      IsOkAndHolds(ElementsAre(EqualsToken(Token::VERBATIM, "foo bar"))));
+      IsOkAndHolds(ElementsAre(EqualsToken(Token::Type::VERBATIM, "foo bar"))));
 }
 
 TEST_F(VerbatimTokenizerTest, Punctuation) {
@@ -80,9 +80,9 @@ TEST_F(VerbatimTokenizerTest, Punctuation) {
       StringIndexingConfig::TokenizerType::VERBATIM,
       language_segmenter_.get()));
 
-  EXPECT_THAT(
-      verbatim_tokenizer->TokenizeAll("Hello, world!"),
-      IsOkAndHolds(ElementsAre(EqualsToken(Token::VERBATIM, "Hello, world!"))));
+  EXPECT_THAT(verbatim_tokenizer->TokenizeAll("Hello, world!"),
+              IsOkAndHolds(ElementsAre(
+                  EqualsToken(Token::Type::VERBATIM, "Hello, world!"))));
 }
 
 TEST_F(VerbatimTokenizerTest, InvalidTokenBeforeAdvancing) {
@@ -95,7 +95,8 @@ TEST_F(VerbatimTokenizerTest, InvalidTokenBeforeAdvancing) {
   auto token_iterator = verbatim_tokenizer->Tokenize(kText).ValueOrDie();
 
   // We should get an invalid token if we get the token before advancing.
-  EXPECT_THAT(token_iterator->GetToken(), EqualsToken(Token::INVALID, ""));
+  EXPECT_THAT(token_iterator->GetToken(),
+              EqualsToken(Token::Type::INVALID, ""));
 }
 
 TEST_F(VerbatimTokenizerTest, ResetToTokenEndingBefore) {
@@ -111,13 +112,13 @@ TEST_F(VerbatimTokenizerTest, ResetToTokenEndingBefore) {
   // is larger than the final index (12) of the verbatim token.
   EXPECT_TRUE(token_iterator->ResetToTokenEndingBefore(13));
   EXPECT_THAT(token_iterator->GetToken(),
-              EqualsToken(Token::VERBATIM, "Hello, world!"));
+              EqualsToken(Token::Type::VERBATIM, "Hello, world!"));
 
   // Ensure our cached character iterator properly maintains the end of the
   // verbatim token.
   EXPECT_TRUE(token_iterator->ResetToTokenEndingBefore(13));
   EXPECT_THAT(token_iterator->GetToken(),
-              EqualsToken(Token::VERBATIM, "Hello, world!"));
+              EqualsToken(Token::Type::VERBATIM, "Hello, world!"));
 
   // We should not be able to reset with an offset before or within
   // the verbatim token's utf-32 length.
@@ -137,7 +138,7 @@ TEST_F(VerbatimTokenizerTest, ResetToTokenStartingAfter) {
   // Get token without resetting
   EXPECT_TRUE(token_iterator->Advance());
   EXPECT_THAT(token_iterator->GetToken(),
-              EqualsToken(Token::VERBATIM, "Hello, world!"));
+              EqualsToken(Token::Type::VERBATIM, "Hello, world!"));
 
   // We expect a sole verbatim token, so it's not possible to reset after the
   // start of the token.
@@ -147,7 +148,7 @@ TEST_F(VerbatimTokenizerTest, ResetToTokenStartingAfter) {
   // negative.
   EXPECT_TRUE(token_iterator->ResetToTokenStartingAfter(-1));
   EXPECT_THAT(token_iterator->GetToken(),
-              EqualsToken(Token::VERBATIM, "Hello, world!"));
+              EqualsToken(Token::Type::VERBATIM, "Hello, world!"));
 }
 
 TEST_F(VerbatimTokenizerTest, ResetToStart) {
@@ -162,12 +163,12 @@ TEST_F(VerbatimTokenizerTest, ResetToStart) {
   // Get token without resetting
   EXPECT_TRUE(token_iterator->Advance());
   EXPECT_THAT(token_iterator->GetToken(),
-              EqualsToken(Token::VERBATIM, "Hello, world!"));
+              EqualsToken(Token::Type::VERBATIM, "Hello, world!"));
 
   // Retrieve token again after resetting to start
   EXPECT_TRUE(token_iterator->ResetToStart());
   EXPECT_THAT(token_iterator->GetToken(),
-              EqualsToken(Token::VERBATIM, "Hello, world!"));
+              EqualsToken(Token::Type::VERBATIM, "Hello, world!"));
 }
 
 TEST_F(VerbatimTokenizerTest, CalculateTokenStart) {
diff --git a/icing/transform/icu/icu-normalizer_benchmark.cc b/icing/transform/icu/icu-normalizer_benchmark.cc
index 8d09be2..fdd4c70 100644
--- a/icing/transform/icu/icu-normalizer_benchmark.cc
+++ b/icing/transform/icu/icu-normalizer_benchmark.cc
@@ -14,8 +14,8 @@
 
 #include "testing/base/public/benchmark.h"
 #include "gmock/gmock.h"
-#include "icing/helpers/icu/icu-data-file-helper.h"
 #include "icing/testing/common-matchers.h"
+#include "icing/testing/icu-data-file-helper.h"
 #include "icing/testing/test-data.h"
 #include "icing/transform/normalizer-factory.h"
 #include "icing/transform/normalizer.h"
diff --git a/icing/transform/icu/icu-normalizer_test.cc b/icing/transform/icu/icu-normalizer_test.cc
index a46fcc7..143da17 100644
--- a/icing/transform/icu/icu-normalizer_test.cc
+++ b/icing/transform/icu/icu-normalizer_test.cc
@@ -16,8 +16,8 @@
 
 #include "gmock/gmock.h"
 #include "gtest/gtest.h"
-#include "icing/helpers/icu/icu-data-file-helper.h"
 #include "icing/testing/common-matchers.h"
+#include "icing/testing/icu-data-file-helper.h"
 #include "icing/testing/icu-i18n-test-utils.h"
 #include "icing/testing/test-data.h"
 #include "icing/transform/normalizer-factory.h"
diff --git a/proto/icing/proto/debug.proto b/proto/icing/proto/debug.proto
new file mode 100644
index 0000000..3f07539
--- /dev/null
+++ b/proto/icing/proto/debug.proto
@@ -0,0 +1,112 @@
+// Copyright 2022 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+syntax = "proto2";
+
+package icing.lib;
+
+import "icing/proto/status.proto";
+import "icing/proto/storage.proto";
+
+option java_package = "com.google.android.icing.proto";
+option java_multiple_files = true;
+option objc_class_prefix = "ICNG";
+
+// Next tag: 4
+message IndexDebugInfoProto {
+  // Storage information of the index.
+  optional IndexStorageInfoProto index_storage_info = 1;
+
+  message MainIndexDebugInfoProto {
+    // Information about the main lexicon.
+    // TODO(b/222349894) Convert the string output to a protocol buffer instead.
+    optional string lexicon_info = 1;
+
+    // Last added document id.
+    optional uint32 last_added_document_id = 2;
+
+    // If verbosity > 0, return information about the posting list storage.
+    // TODO(b/222349894) Convert the string output to a protocol buffer instead.
+    optional string flash_index_storage_info = 3;
+  }
+  optional MainIndexDebugInfoProto main_index_info = 2;
+
+  message LiteIndexDebugInfoProto {
+    // Current number of hits.
+    optional uint32 curr_size = 1;
+
+    // The maximum possible number of hits.
+    optional uint32 hit_buffer_size = 2;
+
+    // Last added document id.
+    optional uint32 last_added_document_id = 3;
+
+    // The first position in the hit buffer that is not sorted yet,
+    // or curr_size if all hits are sorted.
+    optional uint32 searchable_end = 4;
+
+    // The most recent checksum of the lite index, by calling
+    // LiteIndex::ComputeChecksum().
+    optional uint32 index_crc = 5;
+
+    // Information about the lite lexicon.
+    // TODO(b/222349894) Convert the string output to a protocol buffer instead.
+    optional string lexicon_info = 6;
+  }
+  optional LiteIndexDebugInfoProto lite_index_info = 3;
+}
+
+// Next tag: 4
+message DocumentDebugInfoProto {
+  // Storage information of the document store.
+  optional DocumentStorageInfoProto document_storage_info = 1;
+
+  // The most recent checksum of the document store, by calling
+  // DocumentStore::ComputeChecksum().
+  optional uint32 crc = 2;
+
+  message CorpusInfo {
+    optional string namespace = 1;
+    optional string schema = 2;
+    optional uint32 total_documents = 3;
+    optional uint32 total_token = 4;
+  }
+
+  // If verbosity > 0, return the total number of documents and tokens in each
+  // (namespace, schema type) pair.
+  // Note that deleted and expired documents are skipped in the output.
+  repeated CorpusInfo corpus_info = 3;
+}
+
+// Next tag: 3
+message DebugInfoProto {
+  // Debug information of the index.
+  optional IndexDebugInfoProto index_info = 1;
+
+  // Debug information of the document store.
+  optional DocumentDebugInfoProto document_info = 2;
+}
+
+// Next tag: 3
+message DebugInfoResultProto {
+  // Status code can be one of:
+  //   OK
+  //   FAILED_PRECONDITION
+  //
+  // See status.proto for more details.
+  optional StatusProto status = 1;
+
+  // Debug information for Icing.
+  optional DebugInfoProto debug_info = 2;
+}
diff --git a/proto/icing/proto/document.proto b/proto/icing/proto/document.proto
index 2e8321b..1a501e7 100644
--- a/proto/icing/proto/document.proto
+++ b/proto/icing/proto/document.proto
@@ -209,7 +209,7 @@ message DeleteBySchemaTypeResultProto {
 }
 
 // Result of a call to IcingSearchEngine.DeleteByQuery
-// Next tag: 4
+// Next tag: 5
 message DeleteByQueryResultProto {
   // Status code can be one of:
   //   OK
@@ -226,5 +226,18 @@ message DeleteByQueryResultProto {
   // Stats for delete execution performance.
   optional DeleteByQueryStatsProto delete_by_query_stats = 3;
 
+  // Used by DeleteByQueryResultProto to return information about deleted
+  // documents.
+  message DocumentGroupInfo {
+    optional string namespace = 1;
+    optional string schema = 2;
+    repeated string uris = 3;
+  }
+
+  // Additional return message that shows the uris of the deleted documents, if
+  // users set return_deleted_document_info to true.
+  // The result is grouped by the corresponding namespace and type.
+  repeated DocumentGroupInfo deleted_documents = 4;
+
   reserved 2;
 }
diff --git a/proto/icing/proto/logging.proto b/proto/icing/proto/logging.proto
index 2f1f271..0a7c4a6 100644
--- a/proto/icing/proto/logging.proto
+++ b/proto/icing/proto/logging.proto
@@ -46,6 +46,9 @@ message InitializeStatsProto {
 
     // Random I/O errors.
     IO_ERROR = 4;
+
+    // The document log is using legacy format.
+    LEGACY_DOCUMENT_LOG_FORMAT = 5;
   }
 
   // Possible recovery causes for document store:
diff --git a/synced_AOSP_CL_number.txt b/synced_AOSP_CL_number.txt
index 26469ab..83c7099 100644
--- a/synced_AOSP_CL_number.txt
+++ b/synced_AOSP_CL_number.txt
@@ -1 +1 @@
-set(synced_AOSP_CL_number=417867838)
+set(synced_AOSP_CL_number=435140515)
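
[Editor's note] For the new deleted_documents field added to DeleteByQueryResultProto above, here is a hedged sketch of how a caller might consume it, assuming the result came from a DeleteByQuery call issued with return_deleted_document_info set to true (per the field comment). The function name and output format are illustrative only, not part of Icing:

    #include <iostream>
    #include <string>

    #include "icing/proto/document.pb.h"

    // Prints every deleted document, grouped by (namespace, schema type).
    void PrintDeletedDocuments(
        const icing::lib::DeleteByQueryResultProto& result) {
      for (const auto& group : result.deleted_documents()) {
        for (const std::string& uri : group.uris()) {
          // "namespace" is a C++ keyword, so protoc generates the accessor
          // with a trailing underscore: namespace_().
          std::cout << group.namespace_() << "#" << group.schema() << "/"
                    << uri << std::endl;
        }
      }
    }

Because the uris are grouped per (namespace, schema) pair, a caller that only needs counts can sum group.uris_size() without touching the strings.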