diff options
author | Tim Barron <tjbarron@google.com> | 2021-06-23 13:20:04 -0700 |
---|---|---|
committer | Tim Barron <tjbarron@google.com> | 2021-06-23 13:33:58 -0700 |
commit | 4d283b778e269315193245d4f6460d17119042e1 (patch) | |
tree | 637287af89eeb5568bd0dc1193f2dceb1d5c3b2b | |
parent | 63545535ac2423e7761dbdf977bd003f21a581e9 (diff) | |
download | icing-4d283b778e269315193245d4f6460d17119042e1.tar.gz |
Sync from upstream.
Descriptions:
==========
Switch DocumentStore to using PortableFileBackedProtoLog instead of
FileBackedProtoLog, and migrate any existing FileBackedProtoLog's to the
Portable* version.
==========
Add more fields to SchemaStats to capture the types that were newly
added, modified and modified with an index rebuild.
==========
Create a new DeleteByQueryStatsProto to capture query-specific stats
related to the performance of DeleteByQuery.
==========
Fix bugs in SetSchema:
1. Property id reassignments when a property is deleted and a new one
added was not being detected
2. Property id reassignments due to changes to the
index_nested_properties field were not detected.
3. Backward incompatible changes should invalidate dependent types.
==========
Fix IcingSearchEngineTest.CJKSnippets test by choosing a simpler
multi-byte character (when encoded in UTF-8) string that ICU will always
segment in the same way regardless of api level.
Bug: 185806837
Bug: 187205195
Bug: 187726282
Change-Id: I07cc8bf3fc5b3a7c5e8f0d24d1724297f3bfba5d
52 files changed, 2956 insertions, 396 deletions
diff --git a/icing/file/file-backed-proto-log.h b/icing/file/file-backed-proto-log.h index 9ccd81b..4302dce 100644 --- a/icing/file/file-backed-proto-log.h +++ b/icing/file/file-backed-proto-log.h @@ -80,23 +80,6 @@ namespace icing { namespace lib { -namespace { - -bool IsEmptyBuffer(const char* buffer, int size) { - return std::all_of(buffer, buffer + size, - [](const char byte) { return byte == 0; }); -} - -// Helper function to get stored proto size from the metadata. -// Metadata format: 8 bits magic + 24 bits size -int GetProtoSize(int metadata) { return metadata & 0x00FFFFFF; } - -// Helper function to get stored proto magic from the metadata. -// Metadata format: 8 bits magic + 24 bits size -uint8_t GetProtoMagic(int metadata) { return metadata >> 24; } - -} // namespace - template <typename ProtoT> class FileBackedProtoLog { public: @@ -402,6 +385,29 @@ class FileBackedProtoLog { const Filesystem* filesystem, const std::string& file_path, Crc32 initial_crc, int64_t start, int64_t end); + static bool IsEmptyBuffer(const char* buffer, int size) { + return std::all_of(buffer, buffer + size, + [](const char byte) { return byte == 0; }); + } + + // Helper function to get stored proto size from the metadata. + // Metadata format: 8 bits magic + 24 bits size + static int GetProtoSize(int metadata) { return metadata & 0x00FFFFFF; } + + // Helper function to get stored proto magic from the metadata. + // Metadata format: 8 bits magic + 24 bits size + static uint8_t GetProtoMagic(int metadata) { return metadata >> 24; } + + // Reads out the metadata of a proto located at file_offset from the file. + // + // Returns: + // Proto's metadata on success + // OUT_OF_RANGE_ERROR if file_offset exceeds file_size + // INTERNAL_ERROR if the metadata is invalid or any IO errors happen + static libtextclassifier3::StatusOr<int> ReadProtoMetadata( + MemoryMappedFile* mmapped_file, int64_t file_offset, int64_t file_size); + std::unique_ptr<Header> header_; + // Magic number added in front of every proto. Used when reading out protos // as a first check for corruption in each entry in the file. Even if there is // a corruption, the best we can do is roll back to our last recovery point @@ -429,16 +435,6 @@ class FileBackedProtoLog { ScopedFd fd_; const Filesystem* const filesystem_; const std::string file_path_; - - // Reads out the metadata of a proto located at file_offset from the file. - // - // Returns: - // Proto's metadata on success - // OUT_OF_RANGE_ERROR if file_offset exceeds file_size - // INTERNAL_ERROR if the metadata is invalid or any IO errors happen - static libtextclassifier3::StatusOr<int> ReadProtoMetadata( - MemoryMappedFile* mmapped_file, int64_t file_offset, int64_t file_size); - std::unique_ptr<Header> header_; }; template <typename ProtoT> @@ -573,6 +569,7 @@ FileBackedProtoLog<ProtoT>::InitializeExistingFile(const Filesystem* filesystem, ICING_ASSIGN_OR_RETURN(Crc32 calculated_log_checksum, ComputeChecksum(filesystem, file_path, Crc32(), sizeof(Header), file_size)); + // Double check that the log checksum is the same as the one that was // persisted last time. If not, we start recovery logic. if (header->log_checksum != calculated_log_checksum.Get()) { diff --git a/icing/file/file-backed-proto-log_benchmark.cc b/icing/file/file-backed-proto-log_benchmark.cc index 766cc64..c09fd5a 100644 --- a/icing/file/file-backed-proto-log_benchmark.cc +++ b/icing/file/file-backed-proto-log_benchmark.cc @@ -164,6 +164,46 @@ BENCHMARK(BM_Read) // 16MiB, and we need some extra space for the // rest of the document properties +static void BM_Erase(benchmark::State& state) { + const Filesystem filesystem; + const std::string file_path = IcingStringUtil::StringPrintf( + "%s%s", GetTestTempDir().c_str(), "/proto.log"); + int max_proto_size = (1 << 24) - 1; // 16 MiB + bool compress = true; + + // Make sure it doesn't already exist. + filesystem.DeleteFile(file_path.c_str()); + + auto proto_log = + FileBackedProtoLog<DocumentProto>::Create( + &filesystem, file_path, + FileBackedProtoLog<DocumentProto>::Options(compress, max_proto_size)) + .ValueOrDie() + .proto_log; + + DocumentProto document = DocumentBuilder().SetKey("namespace", "uri").Build(); + + std::default_random_engine random; + const std::string rand_str = RandomString(kAlNumAlphabet, /*len=*/1, &random); + + auto document_properties = document.add_properties(); + document_properties->set_name("string property"); + document_properties->add_string_values(rand_str); + + for (auto _ : state) { + state.PauseTiming(); + ICING_ASSERT_OK_AND_ASSIGN(int64_t write_offset, + proto_log->WriteProto(document)); + state.ResumeTiming(); + + testing::DoNotOptimize(proto_log->EraseProto(write_offset)); + } + + // Cleanup after ourselves + filesystem.DeleteFile(file_path.c_str()); +} +BENCHMARK(BM_Erase); + static void BM_ComputeChecksum(benchmark::State& state) { const Filesystem filesystem; const std::string file_path = GetTestTempDir() + "/proto.log"; diff --git a/icing/file/portable-file-backed-proto-log.h b/icing/file/portable-file-backed-proto-log.h index 000ab3d..825b763 100644 --- a/icing/file/portable-file-backed-proto-log.h +++ b/icing/file/portable-file-backed-proto-log.h @@ -83,28 +83,6 @@ namespace icing { namespace lib { -namespace { - -// Number of bytes we reserve for the heading at the beginning of the proto log. -// We reserve this so the header can grow without running into the contents of -// the proto log, triggering an unnecessary migration of the data. -constexpr int kHeaderReservedBytes = 256; - -bool IsEmptyBuffer(const char* buffer, int size) { - return std::all_of(buffer, buffer + size, - [](const char byte) { return byte == 0; }); -} - -// Helper function to get stored proto size from the metadata. -// Metadata format: 8 bits magic + 24 bits size -int GetProtoSize(int metadata) { return metadata & 0x00FFFFFF; } - -// Helper function to get stored proto magic from the metadata. -// Metadata format: 8 bits magic + 24 bits size -uint8_t GetProtoMagic(int metadata) { return metadata >> 24; } - -} // namespace - template <typename ProtoT> class PortableFileBackedProtoLog { public: @@ -135,6 +113,11 @@ class PortableFileBackedProtoLog { : compress(compress_in), max_proto_size(max_proto_size_in) {} }; + // Number of bytes we reserve for the heading at the beginning of the proto + // log. We reserve this so the header can grow without running into the + // contents of the proto log, triggering an unnecessary migration of the data. + static constexpr int kHeaderReservedBytes = 256; + // Header stored at the beginning of the file before the rest of the log // contents. Stores metadata on the log. class Header { @@ -541,6 +524,19 @@ class PortableFileBackedProtoLog { static libtextclassifier3::Status WriteProtoMetadata( const Filesystem* filesystem, int fd, int32_t host_order_metadata); + static bool IsEmptyBuffer(const char* buffer, int size) { + return std::all_of(buffer, buffer + size, + [](const char byte) { return byte == 0; }); + } + + // Helper function to get stored proto size from the metadata. + // Metadata format: 8 bits magic + 24 bits size + static int GetProtoSize(int metadata) { return metadata & 0x00FFFFFF; } + + // Helper function to get stored proto magic from the metadata. + // Metadata format: 8 bits magic + 24 bits size + static uint8_t GetProtoMagic(int metadata) { return metadata >> 24; } + // Magic number added in front of every proto. Used when reading out protos // as a first check for corruption in each entry in the file. Even if there is // a corruption, the best we can do is roll back to our last recovery point diff --git a/icing/file/portable-file-backed-proto-log_benchmark.cc b/icing/file/portable-file-backed-proto-log_benchmark.cc index b1dfe12..04ccab0 100644 --- a/icing/file/portable-file-backed-proto-log_benchmark.cc +++ b/icing/file/portable-file-backed-proto-log_benchmark.cc @@ -163,6 +163,46 @@ BENCHMARK(BM_Read) ->Arg(15 * 1024 * 1024); // We do 15MiB here since our max proto size is // 16MiB, and we need some extra space for the // rest of the document properties + // +static void BM_Erase(benchmark::State& state) { + const Filesystem filesystem; + const std::string file_path = IcingStringUtil::StringPrintf( + "%s%s", GetTestTempDir().c_str(), "/proto.log"); + int max_proto_size = (1 << 24) - 1; // 16 MiB + bool compress = true; + + // Make sure it doesn't already exist. + filesystem.DeleteFile(file_path.c_str()); + + auto proto_log = PortableFileBackedProtoLog<DocumentProto>::Create( + &filesystem, file_path, + PortableFileBackedProtoLog<DocumentProto>::Options( + compress, max_proto_size)) + .ValueOrDie() + .proto_log; + + DocumentProto document = DocumentBuilder().SetKey("namespace", "uri").Build(); + + std::default_random_engine random; + const std::string rand_str = RandomString(kAlNumAlphabet, /*len=*/1, &random); + + auto document_properties = document.add_properties(); + document_properties->set_name("string property"); + document_properties->add_string_values(rand_str); + + for (auto _ : state) { + state.PauseTiming(); + ICING_ASSERT_OK_AND_ASSIGN(int64_t write_offset, + proto_log->WriteProto(document)); + state.ResumeTiming(); + + testing::DoNotOptimize(proto_log->EraseProto(write_offset)); + } + + // Cleanup after ourselves + filesystem.DeleteFile(file_path.c_str()); +} +BENCHMARK(BM_Erase); static void BM_ComputeChecksum(benchmark::State& state) { const Filesystem filesystem; diff --git a/icing/file/portable-file-backed-proto-log_test.cc b/icing/file/portable-file-backed-proto-log_test.cc index 69b8a1a..b5fee4b 100644 --- a/icing/file/portable-file-backed-proto-log_test.cc +++ b/icing/file/portable-file-backed-proto-log_test.cc @@ -113,7 +113,8 @@ TEST_F(PortableFileBackedProtoLogTest, ReservedSpaceForHeader) { // With no protos written yet, the log should be minimum the size of the // reserved header space. - ASSERT_EQ(filesystem_.GetFileSize(file_path_.c_str()), kHeaderReservedBytes); + ASSERT_EQ(filesystem_.GetFileSize(file_path_.c_str()), + PortableFileBackedProtoLog<DocumentProto>::kHeaderReservedBytes); } TEST_F(PortableFileBackedProtoLogTest, WriteProtoTooLarge) { @@ -417,8 +418,9 @@ TEST_F(PortableFileBackedProtoLogTest, // We still have the corrupted content in our file, we didn't throw // everything out. - EXPECT_THAT(filesystem_.GetFileSize(file_path_.c_str()), - Gt(kHeaderReservedBytes)); + EXPECT_THAT( + filesystem_.GetFileSize(file_path_.c_str()), + Gt(PortableFileBackedProtoLog<DocumentProto>::kHeaderReservedBytes)); } } @@ -456,9 +458,10 @@ TEST_F(PortableFileBackedProtoLogTest, DocumentProto document = DocumentBuilder().SetKey("invalid_namespace", "invalid_uri").Build(); std::string serialized_document = document.SerializeAsString(); - ASSERT_TRUE(filesystem_.PWrite(file_path_.c_str(), kHeaderReservedBytes, - serialized_document.data(), - serialized_document.size())); + ASSERT_TRUE(filesystem_.PWrite( + file_path_.c_str(), + PortableFileBackedProtoLog<DocumentProto>::kHeaderReservedBytes, + serialized_document.data(), serialized_document.size())); Header header = ReadHeader(filesystem_, file_path_); @@ -484,8 +487,9 @@ TEST_F(PortableFileBackedProtoLogTest, EXPECT_TRUE(create_result.recalculated_checksum); // We lost everything, file size is back down to the header. - EXPECT_THAT(filesystem_.GetFileSize(file_path_.c_str()), - Eq(kHeaderReservedBytes)); + EXPECT_THAT( + filesystem_.GetFileSize(file_path_.c_str()), + Eq(PortableFileBackedProtoLog<DocumentProto>::kHeaderReservedBytes)); // At least the log is no longer dirty. Header header = ReadHeader(filesystem_, file_path_); diff --git a/icing/icing-search-engine.cc b/icing/icing-search-engine.cc index e9865e4..55f59f2 100644 --- a/icing/icing-search-engine.cc +++ b/icing/icing-search-engine.cc @@ -502,21 +502,18 @@ SetSchemaResultProto IcingSearchEngine::SetSchema( StatusProto* result_status = result_proto.mutable_status(); absl_ports::unique_lock l(&mutex_); + std::unique_ptr<Timer> timer = clock_->GetNewTimer(); if (!initialized_) { result_status->set_code(StatusProto::FAILED_PRECONDITION); result_status->set_message("IcingSearchEngine has not been initialized!"); - return result_proto; - } - - libtextclassifier3::Status status = SchemaUtil::Validate(new_schema); - if (!status.ok()) { - TransformStatus(status, result_status); + result_proto.set_latency_ms(timer->GetElapsedMilliseconds()); return result_proto; } auto lost_previous_schema_or = LostPreviousSchema(); if (!lost_previous_schema_or.ok()) { TransformStatus(lost_previous_schema_or.status(), result_status); + result_proto.set_latency_ms(timer->GetElapsedMilliseconds()); return result_proto; } bool lost_previous_schema = lost_previous_schema_or.ValueOrDie(); @@ -534,10 +531,11 @@ SetSchemaResultProto IcingSearchEngine::SetSchema( std::move(new_schema), ignore_errors_and_delete_documents); if (!set_schema_result_or.ok()) { TransformStatus(set_schema_result_or.status(), result_status); + result_proto.set_latency_ms(timer->GetElapsedMilliseconds()); return result_proto; } - const SchemaStore::SetSchemaResult set_schema_result = - set_schema_result_or.ValueOrDie(); + SchemaStore::SetSchemaResult set_schema_result = + std::move(set_schema_result_or).ValueOrDie(); for (const std::string& deleted_type : set_schema_result.schema_types_deleted_by_name) { @@ -549,6 +547,26 @@ SetSchemaResultProto IcingSearchEngine::SetSchema( result_proto.add_incompatible_schema_types(incompatible_type); } + for (const std::string& new_type : + set_schema_result.schema_types_new_by_name) { + result_proto.add_new_schema_types(std::move(new_type)); + } + + for (const std::string& compatible_type : + set_schema_result.schema_types_changed_fully_compatible_by_name) { + result_proto.add_fully_compatible_changed_schema_types( + std::move(compatible_type)); + } + + bool index_incompatible = + !set_schema_result.schema_types_index_incompatible_by_name.empty(); + for (const std::string& index_incompatible_type : + set_schema_result.schema_types_index_incompatible_by_name) { + result_proto.add_index_incompatible_changed_schema_types( + std::move(index_incompatible_type)); + } + + libtextclassifier3::Status status; if (set_schema_result.success) { if (lost_previous_schema) { // No previous schema to calculate a diff against. We have to go through @@ -556,6 +574,7 @@ SetSchemaResultProto IcingSearchEngine::SetSchema( status = document_store_->UpdateSchemaStore(schema_store_.get()); if (!status.ok()) { TransformStatus(status, result_status); + result_proto.set_latency_ms(timer->GetElapsedMilliseconds()); return result_proto; } } else if (!set_schema_result.old_schema_type_ids_changed.empty() || @@ -565,15 +584,17 @@ SetSchemaResultProto IcingSearchEngine::SetSchema( set_schema_result); if (!status.ok()) { TransformStatus(status, result_status); + result_proto.set_latency_ms(timer->GetElapsedMilliseconds()); return result_proto; } } - if (lost_previous_schema || set_schema_result.index_incompatible) { + if (lost_previous_schema || index_incompatible) { // Clears all index files status = index_->Reset(); if (!status.ok()) { TransformStatus(status, result_status); + result_proto.set_latency_ms(timer->GetElapsedMilliseconds()); return result_proto; } @@ -584,6 +605,7 @@ SetSchemaResultProto IcingSearchEngine::SetSchema( if (!restore_result.status.ok() && !absl_ports::IsDataLoss(restore_result.status)) { TransformStatus(status, result_status); + result_proto.set_latency_ms(timer->GetElapsedMilliseconds()); return result_proto; } } @@ -594,6 +616,7 @@ SetSchemaResultProto IcingSearchEngine::SetSchema( result_status->set_message("Schema is incompatible."); } + result_proto.set_latency_ms(timer->GetElapsedMilliseconds()); return result_proto; } @@ -908,8 +931,13 @@ DeleteByQueryResultProto IcingSearchEngine::DeleteByQuery( return result_proto; } - DeleteStatsProto* delete_stats = result_proto.mutable_delete_stats(); - delete_stats->set_delete_type(DeleteStatsProto::DeleteType::QUERY); + DeleteByQueryStatsProto* delete_stats = + result_proto.mutable_delete_by_query_stats(); + delete_stats->set_query_length(search_spec.query().length()); + delete_stats->set_num_namespaces_filtered( + search_spec.namespace_filters_size()); + delete_stats->set_num_schema_types_filtered( + search_spec.schema_type_filters_size()); std::unique_ptr<Timer> delete_timer = clock_->GetNewTimer(); libtextclassifier3::Status status = @@ -919,6 +947,7 @@ DeleteByQueryResultProto IcingSearchEngine::DeleteByQuery( return result_proto; } + std::unique_ptr<Timer> component_timer = clock_->GetNewTimer(); // Gets unordered results from query processor auto query_processor_or = QueryProcessor::Create( index_.get(), language_segmenter_.get(), normalizer_.get(), @@ -937,10 +966,13 @@ DeleteByQueryResultProto IcingSearchEngine::DeleteByQuery( } QueryProcessor::QueryResults query_results = std::move(query_results_or).ValueOrDie(); + delete_stats->set_parse_query_latency_ms( + component_timer->GetElapsedMilliseconds()); ICING_VLOG(2) << "Deleting the docs that matched the query."; int num_deleted = 0; + component_timer = clock_->GetNewTimer(); while (query_results.root_iterator->Advance().ok()) { ICING_VLOG(3) << "Deleting doc " << query_results.root_iterator->doc_hit_info().document_id(); @@ -952,6 +984,13 @@ DeleteByQueryResultProto IcingSearchEngine::DeleteByQuery( return result_proto; } } + delete_stats->set_document_removal_latency_ms( + component_timer->GetElapsedMilliseconds()); + int term_count = 0; + for (const auto& section_and_terms : query_results.query_terms) { + term_count += section_and_terms.second.size(); + } + delete_stats->set_num_terms(term_count); if (num_deleted > 0) { result_proto.mutable_status()->set_code(StatusProto::OK); diff --git a/icing/icing-search-engine_benchmark.cc b/icing/icing-search-engine_benchmark.cc index b437724..316b74f 100644 --- a/icing/icing-search-engine_benchmark.cc +++ b/icing/icing-search-engine_benchmark.cc @@ -577,6 +577,120 @@ void BM_RepeatedPut(benchmark::State& state) { // cap the limit to 1 << 18. BENCHMARK(BM_RepeatedPut)->Range(/*start=*/100, /*limit=*/1 << 18); +// This is different from BM_RepeatedPut since we're just trying to benchmark +// one Put call, not thousands of them at once. +void BM_Put(benchmark::State& state) { + // Initialize the filesystem + std::string test_dir = GetTestTempDir() + "/icing/benchmark"; + Filesystem filesystem; + DestructibleDirectory ddir(filesystem, test_dir); + + // Create the schema. + SchemaProto schema = + SchemaBuilder() + .AddType(SchemaTypeConfigBuilder().SetType("Message")) + .Build(); + + // Create the index. + IcingSearchEngineOptions options; + options.set_base_dir(test_dir); + options.set_index_merge_size(kIcingFullIndexSize); + std::unique_ptr<IcingSearchEngine> icing = + std::make_unique<IcingSearchEngine>(options); + + ASSERT_THAT(icing->Initialize().status(), ProtoIsOk()); + ASSERT_THAT(icing->SetSchema(schema).status(), ProtoIsOk()); + + // Create a document + DocumentProto document = DocumentBuilder() + .SetSchema("Message") + .SetNamespace("namespace") + .SetUri("uri") + .Build(); + + for (auto s : state) { + benchmark::DoNotOptimize(icing->Put(document)); + } +} +BENCHMARK(BM_Put); + +void BM_Get(benchmark::State& state) { + // Initialize the filesystem + std::string test_dir = GetTestTempDir() + "/icing/benchmark"; + Filesystem filesystem; + DestructibleDirectory ddir(filesystem, test_dir); + + // Create the schema. + SchemaProto schema = + SchemaBuilder() + .AddType(SchemaTypeConfigBuilder().SetType("Message")) + .Build(); + + // Create the index. + IcingSearchEngineOptions options; + options.set_base_dir(test_dir); + options.set_index_merge_size(kIcingFullIndexSize); + std::unique_ptr<IcingSearchEngine> icing = + std::make_unique<IcingSearchEngine>(options); + + ASSERT_THAT(icing->Initialize().status(), ProtoIsOk()); + ASSERT_THAT(icing->SetSchema(schema).status(), ProtoIsOk()); + + // Create a document + DocumentProto document = DocumentBuilder() + .SetSchema("Message") + .SetNamespace("namespace") + .SetUri("uri") + .Build(); + + ASSERT_THAT(icing->Put(document).status(), ProtoIsOk()); + for (auto s : state) { + benchmark::DoNotOptimize( + icing->Get("namespace", "uri", GetResultSpecProto::default_instance())); + } +} +BENCHMARK(BM_Get); + +void BM_Delete(benchmark::State& state) { + // Initialize the filesystem + std::string test_dir = GetTestTempDir() + "/icing/benchmark"; + Filesystem filesystem; + DestructibleDirectory ddir(filesystem, test_dir); + + // Create the schema. + SchemaProto schema = + SchemaBuilder() + .AddType(SchemaTypeConfigBuilder().SetType("Message")) + .Build(); + + // Create the index. + IcingSearchEngineOptions options; + options.set_base_dir(test_dir); + options.set_index_merge_size(kIcingFullIndexSize); + std::unique_ptr<IcingSearchEngine> icing = + std::make_unique<IcingSearchEngine>(options); + + ASSERT_THAT(icing->Initialize().status(), ProtoIsOk()); + ASSERT_THAT(icing->SetSchema(schema).status(), ProtoIsOk()); + + // Create a document + DocumentProto document = DocumentBuilder() + .SetSchema("Message") + .SetNamespace("namespace") + .SetUri("uri") + .Build(); + + ASSERT_THAT(icing->Put(document).status(), ProtoIsOk()); + for (auto s : state) { + state.PauseTiming(); + icing->Put(document); + state.ResumeTiming(); + + benchmark::DoNotOptimize(icing->Delete("namespace", "uri")); + } +} +BENCHMARK(BM_Delete); + } // namespace } // namespace lib diff --git a/icing/icing-search-engine_test.cc b/icing/icing-search-engine_test.cc index c1de0f0..5846e8b 100644 --- a/icing/icing-search-engine_test.cc +++ b/icing/icing-search-engine_test.cc @@ -42,6 +42,7 @@ #include "icing/schema-builder.h" #include "icing/schema/schema-store.h" #include "icing/schema/section.h" +#include "icing/store/document-log-creator.h" #include "icing/testing/common-matchers.h" #include "icing/testing/fake-clock.h" #include "icing/testing/jni-test-helpers.h" @@ -100,9 +101,26 @@ constexpr StringIndexingConfig_TokenizerType_Code TOKENIZER_PLAIN = constexpr StringIndexingConfig_TokenizerType_Code TOKENIZER_NONE = StringIndexingConfig_TokenizerType_Code_NONE; +constexpr TermMatchType_Code MATCH_EXACT = TermMatchType_Code_EXACT_ONLY; constexpr TermMatchType_Code MATCH_PREFIX = TermMatchType_Code_PREFIX; constexpr TermMatchType_Code MATCH_NONE = TermMatchType_Code_UNKNOWN; +PortableFileBackedProtoLog<DocumentWrapper>::Header ReadDocumentLogHeader( + Filesystem filesystem, const std::string& file_path) { + PortableFileBackedProtoLog<DocumentWrapper>::Header header; + filesystem.PRead(file_path.c_str(), &header, + sizeof(PortableFileBackedProtoLog<DocumentWrapper>::Header), + /*offset=*/0); + return header; +} + +void WriteDocumentLogHeader( + Filesystem filesystem, const std::string& file_path, + PortableFileBackedProtoLog<DocumentWrapper>::Header& header) { + filesystem.Write(file_path.c_str(), &header, + sizeof(PortableFileBackedProtoLog<DocumentWrapper>::Header)); +} + // For mocking purpose, we allow tests to provide a custom Filesystem. class TestIcingSearchEngine : public IcingSearchEngine { public: @@ -720,7 +738,13 @@ TEST_F(IcingSearchEngineTest, SetSchemaCompatibleVersionUpdateSucceeds) { property->set_data_type(PropertyConfigProto::DataType::STRING); property->set_cardinality(PropertyConfigProto::Cardinality::OPTIONAL); - EXPECT_THAT(icing.SetSchema(schema).status(), ProtoIsOk()); + SetSchemaResultProto set_schema_result = icing.SetSchema(schema); + // Ignore latency numbers. They're covered elsewhere. + set_schema_result.clear_latency_ms(); + SetSchemaResultProto expected_set_schema_result; + expected_set_schema_result.mutable_status()->set_code(StatusProto::OK); + expected_set_schema_result.mutable_new_schema_types()->Add("Email"); + EXPECT_THAT(set_schema_result, EqualsProto(expected_set_schema_result)); EXPECT_THAT(icing.GetSchema().schema().types(0).version(), Eq(1)); } @@ -738,12 +762,20 @@ TEST_F(IcingSearchEngineTest, SetSchemaCompatibleVersionUpdateSucceeds) { property->set_property_name("title"); property->set_data_type(PropertyConfigProto::DataType::STRING); property->set_cardinality(PropertyConfigProto::Cardinality::OPTIONAL); + property = type->add_properties(); property->set_property_name("body"); property->set_data_type(PropertyConfigProto::DataType::STRING); property->set_cardinality(PropertyConfigProto::Cardinality::OPTIONAL); // 3. SetSchema should succeed and the version number should be updated. - EXPECT_THAT(icing.SetSchema(schema, true).status(), ProtoIsOk()); + SetSchemaResultProto set_schema_result = icing.SetSchema(schema, true); + // Ignore latency numbers. They're covered elsewhere. + set_schema_result.clear_latency_ms(); + SetSchemaResultProto expected_set_schema_result; + expected_set_schema_result.mutable_status()->set_code(StatusProto::OK); + expected_set_schema_result.mutable_fully_compatible_changed_schema_types() + ->Add("Email"); + EXPECT_THAT(set_schema_result, EqualsProto(expected_set_schema_result)); EXPECT_THAT(icing.GetSchema().schema().types(0).version(), Eq(2)); } @@ -929,7 +961,12 @@ TEST_F(IcingSearchEngineTest, } TEST_F(IcingSearchEngineTest, SetSchema) { - IcingSearchEngine icing(GetDefaultIcingOptions(), GetTestJniCache()); + auto fake_clock = std::make_unique<FakeClock>(); + fake_clock->SetTimerElapsedMilliseconds(1000); + TestIcingSearchEngine icing(GetDefaultIcingOptions(), + std::make_unique<Filesystem>(), + std::make_unique<IcingFilesystem>(), + std::move(fake_clock), GetTestJniCache()); ASSERT_THAT(icing.Initialize().status(), ProtoIsOk()); auto message_document = CreateMessageDocument("namespace", "uri"); @@ -958,26 +995,31 @@ TEST_F(IcingSearchEngineTest, SetSchema) { empty_type->set_schema_type(""); // Make sure we can't set invalid schemas - EXPECT_THAT(icing.SetSchema(invalid_schema).status(), + SetSchemaResultProto set_schema_result = icing.SetSchema(invalid_schema); + EXPECT_THAT(set_schema_result.status(), ProtoStatusIs(StatusProto::INVALID_ARGUMENT)); + EXPECT_THAT(set_schema_result.latency_ms(), Eq(1000)); // Can add an document of a set schema - EXPECT_THAT(icing.SetSchema(schema_with_message).status(), ProtoIsOk()); + set_schema_result = icing.SetSchema(schema_with_message); + EXPECT_THAT(set_schema_result.status(), ProtoStatusIs(StatusProto::OK)); + EXPECT_THAT(set_schema_result.latency_ms(), Eq(1000)); EXPECT_THAT(icing.Put(message_document).status(), ProtoIsOk()); // Schema with Email doesn't have Message, so would result incompatible // data - EXPECT_THAT(icing.SetSchema(schema_with_email).status(), + set_schema_result = icing.SetSchema(schema_with_email); + EXPECT_THAT(set_schema_result.status(), ProtoStatusIs(StatusProto::FAILED_PRECONDITION)); + EXPECT_THAT(set_schema_result.latency_ms(), Eq(1000)); // Can expand the set of schema types and add an document of a new // schema type - EXPECT_THAT(icing.SetSchema(SchemaProto(schema_with_email_and_message)) - .status() - .code(), - Eq(StatusProto::OK)); - EXPECT_THAT(icing.Put(message_document).status(), ProtoIsOk()); + set_schema_result = icing.SetSchema(schema_with_email_and_message); + EXPECT_THAT(set_schema_result.status(), ProtoStatusIs(StatusProto::OK)); + EXPECT_THAT(set_schema_result.latency_ms(), Eq(1000)); + EXPECT_THAT(icing.Put(message_document).status(), ProtoIsOk()); // Can't add an document whose schema isn't set auto photo_document = DocumentBuilder() .SetKey("namespace", "uri") @@ -990,7 +1032,8 @@ TEST_F(IcingSearchEngineTest, SetSchema) { HasSubstr("'Photo' not found")); } -TEST_F(IcingSearchEngineTest, SetSchemaTriggersIndexRestorationAndReturnsOk) { +TEST_F(IcingSearchEngineTest, + SetSchemaNewIndexedPropertyTriggersIndexRestorationAndReturnsOk) { IcingSearchEngine icing(GetDefaultIcingOptions(), GetTestJniCache()); ASSERT_THAT(icing.Initialize().status(), ProtoIsOk()); @@ -999,8 +1042,15 @@ TEST_F(IcingSearchEngineTest, SetSchemaTriggersIndexRestorationAndReturnsOk) { ->mutable_properties(0) ->clear_string_indexing_config(); - EXPECT_THAT(icing.SetSchema(schema_with_no_indexed_property).status(), - ProtoIsOk()); + SetSchemaResultProto set_schema_result = + icing.SetSchema(schema_with_no_indexed_property); + // Ignore latency numbers. They're covered elsewhere. + set_schema_result.clear_latency_ms(); + SetSchemaResultProto expected_set_schema_result; + expected_set_schema_result.mutable_status()->set_code(StatusProto::OK); + expected_set_schema_result.mutable_new_schema_types()->Add("Message"); + EXPECT_THAT(set_schema_result, EqualsProto(expected_set_schema_result)); + // Nothing will be index and Search() won't return anything. EXPECT_THAT(icing.Put(CreateMessageDocument("namespace", "uri")).status(), ProtoIsOk()); @@ -1021,8 +1071,14 @@ TEST_F(IcingSearchEngineTest, SetSchemaTriggersIndexRestorationAndReturnsOk) { SchemaProto schema_with_indexed_property = CreateMessageSchema(); // Index restoration should be triggered here because new schema requires more // properties to be indexed. - EXPECT_THAT(icing.SetSchema(schema_with_indexed_property).status(), - ProtoIsOk()); + set_schema_result = icing.SetSchema(schema_with_indexed_property); + // Ignore latency numbers. They're covered elsewhere. + set_schema_result.clear_latency_ms(); + expected_set_schema_result = SetSchemaResultProto(); + expected_set_schema_result.mutable_status()->set_code(StatusProto::OK); + expected_set_schema_result.mutable_index_incompatible_changed_schema_types() + ->Add("Message"); + EXPECT_THAT(set_schema_result, EqualsProto(expected_set_schema_result)); SearchResultProto expected_search_result_proto; expected_search_result_proto.mutable_status()->set_code(StatusProto::OK); @@ -1034,6 +1090,439 @@ TEST_F(IcingSearchEngineTest, SetSchemaTriggersIndexRestorationAndReturnsOk) { expected_search_result_proto)); } +TEST_F(IcingSearchEngineTest, + SetSchemaChangeNestedPropertiesTriggersIndexRestorationAndReturnsOk) { + IcingSearchEngine icing(GetDefaultIcingOptions(), GetTestJniCache()); + ASSERT_THAT(icing.Initialize().status(), ProtoIsOk()); + + SchemaTypeConfigProto person_proto = + SchemaTypeConfigBuilder() + .SetType("Person") + .AddProperty(PropertyConfigBuilder() + .SetName("name") + .SetDataTypeString(MATCH_PREFIX, TOKENIZER_PLAIN) + .SetCardinality(CARDINALITY_OPTIONAL)) + .Build(); + SchemaProto nested_schema = + SchemaBuilder() + .AddType(person_proto) + .AddType(SchemaTypeConfigBuilder() + .SetType("Email") + .AddProperty(PropertyConfigBuilder() + .SetName("sender") + .SetDataTypeDocument( + "Person", + /*index_nested_properties=*/true) + .SetCardinality(CARDINALITY_OPTIONAL)) + .AddProperty( + PropertyConfigBuilder() + .SetName("subject") + .SetDataTypeString(MATCH_PREFIX, TOKENIZER_PLAIN) + .SetCardinality(CARDINALITY_OPTIONAL))) + .Build(); + + SetSchemaResultProto set_schema_result = icing.SetSchema(nested_schema); + // Ignore latency numbers. They're covered elsewhere. + set_schema_result.clear_latency_ms(); + SetSchemaResultProto expected_set_schema_result; + expected_set_schema_result.mutable_status()->set_code(StatusProto::OK); + expected_set_schema_result.mutable_new_schema_types()->Add("Email"); + expected_set_schema_result.mutable_new_schema_types()->Add("Person"); + EXPECT_THAT(set_schema_result, EqualsProto(expected_set_schema_result)); + + DocumentProto document = + DocumentBuilder() + .SetKey("namespace1", "uri1") + .SetSchema("Email") + .SetCreationTimestampMs(1000) + .AddStringProperty("subject", + "Did you get the memo about TPS reports?") + .AddDocumentProperty("sender", + DocumentBuilder() + .SetKey("namespace1", "uri1") + .SetSchema("Person") + .AddStringProperty("name", "Bill Lundbergh") + .Build()) + .Build(); + + // "sender.name" should get assigned property id 0 and subject should get + // property id 1. + EXPECT_THAT(icing.Put(document).status(), ProtoIsOk()); + + // document should match a query for 'Bill' in 'sender.name', but not in + // 'subject' + SearchSpecProto search_spec; + search_spec.set_query("sender.name:Bill"); + search_spec.set_term_match_type(TermMatchType::EXACT_ONLY); + + SearchResultProto result; + result.mutable_status()->set_code(StatusProto::OK); + *result.mutable_results()->Add()->mutable_document() = document; + + SearchResultProto actual_results = + icing.Search(search_spec, GetDefaultScoringSpec(), + ResultSpecProto::default_instance()); + EXPECT_THAT(actual_results, EqualsSearchResultIgnoreStatsAndScores(result)); + + SearchResultProto empty_result; + empty_result.mutable_status()->set_code(StatusProto::OK); + search_spec.set_query("subject:Bill"); + actual_results = icing.Search(search_spec, GetDefaultScoringSpec(), + ResultSpecProto::default_instance()); + EXPECT_THAT(actual_results, + EqualsSearchResultIgnoreStatsAndScores(empty_result)); + + // Now update the schema with index_nested_properties=false. This should + // reassign property ids, lead to an index rebuild and ensure that nothing + // match a query for "Bill". + SchemaProto no_nested_schema = + SchemaBuilder() + .AddType(person_proto) + .AddType(SchemaTypeConfigBuilder() + .SetType("Email") + .AddProperty(PropertyConfigBuilder() + .SetName("sender") + .SetDataTypeDocument( + "Person", + /*index_nested_properties=*/false) + .SetCardinality(CARDINALITY_OPTIONAL)) + .AddProperty( + PropertyConfigBuilder() + .SetName("subject") + .SetDataTypeString(MATCH_PREFIX, TOKENIZER_PLAIN) + .SetCardinality(CARDINALITY_OPTIONAL))) + .Build(); + + set_schema_result = icing.SetSchema(no_nested_schema); + // Ignore latency numbers. They're covered elsewhere. + set_schema_result.clear_latency_ms(); + expected_set_schema_result = SetSchemaResultProto(); + expected_set_schema_result.mutable_status()->set_code(StatusProto::OK); + expected_set_schema_result.mutable_index_incompatible_changed_schema_types() + ->Add("Email"); + EXPECT_THAT(set_schema_result, EqualsProto(expected_set_schema_result)); + + // document shouldn't match a query for 'Bill' in either 'sender.name' or + // 'subject' + search_spec.set_query("sender.name:Bill"); + actual_results = icing.Search(search_spec, GetDefaultScoringSpec(), + ResultSpecProto::default_instance()); + EXPECT_THAT(actual_results, + EqualsSearchResultIgnoreStatsAndScores(empty_result)); + + search_spec.set_query("subject:Bill"); + actual_results = icing.Search(search_spec, GetDefaultScoringSpec(), + ResultSpecProto::default_instance()); + EXPECT_THAT(actual_results, + EqualsSearchResultIgnoreStatsAndScores(empty_result)); +} + +TEST_F(IcingSearchEngineTest, + ForceSetSchemaPropertyDeletionTriggersIndexRestorationAndReturnsOk) { + IcingSearchEngine icing(GetDefaultIcingOptions(), GetTestJniCache()); + ASSERT_THAT(icing.Initialize().status(), ProtoIsOk()); + + // 'body' should have a property id of 0 and 'subject' should have a property + // id of 1. + SchemaProto email_with_body_schema = + SchemaBuilder() + .AddType(SchemaTypeConfigBuilder() + .SetType("Email") + .AddProperty( + PropertyConfigBuilder() + .SetName("subject") + .SetDataTypeString(MATCH_PREFIX, TOKENIZER_PLAIN) + .SetCardinality(CARDINALITY_OPTIONAL)) + .AddProperty( + PropertyConfigBuilder() + .SetName("body") + .SetDataTypeString(MATCH_PREFIX, TOKENIZER_PLAIN) + .SetCardinality(CARDINALITY_OPTIONAL))) + .Build(); + + SetSchemaResultProto set_schema_result = + icing.SetSchema(email_with_body_schema); + // Ignore latency numbers. They're covered elsewhere. + set_schema_result.clear_latency_ms(); + SetSchemaResultProto expected_set_schema_result; + expected_set_schema_result.mutable_new_schema_types()->Add("Email"); + expected_set_schema_result.mutable_status()->set_code(StatusProto::OK); + EXPECT_THAT(set_schema_result, EqualsProto(expected_set_schema_result)); + + // Create a document with only a subject property. + DocumentProto document = + DocumentBuilder() + .SetKey("namespace1", "uri1") + .SetSchema("Email") + .SetCreationTimestampMs(1000) + .AddStringProperty("subject", + "Did you get the memo about TPS reports?") + .Build(); + EXPECT_THAT(icing.Put(document).status(), ProtoIsOk()); + + // We should be able to retrieve the document by searching for 'tps' in + // 'subject'. + SearchSpecProto search_spec; + search_spec.set_query("subject:tps"); + search_spec.set_term_match_type(TermMatchType::EXACT_ONLY); + + SearchResultProto result; + result.mutable_status()->set_code(StatusProto::OK); + *result.mutable_results()->Add()->mutable_document() = document; + + SearchResultProto actual_results = + icing.Search(search_spec, GetDefaultScoringSpec(), + ResultSpecProto::default_instance()); + EXPECT_THAT(actual_results, EqualsSearchResultIgnoreStatsAndScores(result)); + + // Now update the schema to remove the 'body' field. This is backwards + // incompatible, but document should be preserved because it doesn't contain a + // 'body' field. If the index is correctly rebuilt, then 'subject' will now + // have a property id of 0. If not, then the hits in the index will still have + // have a property id of 1 and therefore it won't be found. + SchemaProto email_no_body_schema = + SchemaBuilder() + .AddType(SchemaTypeConfigBuilder().SetType("Email").AddProperty( + PropertyConfigBuilder() + .SetName("subject") + .SetDataTypeString(MATCH_PREFIX, TOKENIZER_PLAIN) + .SetCardinality(CARDINALITY_OPTIONAL))) + .Build(); + + set_schema_result = icing.SetSchema( + email_no_body_schema, /*ignore_errors_and_delete_documents=*/true); + // Ignore latency numbers. They're covered elsewhere. + set_schema_result.clear_latency_ms(); + expected_set_schema_result = SetSchemaResultProto(); + expected_set_schema_result.mutable_incompatible_schema_types()->Add("Email"); + expected_set_schema_result.mutable_index_incompatible_changed_schema_types() + ->Add("Email"); + expected_set_schema_result.mutable_status()->set_code(StatusProto::OK); + EXPECT_THAT(set_schema_result, EqualsProto(expected_set_schema_result)); + + // We should be able to retrieve the document by searching for 'tps' in + // 'subject'. + search_spec.set_query("subject:tps"); + actual_results = icing.Search(search_spec, GetDefaultScoringSpec(), + ResultSpecProto::default_instance()); + EXPECT_THAT(actual_results, EqualsSearchResultIgnoreStatsAndScores(result)); +} + +TEST_F( + IcingSearchEngineTest, + ForceSetSchemaPropertyDeletionAndAdditionTriggersIndexRestorationAndReturnsOk) { + IcingSearchEngine icing(GetDefaultIcingOptions(), GetTestJniCache()); + ASSERT_THAT(icing.Initialize().status(), ProtoIsOk()); + + // 'body' should have a property id of 0 and 'subject' should have a property + // id of 1. + SchemaProto email_with_body_schema = + SchemaBuilder() + .AddType(SchemaTypeConfigBuilder() + .SetType("Email") + .AddProperty( + PropertyConfigBuilder() + .SetName("subject") + .SetDataTypeString(MATCH_PREFIX, TOKENIZER_PLAIN) + .SetCardinality(CARDINALITY_OPTIONAL)) + .AddProperty( + PropertyConfigBuilder() + .SetName("body") + .SetDataTypeString(MATCH_PREFIX, TOKENIZER_PLAIN) + .SetCardinality(CARDINALITY_OPTIONAL))) + .Build(); + + SetSchemaResultProto set_schema_result = + icing.SetSchema(email_with_body_schema); + // Ignore latency numbers. They're covered elsewhere. + set_schema_result.clear_latency_ms(); + SetSchemaResultProto expected_set_schema_result; + expected_set_schema_result.mutable_new_schema_types()->Add("Email"); + expected_set_schema_result.mutable_status()->set_code(StatusProto::OK); + EXPECT_THAT(set_schema_result, EqualsProto(expected_set_schema_result)); + + // Create a document with only a subject property. + DocumentProto document = + DocumentBuilder() + .SetKey("namespace1", "uri1") + .SetSchema("Email") + .SetCreationTimestampMs(1000) + .AddStringProperty("subject", + "Did you get the memo about TPS reports?") + .Build(); + EXPECT_THAT(icing.Put(document).status(), ProtoIsOk()); + + // We should be able to retrieve the document by searching for 'tps' in + // 'subject'. + SearchSpecProto search_spec; + search_spec.set_query("subject:tps"); + search_spec.set_term_match_type(TermMatchType::EXACT_ONLY); + + SearchResultProto result; + result.mutable_status()->set_code(StatusProto::OK); + *result.mutable_results()->Add()->mutable_document() = document; + + SearchResultProto actual_results = + icing.Search(search_spec, GetDefaultScoringSpec(), + ResultSpecProto::default_instance()); + EXPECT_THAT(actual_results, EqualsSearchResultIgnoreStatsAndScores(result)); + + // Now update the schema to remove the 'body' field. This is backwards + // incompatible, but document should be preserved because it doesn't contain a + // 'body' field. If the index is correctly rebuilt, then 'subject' and 'to' + // will now have property ids of 0 and 1 respectively. If not, then the hits + // in the index will still have have a property id of 1 and therefore it won't + // be found. + SchemaProto email_no_body_schema = + SchemaBuilder() + .AddType(SchemaTypeConfigBuilder() + .SetType("Email") + .AddProperty( + PropertyConfigBuilder() + .SetName("subject") + .SetDataTypeString(MATCH_PREFIX, TOKENIZER_PLAIN) + .SetCardinality(CARDINALITY_OPTIONAL)) + .AddProperty( + PropertyConfigBuilder() + .SetName("to") + .SetDataTypeString(MATCH_PREFIX, TOKENIZER_PLAIN) + .SetCardinality(CARDINALITY_OPTIONAL))) + .Build(); + + set_schema_result = icing.SetSchema( + email_no_body_schema, /*ignore_errors_and_delete_documents=*/true); + // Ignore latency numbers. They're covered elsewhere. + set_schema_result.clear_latency_ms(); + expected_set_schema_result = SetSchemaResultProto(); + expected_set_schema_result.mutable_incompatible_schema_types()->Add("Email"); + expected_set_schema_result.mutable_index_incompatible_changed_schema_types() + ->Add("Email"); + expected_set_schema_result.mutable_status()->set_code(StatusProto::OK); + EXPECT_THAT(set_schema_result, EqualsProto(expected_set_schema_result)); + + // We should be able to retrieve the document by searching for 'tps' in + // 'subject'. + search_spec.set_query("subject:tps"); + actual_results = icing.Search(search_spec, GetDefaultScoringSpec(), + ResultSpecProto::default_instance()); + EXPECT_THAT(actual_results, EqualsSearchResultIgnoreStatsAndScores(result)); +} + +TEST_F(IcingSearchEngineTest, ForceSetSchemaIncompatibleNestedDocsAreDeleted) { + IcingSearchEngine icing(GetDefaultIcingOptions(), GetTestJniCache()); + ASSERT_THAT(icing.Initialize().status(), ProtoIsOk()); + + SchemaTypeConfigProto email_schema_type = + SchemaTypeConfigBuilder() + .SetType("Email") + .AddProperty( + PropertyConfigBuilder() + .SetName("sender") + .SetDataTypeDocument("Person", + /*index_nested_properties=*/true) + .SetCardinality(CARDINALITY_OPTIONAL)) + .AddProperty(PropertyConfigBuilder() + .SetName("subject") + .SetDataTypeString(MATCH_PREFIX, TOKENIZER_PLAIN) + .SetCardinality(CARDINALITY_OPTIONAL)) + .Build(); + SchemaProto nested_schema = + SchemaBuilder() + .AddType(SchemaTypeConfigBuilder() + .SetType("Person") + .AddProperty( + PropertyConfigBuilder() + .SetName("name") + .SetDataTypeString(MATCH_PREFIX, TOKENIZER_PLAIN) + .SetCardinality(CARDINALITY_OPTIONAL)) + .AddProperty( + PropertyConfigBuilder() + .SetName("company") + .SetDataTypeString(MATCH_PREFIX, TOKENIZER_PLAIN) + .SetCardinality(CARDINALITY_OPTIONAL))) + .AddType(email_schema_type) + .Build(); + + SetSchemaResultProto set_schema_result = icing.SetSchema(nested_schema); + // Ignore latency numbers. They're covered elsewhere. + set_schema_result.clear_latency_ms(); + SetSchemaResultProto expected_set_schema_result; + expected_set_schema_result.mutable_new_schema_types()->Add("Email"); + expected_set_schema_result.mutable_new_schema_types()->Add("Person"); + expected_set_schema_result.mutable_status()->set_code(StatusProto::OK); + EXPECT_THAT(set_schema_result, EqualsProto(expected_set_schema_result)); + + // Create two documents - a person document and an email document - both docs + // should be deleted when we remove the 'company' field from the person type. + DocumentProto person_document = + DocumentBuilder() + .SetKey("namespace1", "uri1") + .SetSchema("Person") + .SetCreationTimestampMs(1000) + .AddStringProperty("name", "Bill Lundbergh") + .AddStringProperty("company", "Initech Corp.") + .Build(); + EXPECT_THAT(icing.Put(person_document).status(), ProtoIsOk()); + + DocumentProto email_document = + DocumentBuilder() + .SetKey("namespace1", "uri2") + .SetSchema("Email") + .SetCreationTimestampMs(1000) + .AddStringProperty("subject", + "Did you get the memo about TPS reports?") + .AddDocumentProperty("sender", person_document) + .Build(); + EXPECT_THAT(icing.Put(email_document).status(), ProtoIsOk()); + + // We should be able to retrieve both documents. + GetResultProto get_result = + icing.Get("namespace1", "uri1", GetResultSpecProto::default_instance()); + EXPECT_THAT(get_result.status(), ProtoIsOk()); + EXPECT_THAT(get_result.document(), EqualsProto(person_document)); + + get_result = + icing.Get("namespace1", "uri2", GetResultSpecProto::default_instance()); + EXPECT_THAT(get_result.status(), ProtoIsOk()); + EXPECT_THAT(get_result.document(), EqualsProto(email_document)); + + // Now update the schema to remove the 'company' field. This is backwards + // incompatible, *both* documents should be deleted because both fail + // validation (they each contain a 'Person' that has a non-existent property). + nested_schema = + SchemaBuilder() + .AddType(SchemaTypeConfigBuilder().SetType("Person").AddProperty( + PropertyConfigBuilder() + .SetName("name") + .SetDataTypeString(MATCH_PREFIX, TOKENIZER_PLAIN) + .SetCardinality(CARDINALITY_OPTIONAL))) + .AddType(email_schema_type) + .Build(); + + set_schema_result = icing.SetSchema( + nested_schema, /*ignore_errors_and_delete_documents=*/true); + // Ignore latency numbers. They're covered elsewhere. + set_schema_result.clear_latency_ms(); + expected_set_schema_result = SetSchemaResultProto(); + expected_set_schema_result.mutable_incompatible_schema_types()->Add("Person"); + expected_set_schema_result.mutable_incompatible_schema_types()->Add("Email"); + expected_set_schema_result.mutable_index_incompatible_changed_schema_types() + ->Add("Email"); + expected_set_schema_result.mutable_index_incompatible_changed_schema_types() + ->Add("Person"); + expected_set_schema_result.mutable_status()->set_code(StatusProto::OK); + EXPECT_THAT(set_schema_result, EqualsProto(expected_set_schema_result)); + + // Both documents should be deleted now. + get_result = + icing.Get("namespace1", "uri1", GetResultSpecProto::default_instance()); + EXPECT_THAT(get_result.status(), ProtoStatusIs(StatusProto::NOT_FOUND)); + + get_result = + icing.Get("namespace1", "uri2", GetResultSpecProto::default_instance()); + EXPECT_THAT(get_result.status(), ProtoStatusIs(StatusProto::NOT_FOUND)); +} + TEST_F(IcingSearchEngineTest, SetSchemaRevalidatesDocumentsAndReturnsOk) { IcingSearchEngine icing(GetDefaultIcingOptions(), GetTestJniCache()); ASSERT_THAT(icing.Initialize().status(), ProtoIsOk()); @@ -1079,6 +1568,10 @@ TEST_F(IcingSearchEngineTest, SetSchemaRevalidatesDocumentsAndReturnsOk) { property->set_cardinality(PropertyConfigProto::Cardinality::REQUIRED); // Can't set the schema since it's incompatible + SetSchemaResultProto set_schema_result = + icing.SetSchema(schema_with_required_subject); + // Ignore latency numbers. They're covered elsewhere. + set_schema_result.clear_latency_ms(); SetSchemaResultProto expected_set_schema_result_proto; expected_set_schema_result_proto.mutable_status()->set_code( StatusProto::FAILED_PRECONDITION); @@ -1086,15 +1579,17 @@ TEST_F(IcingSearchEngineTest, SetSchemaRevalidatesDocumentsAndReturnsOk) { "Schema is incompatible."); expected_set_schema_result_proto.add_incompatible_schema_types("email"); - EXPECT_THAT(icing.SetSchema(schema_with_required_subject), - EqualsProto(expected_set_schema_result_proto)); + EXPECT_THAT(set_schema_result, EqualsProto(expected_set_schema_result_proto)); // Force set it + set_schema_result = + icing.SetSchema(schema_with_required_subject, + /*ignore_errors_and_delete_documents=*/true); + // Ignore latency numbers. They're covered elsewhere. + set_schema_result.clear_latency_ms(); expected_set_schema_result_proto.mutable_status()->set_code(StatusProto::OK); expected_set_schema_result_proto.mutable_status()->clear_message(); - EXPECT_THAT(icing.SetSchema(schema_with_required_subject, - /*ignore_errors_and_delete_documents=*/true), - EqualsProto(expected_set_schema_result_proto)); + EXPECT_THAT(set_schema_result, EqualsProto(expected_set_schema_result_proto)); GetResultProto expected_get_result_proto; expected_get_result_proto.mutable_status()->set_code(StatusProto::OK); @@ -1151,19 +1646,25 @@ TEST_F(IcingSearchEngineTest, SetSchemaDeletesDocumentsAndReturnsOk) { type->set_schema_type("email"); // Can't set the schema since it's incompatible + SetSchemaResultProto set_schema_result = icing.SetSchema(new_schema); + // Ignore latency numbers. They're covered elsewhere. + set_schema_result.clear_latency_ms(); SetSchemaResultProto expected_result; expected_result.mutable_status()->set_code(StatusProto::FAILED_PRECONDITION); expected_result.mutable_status()->set_message("Schema is incompatible."); expected_result.add_deleted_schema_types("message"); - EXPECT_THAT(icing.SetSchema(new_schema), EqualsProto(expected_result)); + EXPECT_THAT(set_schema_result, EqualsProto(expected_result)); // Force set it + set_schema_result = + icing.SetSchema(new_schema, + /*ignore_errors_and_delete_documents=*/true); + // Ignore latency numbers. They're covered elsewhere. + set_schema_result.clear_latency_ms(); expected_result.mutable_status()->set_code(StatusProto::OK); expected_result.mutable_status()->clear_message(); - EXPECT_THAT(icing.SetSchema(new_schema, - /*ignore_errors_and_delete_documents=*/true), - EqualsProto(expected_result)); + EXPECT_THAT(set_schema_result, EqualsProto(expected_result)); // "email" document is still there GetResultProto expected_get_result_proto; @@ -2074,7 +2575,8 @@ TEST_F(IcingSearchEngineTest, OptimizationShouldRemoveDeletedDocs) { // Deletes document1 ASSERT_THAT(icing.Delete("namespace", "uri1").status(), ProtoIsOk()); const std::string document_log_path = - icing_options.base_dir() + "/document_dir/document_log"; + icing_options.base_dir() + "/document_dir/" + + DocumentLogCreator::GetDocumentLogFilename(); int64_t document_log_size_before = filesystem()->GetFileSize(document_log_path.c_str()); ASSERT_THAT(icing.Optimize().status(), ProtoIsOk()); @@ -2765,11 +3267,16 @@ TEST_F(IcingSearchEngineTest, DeleteByQuery) { search_spec.set_term_match_type(TermMatchType::EXACT_ONLY); DeleteByQueryResultProto result_proto = icing.DeleteByQuery(search_spec); EXPECT_THAT(result_proto.status(), ProtoIsOk()); - DeleteStatsProto exp_stats; - exp_stats.set_delete_type(DeleteStatsProto::DeleteType::QUERY); + DeleteByQueryStatsProto exp_stats; exp_stats.set_latency_ms(7); exp_stats.set_num_documents_deleted(1); - EXPECT_THAT(result_proto.delete_stats(), EqualsProto(exp_stats)); + exp_stats.set_query_length(search_spec.query().length()); + exp_stats.set_num_terms(1); + exp_stats.set_num_namespaces_filtered(0); + exp_stats.set_num_schema_types_filtered(0); + exp_stats.set_parse_query_latency_ms(7); + exp_stats.set_document_removal_latency_ms(7); + EXPECT_THAT(result_proto.delete_by_query_stats(), EqualsProto(exp_stats)); expected_get_result_proto.mutable_status()->set_code(StatusProto::NOT_FOUND); expected_get_result_proto.mutable_status()->set_message( @@ -3438,8 +3945,8 @@ TEST_F(IcingSearchEngineTest, UnableToRecoverFromCorruptDocumentLog) { EqualsProto(expected_get_result_proto)); } // This should shut down IcingSearchEngine and persist anything it needs to - const std::string document_log_file = - absl_ports::StrCat(GetDocumentDir(), "/document_log"); + const std::string document_log_file = absl_ports::StrCat( + GetDocumentDir(), "/", DocumentLogCreator::GetDocumentLogFilename()); const std::string corrupt_data = "1234"; EXPECT_TRUE(filesystem()->Write(document_log_file.c_str(), corrupt_data.data(), corrupt_data.size())); @@ -5010,74 +5517,230 @@ TEST_F(IcingSearchEngineTest, SetSchemaCanDetectPreviousSchemaWasLost) { EqualsSearchResultIgnoreStatsAndScores(empty_result)); } -TEST_F(IcingSearchEngineTest, PersistToDisk) { - GetResultProto expected_get_result_proto; - expected_get_result_proto.mutable_status()->set_code(StatusProto::OK); - *expected_get_result_proto.mutable_document() = - CreateMessageDocument("namespace", "uri"); - +TEST_F(IcingSearchEngineTest, ImplicitPersistToDiskFullSavesEverything) { + DocumentProto document = CreateMessageDocument("namespace", "uri"); { IcingSearchEngine icing(GetDefaultIcingOptions(), GetTestJniCache()); EXPECT_THAT(icing.Initialize().status(), ProtoIsOk()); EXPECT_THAT(icing.SetSchema(CreateMessageSchema()).status(), ProtoIsOk()); - EXPECT_THAT(icing.Put(CreateMessageDocument("namespace", "uri")).status(), - ProtoIsOk()); + EXPECT_THAT(icing.Put(document).status(), ProtoIsOk()); + } // Destructing calls a PersistToDisk(FULL) - // Persisting shouldn't affect anything - EXPECT_THAT(icing.PersistToDisk(PersistType::FULL).status(), ProtoIsOk()); + IcingSearchEngine icing(GetDefaultIcingOptions(), GetTestJniCache()); - EXPECT_THAT( - icing.Get("namespace", "uri", GetResultSpecProto::default_instance()), - EqualsProto(expected_get_result_proto)); - } // Destructing persists as well + // There should be no recovery since everything should be saved properly. + InitializeResultProto init_result = icing.Initialize(); + EXPECT_THAT(init_result.status(), ProtoIsOk()); + EXPECT_THAT(init_result.initialize_stats().document_store_data_status(), + Eq(InitializeStatsProto::NO_DATA_LOSS)); + EXPECT_THAT(init_result.initialize_stats().document_store_recovery_cause(), + Eq(InitializeStatsProto::NONE)); + EXPECT_THAT(init_result.initialize_stats().schema_store_recovery_cause(), + Eq(InitializeStatsProto::NONE)); + EXPECT_THAT(init_result.initialize_stats().index_restoration_cause(), + Eq(InitializeStatsProto::NONE)); + + // Schema is still intact. + GetSchemaResultProto expected_get_schema_result_proto; + expected_get_schema_result_proto.mutable_status()->set_code(StatusProto::OK); + *expected_get_schema_result_proto.mutable_schema() = CreateMessageSchema(); + + EXPECT_THAT(icing.GetSchema(), EqualsProto(expected_get_schema_result_proto)); + + // Documents are still intact. + GetResultProto expected_get_result_proto; + expected_get_result_proto.mutable_status()->set_code(StatusProto::OK); + *expected_get_result_proto.mutable_document() = document; - IcingSearchEngine icing(GetDefaultIcingOptions(), GetTestJniCache()); - EXPECT_THAT(icing.Initialize().status(), ProtoIsOk()); EXPECT_THAT( icing.Get("namespace", "uri", GetResultSpecProto::default_instance()), EqualsProto(expected_get_result_proto)); + + // Index is still intact. + SearchSpecProto search_spec; + search_spec.set_term_match_type(TermMatchType::PREFIX); + search_spec.set_query("message"); // Content in the Message document. + + SearchResultProto expected_search_result_proto; + expected_search_result_proto.mutable_status()->set_code(StatusProto::OK); + *expected_search_result_proto.mutable_results()->Add()->mutable_document() = + document; + + SearchResultProto actual_results = + icing.Search(search_spec, GetDefaultScoringSpec(), + ResultSpecProto::default_instance()); + EXPECT_THAT(actual_results, EqualsSearchResultIgnoreStatsAndScores( + expected_search_result_proto)); +} + +TEST_F(IcingSearchEngineTest, ExplicitPersistToDiskFullSavesEverything) { + DocumentProto document = CreateMessageDocument("namespace", "uri"); + + // Add schema and documents to our first icing1 instance. + IcingSearchEngine icing1(GetDefaultIcingOptions(), GetTestJniCache()); + EXPECT_THAT(icing1.Initialize().status(), ProtoIsOk()); + EXPECT_THAT(icing1.SetSchema(CreateMessageSchema()).status(), ProtoIsOk()); + EXPECT_THAT(icing1.Put(document).status(), ProtoIsOk()); + EXPECT_THAT(icing1.PersistToDisk(PersistType::FULL).status(), ProtoIsOk()); + + // Initialize a second icing2 instance which should have it's own memory + // space. If data from icing1 isn't being persisted to the files, then icing2 + // won't be able to see those changes. + IcingSearchEngine icing2(GetDefaultIcingOptions(), GetTestJniCache()); + + // There should be no recovery since everything should be saved properly. + InitializeResultProto init_result = icing2.Initialize(); + EXPECT_THAT(init_result.status(), ProtoIsOk()); + EXPECT_THAT(init_result.initialize_stats().document_store_data_status(), + Eq(InitializeStatsProto::NO_DATA_LOSS)); + EXPECT_THAT(init_result.initialize_stats().document_store_recovery_cause(), + Eq(InitializeStatsProto::NONE)); + EXPECT_THAT(init_result.initialize_stats().schema_store_recovery_cause(), + Eq(InitializeStatsProto::NONE)); + EXPECT_THAT(init_result.initialize_stats().index_restoration_cause(), + Eq(InitializeStatsProto::NONE)); + + // Schema is still intact. + GetSchemaResultProto expected_get_schema_result_proto; + expected_get_schema_result_proto.mutable_status()->set_code(StatusProto::OK); + *expected_get_schema_result_proto.mutable_schema() = CreateMessageSchema(); + + EXPECT_THAT(icing2.GetSchema(), + EqualsProto(expected_get_schema_result_proto)); + + // Documents are still intact. + GetResultProto expected_get_result_proto; + expected_get_result_proto.mutable_status()->set_code(StatusProto::OK); + *expected_get_result_proto.mutable_document() = document; + + EXPECT_THAT( + icing2.Get("namespace", "uri", GetResultSpecProto::default_instance()), + EqualsProto(expected_get_result_proto)); + + // Index is still intact. + SearchSpecProto search_spec; + search_spec.set_term_match_type(TermMatchType::PREFIX); + search_spec.set_query("message"); // Content in the Message document. + + SearchResultProto expected_search_result_proto; + expected_search_result_proto.mutable_status()->set_code(StatusProto::OK); + *expected_search_result_proto.mutable_results()->Add()->mutable_document() = + document; + + SearchResultProto actual_results = + icing2.Search(search_spec, GetDefaultScoringSpec(), + ResultSpecProto::default_instance()); + EXPECT_THAT(actual_results, EqualsSearchResultIgnoreStatsAndScores( + expected_search_result_proto)); } -TEST_F(IcingSearchEngineTest, NoPersistToDiskLiteDoesntPersistPut) { +TEST_F(IcingSearchEngineTest, NoPersistToDiskLosesAllDocumentsAndIndex) { IcingSearchEngine icing1(GetDefaultIcingOptions(), GetTestJniCache()); EXPECT_THAT(icing1.Initialize().status(), ProtoIsOk()); EXPECT_THAT(icing1.SetSchema(CreateMessageSchema()).status(), ProtoIsOk()); - DocumentProto document1 = CreateMessageDocument("namespace", "uri"); - EXPECT_THAT(icing1.Put(document1).status(), ProtoIsOk()); + DocumentProto document = CreateMessageDocument("namespace", "uri"); + EXPECT_THAT(icing1.Put(document).status(), ProtoIsOk()); EXPECT_THAT( icing1.Get("namespace", "uri", GetResultSpecProto::default_instance()) .document(), - EqualsProto(document1)); + EqualsProto(document)); + + // It's intentional that no PersistToDisk call is made before initializing a + // second instance of icing. IcingSearchEngine icing2(GetDefaultIcingOptions(), GetTestJniCache()); - EXPECT_THAT(icing2.Initialize().status(), ProtoIsOk()); + InitializeResultProto init_result = icing2.Initialize(); + EXPECT_THAT(init_result.status(), ProtoIsOk()); + EXPECT_THAT(init_result.initialize_stats().document_store_data_status(), + Eq(InitializeStatsProto::PARTIAL_LOSS)); + EXPECT_THAT(init_result.initialize_stats().document_store_recovery_cause(), + Eq(InitializeStatsProto::DATA_LOSS)); + EXPECT_THAT(init_result.initialize_stats().schema_store_recovery_cause(), + Eq(InitializeStatsProto::NONE)); + EXPECT_THAT(init_result.initialize_stats().index_restoration_cause(), + Eq(InitializeStatsProto::NONE)); + // The document shouldn't be found because we forgot to call // PersistToDisk(LITE)! EXPECT_THAT( icing2.Get("namespace", "uri", GetResultSpecProto::default_instance()) .status(), ProtoStatusIs(StatusProto::NOT_FOUND)); + + // Searching also shouldn't get us anything because the index wasn't + // recovered. + SearchSpecProto search_spec; + search_spec.set_term_match_type(TermMatchType::PREFIX); + search_spec.set_query("message"); // Content in the Message document. + + SearchResultProto expected_search_result_proto; + expected_search_result_proto.mutable_status()->set_code(StatusProto::OK); + + SearchResultProto actual_results = + icing2.Search(search_spec, GetDefaultScoringSpec(), + ResultSpecProto::default_instance()); + EXPECT_THAT(actual_results, EqualsSearchResultIgnoreStatsAndScores( + expected_search_result_proto)); } -TEST_F(IcingSearchEngineTest, PersistToDiskLitePersistsPut) { +TEST_F(IcingSearchEngineTest, PersistToDiskLiteSavesGroundTruth) { + DocumentProto document = CreateMessageDocument("namespace", "uri"); + IcingSearchEngine icing1(GetDefaultIcingOptions(), GetTestJniCache()); EXPECT_THAT(icing1.Initialize().status(), ProtoIsOk()); EXPECT_THAT(icing1.SetSchema(CreateMessageSchema()).status(), ProtoIsOk()); - DocumentProto document1 = CreateMessageDocument("namespace", "uri"); - EXPECT_THAT(icing1.Put(document1).status(), ProtoIsOk()); + EXPECT_THAT(icing1.Put(document).status(), ProtoIsOk()); EXPECT_THAT(icing1.PersistToDisk(PersistType::LITE).status(), ProtoIsOk()); EXPECT_THAT( icing1.Get("namespace", "uri", GetResultSpecProto::default_instance()) .document(), - EqualsProto(document1)); + EqualsProto(document)); IcingSearchEngine icing2(GetDefaultIcingOptions(), GetTestJniCache()); - EXPECT_THAT(icing2.Initialize().status(), ProtoIsOk()); + InitializeResultProto init_result = icing2.Initialize(); + EXPECT_THAT(init_result.status(), ProtoIsOk()); + EXPECT_THAT(init_result.initialize_stats().document_store_data_status(), + Eq(InitializeStatsProto::NO_DATA_LOSS)); + EXPECT_THAT(init_result.initialize_stats().schema_store_recovery_cause(), + Eq(InitializeStatsProto::NONE)); + + // A checksum mismatch gets reported as an IO error. The document store and + // index didn't have their derived files included in the checksum previously, + // so reinitializing will trigger a checksum mismatch. + EXPECT_THAT(init_result.initialize_stats().document_store_recovery_cause(), + Eq(InitializeStatsProto::IO_ERROR)); + EXPECT_THAT(init_result.initialize_stats().index_restoration_cause(), + Eq(InitializeStatsProto::IO_ERROR)); + + // Schema is still intact. + GetSchemaResultProto expected_get_schema_result_proto; + expected_get_schema_result_proto.mutable_status()->set_code(StatusProto::OK); + *expected_get_schema_result_proto.mutable_schema() = CreateMessageSchema(); + + EXPECT_THAT(icing2.GetSchema(), + EqualsProto(expected_get_schema_result_proto)); + // The document should be found because we called PersistToDisk(LITE)! EXPECT_THAT( icing2.Get("namespace", "uri", GetResultSpecProto::default_instance()) .document(), - EqualsProto(document1)); + EqualsProto(document)); + + // Recovered index is still intact. + SearchSpecProto search_spec; + search_spec.set_term_match_type(TermMatchType::PREFIX); + search_spec.set_query("message"); // Content in the Message document. + + SearchResultProto expected_search_result_proto; + expected_search_result_proto.mutable_status()->set_code(StatusProto::OK); + *expected_search_result_proto.mutable_results()->Add()->mutable_document() = + document; + + SearchResultProto actual_results = + icing2.Search(search_spec, GetDefaultScoringSpec(), + ResultSpecProto::default_instance()); + EXPECT_THAT(actual_results, EqualsSearchResultIgnoreStatsAndScores( + expected_search_result_proto)); } TEST_F(IcingSearchEngineTest, ResetOk) { @@ -5616,15 +6279,16 @@ TEST_F(IcingSearchEngineTest, RestoreIndexLoseLiteIndex) { // 2. Delete the last document from the document log { - const std::string document_log_file = - absl_ports::StrCat(GetDocumentDir(), "/document_log"); + const std::string document_log_file = absl_ports::StrCat( + GetDocumentDir(), "/", DocumentLogCreator::GetDocumentLogFilename()); filesystem()->DeleteFile(document_log_file.c_str()); - ICING_ASSERT_OK_AND_ASSIGN(auto create_result, - FileBackedProtoLog<DocumentWrapper>::Create( - filesystem(), document_log_file.c_str(), - FileBackedProtoLog<DocumentWrapper>::Options( - /*compress_in=*/true))); - std::unique_ptr<FileBackedProtoLog<DocumentWrapper>> document_log = + ICING_ASSERT_OK_AND_ASSIGN( + auto create_result, + PortableFileBackedProtoLog<DocumentWrapper>::Create( + filesystem(), document_log_file.c_str(), + PortableFileBackedProtoLog<DocumentWrapper>::Options( + /*compress_in=*/true))); + std::unique_ptr<PortableFileBackedProtoLog<DocumentWrapper>> document_log = std::move(create_result.proto_log); document = DocumentBuilder(document).SetUri("fake_type/0").Build(); @@ -5689,15 +6353,16 @@ TEST_F(IcingSearchEngineTest, RestoreIndexLoseIndex) { // 2. Delete the last two documents from the document log. { - const std::string document_log_file = - absl_ports::StrCat(GetDocumentDir(), "/document_log"); + const std::string document_log_file = absl_ports::StrCat( + GetDocumentDir(), "/", DocumentLogCreator::GetDocumentLogFilename()); filesystem()->DeleteFile(document_log_file.c_str()); - ICING_ASSERT_OK_AND_ASSIGN(auto create_result, - FileBackedProtoLog<DocumentWrapper>::Create( - filesystem(), document_log_file.c_str(), - FileBackedProtoLog<DocumentWrapper>::Options( - /*compress_in=*/true))); - std::unique_ptr<FileBackedProtoLog<DocumentWrapper>> document_log = + ICING_ASSERT_OK_AND_ASSIGN( + auto create_result, + PortableFileBackedProtoLog<DocumentWrapper>::Create( + filesystem(), document_log_file.c_str(), + PortableFileBackedProtoLog<DocumentWrapper>::Options( + /*compress_in=*/true))); + std::unique_ptr<PortableFileBackedProtoLog<DocumentWrapper>> document_log = std::move(create_result.proto_log); document = DocumentBuilder(document).SetUri("fake_type/0").Build(); @@ -5994,8 +6659,8 @@ TEST_F(IcingSearchEngineTest, InitializeShouldLogRecoveryCausePartialDataLoss) { // Append a non-checksummed document. This will mess up the checksum of the // proto log, forcing it to rewind and later return a DATA_LOSS error. const std::string serialized_document = document.SerializeAsString(); - const std::string document_log_file = - absl_ports::StrCat(GetDocumentDir(), "/document_log"); + const std::string document_log_file = absl_ports::StrCat( + GetDocumentDir(), "/", DocumentLogCreator::GetDocumentLogFilename()); int64_t file_size = filesystem()->GetFileSize(document_log_file.c_str()); filesystem()->PWrite(document_log_file.c_str(), file_size, @@ -6045,31 +6710,47 @@ TEST_F(IcingSearchEngineTest, .SetSchema("Message") .AddStringProperty("body", "message body") .Build(); + + const std::string document_log_file = absl_ports::StrCat( + GetDocumentDir(), "/", DocumentLogCreator::GetDocumentLogFilename()); + int64_t corruptible_offset; + { // Initialize and put a document. IcingSearchEngine icing(GetDefaultIcingOptions(), GetTestJniCache()); + ASSERT_THAT(icing.Initialize().status(), ProtoIsOk()); + + // There's some space at the beginning of the file (e.g. header, kmagic, + // etc) that is necessary to initialize the FileBackedProtoLog. We can't + // corrupt that region, so we need to figure out the offset at which + // documents will be written to - which is the file size after + // initialization. + corruptible_offset = filesystem()->GetFileSize(document_log_file.c_str()); + ASSERT_THAT(icing.SetSchema(CreateMessageSchema()).status(), ProtoIsOk()); EXPECT_THAT(icing.Put(document1).status(), ProtoIsOk()); } { - // Modify the document log checksum to trigger a complete document log - // rewind. - const std::string document_log_file = - absl_ports::StrCat(GetDocumentDir(), "/document_log"); - - FileBackedProtoLog<DocumentWrapper>::Header document_log_header; - filesystem()->PRead(document_log_file.c_str(), &document_log_header, - sizeof(FileBackedProtoLog<DocumentWrapper>::Header), - /*offset=*/0); - // Set a garbage checksum. - document_log_header.log_checksum = 10; - document_log_header.header_checksum = - document_log_header.CalculateHeaderChecksum(); - filesystem()->PWrite(document_log_file.c_str(), /*offset=*/0, - &document_log_header, - sizeof(FileBackedProtoLog<DocumentWrapper>::Header)); + // "Corrupt" the content written in the log. Make the corrupt document + // smaller than our original one so we don't accidentally write past our + // file. + DocumentProto document = + DocumentBuilder().SetKey("invalid_namespace", "invalid_uri").Build(); + std::string serialized_document = document.SerializeAsString(); + ASSERT_TRUE(filesystem()->PWrite( + document_log_file.c_str(), corruptible_offset, + serialized_document.data(), serialized_document.size())); + + PortableFileBackedProtoLog<DocumentWrapper>::Header header = + ReadDocumentLogHeader(*filesystem(), document_log_file); + + // Set dirty bit to true to reflect that something changed in the log. + header.SetDirtyFlag(true); + header.SetHeaderChecksum(header.CalculateHeaderChecksum()); + + WriteDocumentLogHeader(*filesystem(), document_log_file, header); } { @@ -7182,6 +7863,171 @@ TEST_F(IcingSearchEngineTest, CJKSnippetTest) { EXPECT_THAT(match_proto.exact_match_utf16_length(), Eq(2)); } +#ifndef ICING_JNI_TEST +// We skip this test case when we're running in a jni_test since the data files +// will be stored in the android-instrumented storage location, rather than the +// normal cc_library runfiles directory. To get that storage location, it's +// recommended to use the TestStorage APIs which handles different API +// levels/absolute vs relative/etc differences. Since that's only accessible on +// the java-side, and I haven't figured out a way to pass that directory path to +// this native side yet, we're just going to disable this. The functionality is +// already well-tested across 4 different emulated OS's so we're not losing much +// test coverage here. +TEST_F(IcingSearchEngineTest, MigrateToPortableFileBackedProtoLog) { + // Copy the testdata files into our IcingSearchEngine directory + std::string dir_without_portable_log; + if (IsAndroidX86()) { + dir_without_portable_log = GetTestFilePath( + "icing/testdata/not_portable_log/" + "icing_search_engine_android_x86"); + } else if (IsAndroidArm()) { + dir_without_portable_log = GetTestFilePath( + "icing/testdata/not_portable_log/" + "icing_search_engine_android_arm"); + } else if (IsIosPlatform()) { + dir_without_portable_log = GetTestFilePath( + "icing/testdata/not_portable_log/" + "icing_search_engine_ios"); + } else { + dir_without_portable_log = GetTestFilePath( + "icing/testdata/not_portable_log/" + "icing_search_engine_linux"); + } + + // Create dst directory that we'll initialize the IcingSearchEngine over. + std::string base_dir = GetTestBaseDir() + "_migrate"; + ASSERT_THAT(filesystem()->DeleteDirectoryRecursively(base_dir.c_str()), true); + ASSERT_THAT(filesystem()->CreateDirectoryRecursively(base_dir.c_str()), true); + + ASSERT_TRUE(filesystem()->CopyDirectory(dir_without_portable_log.c_str(), + base_dir.c_str(), + /*recursive=*/true)); + + IcingSearchEngineOptions icing_options; + icing_options.set_base_dir(base_dir); + + IcingSearchEngine icing(icing_options, GetTestJniCache()); + InitializeResultProto init_result = icing.Initialize(); + EXPECT_THAT(init_result.status(), ProtoIsOk()); + EXPECT_THAT(init_result.initialize_stats().document_store_data_status(), + Eq(InitializeStatsProto::NO_DATA_LOSS)); + EXPECT_THAT(init_result.initialize_stats().document_store_recovery_cause(), + Eq(InitializeStatsProto::NONE)); + EXPECT_THAT(init_result.initialize_stats().schema_store_recovery_cause(), + Eq(InitializeStatsProto::NONE)); + EXPECT_THAT(init_result.initialize_stats().index_restoration_cause(), + Eq(InitializeStatsProto::NONE)); + + // Set up schema, this is the one used to validate documents in the testdata + // files. Do not change unless you're also updating the testdata files. + SchemaProto schema = + SchemaBuilder() + .AddType(SchemaTypeConfigBuilder() + .SetType("email") + .AddProperty( + PropertyConfigBuilder() + .SetName("subject") + .SetDataTypeString(MATCH_EXACT, TOKENIZER_PLAIN) + .SetCardinality(CARDINALITY_OPTIONAL)) + .AddProperty( + PropertyConfigBuilder() + .SetName("body") + .SetDataTypeString(MATCH_EXACT, TOKENIZER_PLAIN) + .SetCardinality(CARDINALITY_OPTIONAL))) + .Build(); + + // Make sure our schema is still the same as we expect. If not, there's + // definitely no way we're getting the documents back that we expect. + GetSchemaResultProto expected_get_schema_result_proto; + expected_get_schema_result_proto.mutable_status()->set_code(StatusProto::OK); + *expected_get_schema_result_proto.mutable_schema() = schema; + ASSERT_THAT(icing.GetSchema(), EqualsProto(expected_get_schema_result_proto)); + + // These are the documents that are stored in the testdata files. Do not + // change unless you're also updating the testdata files. + DocumentProto document1 = DocumentBuilder() + .SetKey("namespace1", "uri1") + .SetSchema("email") + .SetCreationTimestampMs(10) + .AddStringProperty("subject", "foo") + .AddStringProperty("body", "bar") + .Build(); + + DocumentProto document2 = DocumentBuilder() + .SetKey("namespace1", "uri2") + .SetSchema("email") + .SetCreationTimestampMs(20) + .SetScore(321) + .AddStringProperty("body", "baz bat") + .Build(); + + DocumentProto document3 = DocumentBuilder() + .SetKey("namespace2", "uri1") + .SetSchema("email") + .SetCreationTimestampMs(30) + .SetScore(123) + .AddStringProperty("subject", "phoo") + .Build(); + + // Document 1 and 3 were put normally, and document 2 was deleted in our + // testdata files. + EXPECT_THAT(icing + .Get(document1.namespace_(), document1.uri(), + GetResultSpecProto::default_instance()) + .document(), + EqualsProto(document1)); + EXPECT_THAT(icing + .Get(document2.namespace_(), document2.uri(), + GetResultSpecProto::default_instance()) + .status(), + ProtoStatusIs(StatusProto::NOT_FOUND)); + EXPECT_THAT(icing + .Get(document3.namespace_(), document3.uri(), + GetResultSpecProto::default_instance()) + .document(), + EqualsProto(document3)); + + // Searching for "foo" should get us document1. + SearchSpecProto search_spec; + search_spec.set_term_match_type(TermMatchType::PREFIX); + search_spec.set_query("foo"); + + SearchResultProto expected_document1; + expected_document1.mutable_status()->set_code(StatusProto::OK); + *expected_document1.mutable_results()->Add()->mutable_document() = document1; + + SearchResultProto actual_results = + icing.Search(search_spec, GetDefaultScoringSpec(), + ResultSpecProto::default_instance()); + EXPECT_THAT(actual_results, + EqualsSearchResultIgnoreStatsAndScores(expected_document1)); + + // Searching for "baz" would've gotten us document2, except it got deleted. + // Make sure that it's cleared from our index too. + search_spec.set_query("baz"); + + SearchResultProto expected_no_documents; + expected_no_documents.mutable_status()->set_code(StatusProto::OK); + + actual_results = icing.Search(search_spec, GetDefaultScoringSpec(), + ResultSpecProto::default_instance()); + EXPECT_THAT(actual_results, + EqualsSearchResultIgnoreStatsAndScores(expected_no_documents)); + + // Searching for "phoo" should get us document3. + search_spec.set_query("phoo"); + + SearchResultProto expected_document3; + expected_document3.mutable_status()->set_code(StatusProto::OK); + *expected_document3.mutable_results()->Add()->mutable_document() = document3; + + actual_results = icing.Search(search_spec, GetDefaultScoringSpec(), + ResultSpecProto::default_instance()); + EXPECT_THAT(actual_results, + EqualsSearchResultIgnoreStatsAndScores(expected_document3)); +} +#endif // !ICING_JNI_TEST + } // namespace } // namespace lib } // namespace icing diff --git a/icing/portable/platform.h b/icing/portable/platform.h index 8712835..150eede 100644 --- a/icing/portable/platform.h +++ b/icing/portable/platform.h @@ -34,11 +34,19 @@ inline bool IsReverseJniTokenization() { return false; } -// Whether the running test is an Android test. -inline bool IsAndroidPlatform() { -#if defined(__ANDROID__) +// Whether we're running on android_x86 +inline bool IsAndroidX86() { +#if defined(__ANDROID__) && defined(__i386__) return true; -#endif // defined(__ANDROID__) +#endif // defined(__ANDROID__) && defined(__i386__) + return false; +} + +// Whether we're running on android_armeabi-v7a +inline bool IsAndroidArm() { +#if defined(__ANDROID__) && defined(__arm__) + return true; +#endif // defined(__ANDROID__) && defined(__arm__) return false; } diff --git a/icing/schema/schema-store.cc b/icing/schema/schema-store.cc index 7040a31..3307638 100644 --- a/icing/schema/schema-store.cc +++ b/icing/schema/schema-store.cc @@ -322,12 +322,18 @@ SchemaStore::SetSchema(const SchemaProto& new_schema, libtextclassifier3::StatusOr<const SchemaStore::SetSchemaResult> SchemaStore::SetSchema(SchemaProto&& new_schema, bool ignore_errors_and_delete_documents) { + ICING_ASSIGN_OR_RETURN(SchemaUtil::DependencyMap new_dependency_map, + SchemaUtil::Validate(new_schema)); + SetSchemaResult result; auto schema_proto_or = GetSchema(); if (absl_ports::IsNotFound(schema_proto_or.status())) { // We don't have a pre-existing schema, so anything is valid. result.success = true; + for (const SchemaTypeConfigProto& type_config : new_schema.types()) { + result.schema_types_new_by_name.insert(type_config.schema_type()); + } } else if (!schema_proto_or.ok()) { // Real error return schema_proto_or.status(); @@ -345,10 +351,14 @@ SchemaStore::SetSchema(SchemaProto&& new_schema, // Different schema, track the differences and see if we can still write it SchemaUtil::SchemaDelta schema_delta = - SchemaUtil::ComputeCompatibilityDelta(old_schema, new_schema); - - // An incompatible index is fine, we can just reindex - result.index_incompatible = schema_delta.index_incompatible; + SchemaUtil::ComputeCompatibilityDelta(old_schema, new_schema, + new_dependency_map); + + result.schema_types_new_by_name = std::move(schema_delta.schema_types_new); + result.schema_types_changed_fully_compatible_by_name = + std::move(schema_delta.schema_types_changed_fully_compatible); + result.schema_types_index_incompatible_by_name = + std::move(schema_delta.schema_types_index_incompatible); for (const auto& schema_type : schema_delta.schema_types_deleted) { // We currently don't support deletions, so mark this as not possible. diff --git a/icing/schema/schema-store.h b/icing/schema/schema-store.h index dd1edb8..b9be6c0 100644 --- a/icing/schema/schema-store.h +++ b/icing/schema/schema-store.h @@ -68,9 +68,6 @@ class SchemaStore { // to file. bool success = false; - // Whether the new schema changes invalidate the index. - bool index_incompatible = false; - // SchemaTypeIds of schema types can be reassigned new SchemaTypeIds if: // 1. Schema types are added in the middle of the SchemaProto // 2. Schema types are removed from the middle of the SchemaProto @@ -100,6 +97,21 @@ class SchemaStore { // SchemaUtil::ComputeCompatibilityDelta. Represented by the SchemaTypeId // assigned to this SchemaTypeConfigProto in the *old* schema. std::unordered_set<SchemaTypeId> schema_types_incompatible_by_id; + + // Schema types that were added in the new schema. Represented by the + // `schema_type` field in the SchemaTypeConfigProto. + std::unordered_set<std::string> schema_types_new_by_name; + + // Schema types that were changed in a way that was backwards compatible and + // didn't invalidate the index. Represented by the `schema_type` field in + // the SchemaTypeConfigProto. + std::unordered_set<std::string> + schema_types_changed_fully_compatible_by_name; + + // Schema types that were changed in a way that was backwards compatible, + // but invalidated the index. Represented by the `schema_type` field in the + // SchemaTypeConfigProto. + std::unordered_set<std::string> schema_types_index_incompatible_by_name; }; // Factory function to create a SchemaStore which does not take ownership diff --git a/icing/schema/schema-store_test.cc b/icing/schema/schema-store_test.cc index 69663b5..be7170f 100644 --- a/icing/schema/schema-store_test.cc +++ b/icing/schema/schema-store_test.cc @@ -49,6 +49,8 @@ using ::testing::Pointee; constexpr PropertyConfigProto_Cardinality_Code CARDINALITY_OPTIONAL = PropertyConfigProto_Cardinality_Code_OPTIONAL; +constexpr PropertyConfigProto_Cardinality_Code CARDINALITY_REPEATED = + PropertyConfigProto_Cardinality_Code_REPEATED; constexpr StringIndexingConfig_TokenizerType_Code TOKENIZER_PLAIN = StringIndexingConfig_TokenizerType_Code_PLAIN; @@ -101,6 +103,7 @@ TEST_F(SchemaStoreTest, CorruptSchemaError) { // Set it for the first time SchemaStore::SetSchemaResult result; result.success = true; + result.schema_types_new_by_name.insert(schema_.types(0).schema_type()); EXPECT_THAT(schema_store->SetSchema(schema_), IsOkAndHolds(EqualsSetSchemaResult(result))); ICING_ASSERT_OK_AND_ASSIGN(const SchemaProto* actual_schema, @@ -136,6 +139,7 @@ TEST_F(SchemaStoreTest, RecoverCorruptDerivedFileOk) { // Set it for the first time SchemaStore::SetSchemaResult result; result.success = true; + result.schema_types_new_by_name.insert(schema_.types(0).schema_type()); EXPECT_THAT(schema_store->SetSchema(schema_), IsOkAndHolds(EqualsSetSchemaResult(result))); ICING_ASSERT_OK_AND_ASSIGN(const SchemaProto* actual_schema, @@ -173,6 +177,7 @@ TEST_F(SchemaStoreTest, RecoverBadChecksumOk) { // Set it for the first time SchemaStore::SetSchemaResult result; result.success = true; + result.schema_types_new_by_name.insert(schema_.types(0).schema_type()); EXPECT_THAT(schema_store->SetSchema(schema_), IsOkAndHolds(EqualsSetSchemaResult(result))); ICING_ASSERT_OK_AND_ASSIGN(const SchemaProto* actual_schema, @@ -244,6 +249,7 @@ TEST_F(SchemaStoreTest, CreateWithPreviousSchemaOk) { SchemaStore::SetSchemaResult result; result.success = true; + result.schema_types_new_by_name.insert(schema_.types(0).schema_type()); EXPECT_THAT(schema_store->SetSchema(schema_), IsOkAndHolds(EqualsSetSchemaResult(result))); @@ -265,6 +271,7 @@ TEST_F(SchemaStoreTest, MultipleCreateOk) { SchemaStore::SetSchemaResult result; result.success = true; + result.schema_types_new_by_name.insert(schema_.types(0).schema_type()); EXPECT_THAT(schema_store->SetSchema(schema_), IsOkAndHolds(EqualsSetSchemaResult(result))); @@ -301,6 +308,7 @@ TEST_F(SchemaStoreTest, SetNewSchemaOk) { // Set it for the first time SchemaStore::SetSchemaResult result; result.success = true; + result.schema_types_new_by_name.insert(schema_.types(0).schema_type()); EXPECT_THAT(schema_store->SetSchema(schema_), IsOkAndHolds(EqualsSetSchemaResult(result))); ICING_ASSERT_OK_AND_ASSIGN(const SchemaProto* actual_schema, @@ -316,6 +324,7 @@ TEST_F(SchemaStoreTest, SetSameSchemaOk) { // Set it for the first time SchemaStore::SetSchemaResult result; result.success = true; + result.schema_types_new_by_name.insert(schema_.types(0).schema_type()); EXPECT_THAT(schema_store->SetSchema(schema_), IsOkAndHolds(EqualsSetSchemaResult(result))); ICING_ASSERT_OK_AND_ASSIGN(const SchemaProto* actual_schema, @@ -323,6 +332,8 @@ TEST_F(SchemaStoreTest, SetSameSchemaOk) { EXPECT_THAT(*actual_schema, EqualsProto(schema_)); // And one more for fun + result = SchemaStore::SetSchemaResult(); + result.success = true; EXPECT_THAT(schema_store->SetSchema(schema_), IsOkAndHolds(EqualsSetSchemaResult(result))); ICING_ASSERT_OK_AND_ASSIGN(actual_schema, schema_store->GetSchema()); @@ -337,6 +348,7 @@ TEST_F(SchemaStoreTest, SetIncompatibleSchemaOk) { // Set it for the first time SchemaStore::SetSchemaResult result; result.success = true; + result.schema_types_new_by_name.insert(schema_.types(0).schema_type()); EXPECT_THAT(schema_store->SetSchema(schema_), IsOkAndHolds(EqualsSetSchemaResult(result))); ICING_ASSERT_OK_AND_ASSIGN(const SchemaProto* actual_schema, @@ -347,6 +359,7 @@ TEST_F(SchemaStoreTest, SetIncompatibleSchemaOk) { schema_.clear_types(); // Set the incompatible schema + result = SchemaStore::SetSchemaResult(); result.success = false; result.schema_types_deleted_by_name.emplace("email"); result.schema_types_deleted_by_id.emplace(0); @@ -366,6 +379,7 @@ TEST_F(SchemaStoreTest, SetSchemaWithAddedTypeOk) { // Set it for the first time SchemaStore::SetSchemaResult result; result.success = true; + result.schema_types_new_by_name.insert("email"); EXPECT_THAT(schema_store->SetSchema(schema), IsOkAndHolds(EqualsSetSchemaResult(result))); ICING_ASSERT_OK_AND_ASSIGN(const SchemaProto* actual_schema, @@ -378,6 +392,9 @@ TEST_F(SchemaStoreTest, SetSchemaWithAddedTypeOk) { .Build(); // Set the compatible schema + result = SchemaStore::SetSchemaResult(); + result.success = true; + result.schema_types_new_by_name.insert("new_type"); EXPECT_THAT(schema_store->SetSchema(schema), IsOkAndHolds(EqualsSetSchemaResult(result))); ICING_ASSERT_OK_AND_ASSIGN(actual_schema, schema_store->GetSchema()); @@ -398,6 +415,8 @@ TEST_F(SchemaStoreTest, SetSchemaWithDeletedTypeOk) { // Set it for the first time SchemaStore::SetSchemaResult result; result.success = true; + result.schema_types_new_by_name.insert("email"); + result.schema_types_new_by_name.insert("message"); EXPECT_THAT(schema_store->SetSchema(schema), IsOkAndHolds(EqualsSetSchemaResult(result))); ICING_ASSERT_OK_AND_ASSIGN(const SchemaProto* actual_schema, @@ -454,6 +473,8 @@ TEST_F(SchemaStoreTest, SetSchemaWithReorderedTypesOk) { // Set it for the first time SchemaStore::SetSchemaResult result; result.success = true; + result.schema_types_new_by_name.insert("email"); + result.schema_types_new_by_name.insert("message"); EXPECT_THAT(schema_store->SetSchema(schema), IsOkAndHolds(EqualsSetSchemaResult(result))); ICING_ASSERT_OK_AND_ASSIGN(const SchemaProto* actual_schema, @@ -468,6 +489,8 @@ TEST_F(SchemaStoreTest, SetSchemaWithReorderedTypesOk) { // Since we assign SchemaTypeIds based on order in the SchemaProto, this will // cause SchemaTypeIds to change + result = SchemaStore::SetSchemaResult(); + result.success = true; result.old_schema_type_ids_changed.emplace(0); // Old SchemaTypeId of "email" result.old_schema_type_ids_changed.emplace( 1); // Old SchemaTypeId of "message" @@ -479,7 +502,7 @@ TEST_F(SchemaStoreTest, SetSchemaWithReorderedTypesOk) { EXPECT_THAT(*actual_schema, EqualsProto(schema)); } -TEST_F(SchemaStoreTest, SetSchemaThatRequiresReindexingOk) { +TEST_F(SchemaStoreTest, IndexedPropertyChangeRequiresReindexingOk) { ICING_ASSERT_OK_AND_ASSIGN( std::unique_ptr<SchemaStore> schema_store, SchemaStore::Create(&filesystem_, test_dir_, &fake_clock_)); @@ -497,6 +520,7 @@ TEST_F(SchemaStoreTest, SetSchemaThatRequiresReindexingOk) { // Set it for the first time SchemaStore::SetSchemaResult result; result.success = true; + result.schema_types_new_by_name.insert("email"); EXPECT_THAT(schema_store->SetSchema(schema), IsOkAndHolds(EqualsSetSchemaResult(result))); ICING_ASSERT_OK_AND_ASSIGN(const SchemaProto* actual_schema, @@ -512,16 +536,85 @@ TEST_F(SchemaStoreTest, SetSchemaThatRequiresReindexingOk) { .SetCardinality(CARDINALITY_OPTIONAL))) .Build(); - // With a new indexed property, we'll need to reindex - result.index_incompatible = true; - // Set the compatible schema + result = SchemaStore::SetSchemaResult(); + result.success = true; + result.schema_types_index_incompatible_by_name.insert("email"); EXPECT_THAT(schema_store->SetSchema(schema), IsOkAndHolds(EqualsSetSchemaResult(result))); ICING_ASSERT_OK_AND_ASSIGN(actual_schema, schema_store->GetSchema()); EXPECT_THAT(*actual_schema, EqualsProto(schema)); } +TEST_F(SchemaStoreTest, IndexNestedDocumentsChangeRequiresReindexingOk) { + ICING_ASSERT_OK_AND_ASSIGN( + std::unique_ptr<SchemaStore> schema_store, + SchemaStore::Create(&filesystem_, test_dir_, &fake_clock_)); + + // Make two schemas. One that sets index_nested_properties to false and one + // that sets it to true. + SchemaTypeConfigProto email_type_config = + SchemaTypeConfigBuilder() + .SetType("email") + .AddProperty(PropertyConfigBuilder() + .SetName("subject") + .SetDataTypeString(MATCH_EXACT, TOKENIZER_PLAIN) + .SetCardinality(CARDINALITY_OPTIONAL)) + .Build(); + SchemaProto no_nested_index_schema = + SchemaBuilder() + .AddType(email_type_config) + .AddType(SchemaTypeConfigBuilder().SetType("person").AddProperty( + PropertyConfigBuilder() + .SetName("emails") + .SetDataTypeDocument("email", + /*index_nested_properties=*/false) + .SetCardinality(CARDINALITY_REPEATED))) + .Build(); + + SchemaProto nested_index_schema = + SchemaBuilder() + .AddType(email_type_config) + .AddType(SchemaTypeConfigBuilder().SetType("person").AddProperty( + PropertyConfigBuilder() + .SetName("emails") + .SetDataTypeDocument("email", + /*index_nested_properties=*/true) + .SetCardinality(CARDINALITY_REPEATED))) + .Build(); + + // Set schema with index_nested_properties=false to start. + SchemaStore::SetSchemaResult result; + result.success = true; + result.schema_types_new_by_name.insert("email"); + result.schema_types_new_by_name.insert("person"); + EXPECT_THAT(schema_store->SetSchema(no_nested_index_schema), + IsOkAndHolds(EqualsSetSchemaResult(result))); + ICING_ASSERT_OK_AND_ASSIGN(const SchemaProto* actual_schema, + schema_store->GetSchema()); + EXPECT_THAT(*actual_schema, EqualsProto(no_nested_index_schema)); + + // Set schema with index_nested_properties=true and confirm that the change to + // 'person' is index incompatible. + result = SchemaStore::SetSchemaResult(); + result.success = true; + result.schema_types_index_incompatible_by_name.insert("person"); + EXPECT_THAT(schema_store->SetSchema(nested_index_schema), + IsOkAndHolds(EqualsSetSchemaResult(result))); + ICING_ASSERT_OK_AND_ASSIGN(actual_schema, schema_store->GetSchema()); + EXPECT_THAT(*actual_schema, EqualsProto(nested_index_schema)); + + // Set schema with index_nested_properties=false and confirm that the change + // to 'person' is index incompatible. + result = SchemaStore::SetSchemaResult(); + result.success = true; + result.schema_types_index_incompatible_by_name.insert("person"); + EXPECT_THAT(schema_store->SetSchema(no_nested_index_schema), + IsOkAndHolds(EqualsSetSchemaResult(result))); + ICING_ASSERT_OK_AND_ASSIGN(actual_schema, schema_store->GetSchema()); + EXPECT_THAT(*actual_schema, EqualsProto(no_nested_index_schema)); +} + TEST_F(SchemaStoreTest, SetSchemaWithIncompatibleTypesOk) { ICING_ASSERT_OK_AND_ASSIGN( std::unique_ptr<SchemaStore> schema_store, @@ -540,6 +633,7 @@ TEST_F(SchemaStoreTest, SetSchemaWithIncompatibleTypesOk) { // Set it for the first time SchemaStore::SetSchemaResult result; result.success = true; + result.schema_types_new_by_name.insert("email"); EXPECT_THAT(schema_store->SetSchema(schema), IsOkAndHolds(EqualsSetSchemaResult(result))); ICING_ASSERT_OK_AND_ASSIGN(const SchemaProto* actual_schema, @@ -602,6 +696,8 @@ TEST_F(SchemaStoreTest, GetSchemaTypeId) { // Set it for the first time SchemaStore::SetSchemaResult result; result.success = true; + result.schema_types_new_by_name.insert(first_type); + result.schema_types_new_by_name.insert(second_type); EXPECT_THAT(schema_store->SetSchema(schema_), IsOkAndHolds(EqualsSetSchemaResult(result))); @@ -760,6 +856,8 @@ TEST_F(SchemaStoreTest, SchemaStoreStorageInfoProto) { SchemaStore::SetSchemaResult result; result.success = true; + result.schema_types_new_by_name.insert("email"); + result.schema_types_new_by_name.insert("fullSectionsType"); EXPECT_THAT(schema_store->SetSchema(schema), IsOkAndHolds(EqualsSetSchemaResult(result))); diff --git a/icing/schema/schema-util.cc b/icing/schema/schema-util.cc index 49e7096..22bc3f6 100644 --- a/icing/schema/schema-util.cc +++ b/icing/schema/schema-util.cc @@ -37,6 +37,20 @@ namespace lib { namespace { +bool ArePropertiesEqual(const PropertyConfigProto& old_property, + const PropertyConfigProto& new_property) { + return old_property.property_name() == new_property.property_name() && + old_property.data_type() == new_property.data_type() && + old_property.schema_type() == new_property.schema_type() && + old_property.cardinality() == new_property.cardinality() && + old_property.string_indexing_config().term_match_type() == + new_property.string_indexing_config().term_match_type() && + old_property.string_indexing_config().tokenizer_type() == + new_property.string_indexing_config().tokenizer_type() && + old_property.document_indexing_config().index_nested_properties() == + new_property.document_indexing_config().index_nested_properties(); +} + bool IsCardinalityCompatible(const PropertyConfigProto& old_property, const PropertyConfigProto& new_property) { if (old_property.cardinality() < new_property.cardinality()) { @@ -95,43 +109,175 @@ bool IsTermMatchTypeCompatible(const StringIndexingConfig& old_indexed, } // namespace -libtextclassifier3::Status SchemaUtil::Validate(const SchemaProto& schema) { - // Tracks SchemaTypeConfigs that we've validated already. - std::unordered_set<std::string_view> known_schema_types; +libtextclassifier3::Status ExpandTranstiveDependencies( + const SchemaUtil::DependencyMap& child_to_direct_parent_map, + std::string_view type, + SchemaUtil::DependencyMap* expanded_child_to_parent_map, + std::unordered_set<std::string_view>* pending_expansions, + std::unordered_set<std::string_view>* orphaned_types) { + auto expanded_itr = expanded_child_to_parent_map->find(type); + if (expanded_itr != expanded_child_to_parent_map->end()) { + // We've already expanded this type. Just return. + return libtextclassifier3::Status::OK; + } + auto itr = child_to_direct_parent_map.find(type); + if (itr == child_to_direct_parent_map.end()) { + // It's an orphan. Just return. + orphaned_types->insert(type); + return libtextclassifier3::Status::OK; + } + pending_expansions->insert(type); + std::unordered_set<std::string_view> expanded_dependencies; + + // Add all of the direct parent dependencies. + expanded_dependencies.reserve(itr->second.size()); + expanded_dependencies.insert(itr->second.begin(), itr->second.end()); + + // Iterate through each direct parent and add their indirect parents. + for (std::string_view dep : itr->second) { + // 1. Check if we're in the middle of expanding this type - IOW there's a + // cycle! + if (pending_expansions->count(dep) > 0) { + return absl_ports::InvalidArgumentError( + absl_ports::StrCat("Infinite loop detected in type configs. '", type, + "' references itself.")); + } - // Tracks SchemaTypeConfigs that have been mentioned (by other - // SchemaTypeConfigs), but we haven't validated yet. - std::unordered_set<std::string_view> unknown_schema_types; + // 2. Expand this type as needed. + ICING_RETURN_IF_ERROR(ExpandTranstiveDependencies( + child_to_direct_parent_map, dep, expanded_child_to_parent_map, + pending_expansions, orphaned_types)); + if (orphaned_types->count(dep) > 0) { + // Dep is an orphan. Just skip to the next dep. + continue; + } - // Tracks PropertyConfigs within a SchemaTypeConfig that we've validated - // already. - std::unordered_set<std::string_view> known_property_names; + // 3. Dep has been fully expanded. Add all of its dependencies to this + // type's dependencies. + auto dep_expanded_itr = expanded_child_to_parent_map->find(dep); + expanded_dependencies.reserve(expanded_dependencies.size() + + dep_expanded_itr->second.size()); + expanded_dependencies.insert(dep_expanded_itr->second.begin(), + dep_expanded_itr->second.end()); + } + expanded_child_to_parent_map->insert( + {type, std::move(expanded_dependencies)}); + pending_expansions->erase(type); + return libtextclassifier3::Status::OK; +} - // Tracks which schemas reference other schemas. This is used to detect - // infinite loops between indexed schema references (e.g. A -> B -> C -> A). - // We could get into an infinite loop while trying to assign section ids. - // - // The key is the "child" schema that is being referenced within another - // schema. - // The value is a set of all the direct/indirect "parent" schemas that - // reference the "child" schema. - // - // For example, if A has a nested document property of type B, then A is the - // "parent" and B is the "child" and so schema_references will contain - // schema_references[B] == {A}. - std::unordered_map<std::string_view, std::unordered_set<std::string_view>> - schema_references; +// Expands the dependencies represented by the child_to_direct_parent_map to +// also include indirect parents. +// +// Ex. Suppose we have a schema with four types A, B, C, D. A has a property of +// type B and B has a property of type C. C and D only have non-document +// properties. +// +// The child to direct parent dependency map for this schema would be: +// C -> B +// B -> A +// +// This function would expand it so that A is also present as an indirect parent +// of C. +libtextclassifier3::StatusOr<SchemaUtil::DependencyMap> +ExpandTranstiveDependencies( + const SchemaUtil::DependencyMap& child_to_direct_parent_map) { + SchemaUtil::DependencyMap expanded_child_to_parent_map; + + // Types that we are expanding. + std::unordered_set<std::string_view> pending_expansions; + + // Types that have no parents that depend on them. + std::unordered_set<std::string_view> orphaned_types; + for (const auto& kvp : child_to_direct_parent_map) { + ICING_RETURN_IF_ERROR(ExpandTranstiveDependencies( + child_to_direct_parent_map, kvp.first, &expanded_child_to_parent_map, + &pending_expansions, &orphaned_types)); + } + return expanded_child_to_parent_map; +} +// Builds a transitive child-parent dependency map. 'Orphaned' types (types with +// no parents) will not be present in the map. +// +// Ex. Suppose we have a schema with four types A, B, C, D. A has a property of +// type B and B has a property of type C. C and D only have non-document +// properties. +// +// The transitive child-parent dependency map for this schema would be: +// C -> A, B +// B -> A +// +// A and D would be considered orphaned properties because no type refers to +// them. +// +// RETURNS: +// On success, a transitive child-parent dependency map of all types in the +// schema. +// INVALID_ARGUMENT if the schema contains a cycle or an undefined type. +// ALREADY_EXISTS if a schema type is specified more than once in the schema +libtextclassifier3::StatusOr<SchemaUtil::DependencyMap> +BuildTransitiveDependencyGraph(const SchemaProto& schema) { + // Child to parent map. + SchemaUtil::DependencyMap child_to_direct_parent_map; + + // Add all first-order dependencies. + std::unordered_set<std::string_view> known_types; + std::unordered_set<std::string_view> unknown_types; for (const auto& type_config : schema.types()) { std::string_view schema_type(type_config.schema_type()); - ICING_RETURN_IF_ERROR(ValidateSchemaType(schema_type)); - - // We can't have duplicate schema_types - if (!known_schema_types.insert(schema_type).second) { + if (known_types.count(schema_type) > 0) { return absl_ports::AlreadyExistsError(absl_ports::StrCat( "Field 'schema_type' '", schema_type, "' is already defined")); } - unknown_schema_types.erase(schema_type); + known_types.insert(schema_type); + unknown_types.erase(schema_type); + for (const auto& property_config : type_config.properties()) { + if (property_config.data_type() == + PropertyConfigProto::DataType::DOCUMENT) { + // Need to know what schema_type these Document properties should be + // validated against + std::string_view property_schema_type(property_config.schema_type()); + if (property_schema_type == schema_type) { + return absl_ports::InvalidArgumentError( + absl_ports::StrCat("Infinite loop detected in type configs. '", + schema_type, "' references itself.")); + } + if (known_types.count(property_schema_type) == 0) { + unknown_types.insert(property_schema_type); + } + auto itr = child_to_direct_parent_map.find(property_schema_type); + if (itr == child_to_direct_parent_map.end()) { + child_to_direct_parent_map.insert( + {property_schema_type, std::unordered_set<std::string_view>()}); + itr = child_to_direct_parent_map.find(property_schema_type); + } + itr->second.insert(schema_type); + } + } + } + if (!unknown_types.empty()) { + return absl_ports::InvalidArgumentError(absl_ports::StrCat( + "Undefined 'schema_type's: ", absl_ports::StrJoin(unknown_types, ","))); + } + return ExpandTranstiveDependencies(child_to_direct_parent_map); +} + +libtextclassifier3::StatusOr<SchemaUtil::DependencyMap> SchemaUtil::Validate( + const SchemaProto& schema) { + // 1. Build the dependency map. This will detect any cycles, non-existent or + // duplicate types in the schema. + ICING_ASSIGN_OR_RETURN(SchemaUtil::DependencyMap dependency_map, + BuildTransitiveDependencyGraph(schema)); + + // Tracks PropertyConfigs within a SchemaTypeConfig that we've validated + // already. + std::unordered_set<std::string_view> known_property_names; + + // 2. Validate the properties of each type. + for (const auto& type_config : schema.types()) { + std::string_view schema_type(type_config.schema_type()); + ICING_RETURN_IF_ERROR(ValidateSchemaType(schema_type)); // We only care about properties being unique within one type_config known_property_names.clear(); @@ -164,56 +310,6 @@ libtextclassifier3::Status SchemaUtil::Validate(const SchemaProto& schema) { "data_types in schema property '", schema_type, ".", property_name, "'")); } - - if (property_schema_type == schema_type) { - // The schema refers to itself. This also causes a infinite loop. - // - // TODO(b/171996137): When clients can opt out of indexing document - // properties, then we don't need to do this if the document property - // isn't indexed. We only care about infinite loops while we're trying - // to assign section ids for indexing. - return absl_ports::InvalidArgumentError( - absl_ports::StrCat("Infinite loop detected in type configs. '", - schema_type, "' references itself.")); - } - - // Need to make sure we eventually see/validate this schema_type - if (known_schema_types.count(property_schema_type) == 0) { - unknown_schema_types.insert(property_schema_type); - } - - // Start tracking the parent schemas that references this nested schema - // for infinite loop detection. - // - // TODO(b/171996137): When clients can opt out of indexing document - // properties, then we don't need to do this if the document property - // isn't indexed. We only care about infinite loops while we're trying - // to assign section ids for indexing. - std::unordered_set<std::string_view> parent_schemas; - parent_schemas.insert(schema_type); - - for (const auto& parent : parent_schemas) { - // Check for any indirect parents - auto indirect_parents_iter = schema_references.find(parent); - if (indirect_parents_iter == schema_references.end()) { - continue; - } - - // Our "parent" schema has parents as well. They're our indirect - // parents now. - for (const std::string_view& indirect_parent : - indirect_parents_iter->second) { - if (indirect_parent == property_schema_type) { - // We're our own indirect parent! Infinite loop found. - return absl_ports::InvalidArgumentError(absl_ports::StrCat( - "Infinite loop detected in type configs. '", - property_schema_type, "' references itself.")); - } - parent_schemas.insert(indirect_parent); - } - } - - schema_references.insert({property_schema_type, parent_schemas}); } ICING_RETURN_IF_ERROR(ValidateCardinality(property_config.cardinality(), @@ -227,15 +323,7 @@ libtextclassifier3::Status SchemaUtil::Validate(const SchemaProto& schema) { } } - // A Document property claimed to be of a schema_type that we never - // saw/validated - if (!unknown_schema_types.empty()) { - return absl_ports::UnknownError( - absl_ports::StrCat("Undefined 'schema_type's: ", - absl_ports::StrJoin(unknown_schema_types, ","))); - } - - return libtextclassifier3::Status::OK; + return dependency_map; } libtextclassifier3::Status SchemaUtil::ValidateSchemaType( @@ -355,9 +443,9 @@ SchemaUtil::ParsedPropertyConfigs SchemaUtil::ParsePropertyConfigs( } const SchemaUtil::SchemaDelta SchemaUtil::ComputeCompatibilityDelta( - const SchemaProto& old_schema, const SchemaProto& new_schema) { + const SchemaProto& old_schema, const SchemaProto& new_schema, + const DependencyMap& new_schema_dependency_map) { SchemaDelta schema_delta; - schema_delta.index_incompatible = false; TypeConfigMap new_type_config_map; BuildTypeConfigMap(new_schema, &new_type_config_map); @@ -385,7 +473,29 @@ const SchemaUtil::SchemaDelta SchemaUtil::ComputeCompatibilityDelta( // be reindexed. int32_t old_required_properties = 0; int32_t old_indexed_properties = 0; + + // If there is a different number of properties, then there must have been a + // change. + bool has_property_changed = + old_type_config.properties_size() != + new_schema_type_and_config->second.properties_size(); + bool is_incompatible = false; + bool is_index_incompatible = false; for (const auto& old_property_config : old_type_config.properties()) { + if (old_property_config.cardinality() == + PropertyConfigProto::Cardinality::REQUIRED) { + ++old_required_properties; + } + + // A non-default term_match_type indicates that this property is meant to + // be indexed. + bool is_indexed_property = + old_property_config.string_indexing_config().term_match_type() != + TermMatchType::UNKNOWN; + if (is_indexed_property) { + ++old_indexed_properties; + } + auto new_property_name_and_config = new_parsed_property_configs.property_config_map.find( old_property_config.property_name()); @@ -397,39 +507,35 @@ const SchemaUtil::SchemaDelta SchemaUtil::ComputeCompatibilityDelta( "Previously defined property type '", old_type_config.schema_type(), ".", old_property_config.property_name(), "' was not defined in new schema"); - schema_delta.schema_types_incompatible.insert( - old_type_config.schema_type()); + is_incompatible = true; + is_index_incompatible |= is_indexed_property; continue; } const PropertyConfigProto* new_property_config = new_property_name_and_config->second; + if (!has_property_changed && + !ArePropertiesEqual(old_property_config, *new_property_config)) { + // Finally found a property that changed. + has_property_changed = true; + } if (!IsPropertyCompatible(old_property_config, *new_property_config)) { ICING_VLOG(1) << absl_ports::StrCat( "Property '", old_type_config.schema_type(), ".", old_property_config.property_name(), "' is incompatible."); - schema_delta.schema_types_incompatible.insert( - old_type_config.schema_type()); - } - - if (old_property_config.cardinality() == - PropertyConfigProto::Cardinality::REQUIRED) { - ++old_required_properties; - } - - // A non-default term_match_type indicates that this property is meant to - // be indexed. - if (old_property_config.string_indexing_config().term_match_type() != - TermMatchType::UNKNOWN) { - ++old_indexed_properties; + is_incompatible = true; } // Any change in the indexed property requires a reindexing if (!IsTermMatchTypeCompatible( old_property_config.string_indexing_config(), - new_property_config->string_indexing_config())) { - schema_delta.index_incompatible = true; + new_property_config->string_indexing_config()) || + old_property_config.document_indexing_config() + .index_nested_properties() != + new_property_config->document_indexing_config() + .index_nested_properties()) { + is_index_incompatible = true; } } @@ -444,8 +550,7 @@ const SchemaUtil::SchemaDelta SchemaUtil::ComputeCompatibilityDelta( "New schema '", old_type_config.schema_type(), "' has REQUIRED properties that are not " "present in the previously defined schema"); - schema_delta.schema_types_incompatible.insert( - old_type_config.schema_type()); + is_incompatible = true; } // If we've gained any new indexed properties, then the section ids may @@ -457,8 +562,59 @@ const SchemaUtil::SchemaDelta SchemaUtil::ComputeCompatibilityDelta( "Set of indexed properties in schema type '", old_type_config.schema_type(), "' has changed, required reindexing."); - schema_delta.index_incompatible = true; + is_index_incompatible = true; + } + + if (is_incompatible) { + // If this type is incompatible, then every type that depends on it might + // also be incompatible. Use the dependency map to mark those ones as + // incompatible too. + schema_delta.schema_types_incompatible.insert( + old_type_config.schema_type()); + auto parent_types_itr = + new_schema_dependency_map.find(old_type_config.schema_type()); + if (parent_types_itr != new_schema_dependency_map.end()) { + schema_delta.schema_types_incompatible.reserve( + schema_delta.schema_types_incompatible.size() + + parent_types_itr->second.size()); + schema_delta.schema_types_incompatible.insert( + parent_types_itr->second.begin(), parent_types_itr->second.end()); + } + } + + if (is_index_incompatible) { + // If this type is index incompatible, then every type that depends on it + // might also be index incompatible. Use the dependency map to mark those + // ones as index incompatible too. + schema_delta.schema_types_index_incompatible.insert( + old_type_config.schema_type()); + auto parent_types_itr = + new_schema_dependency_map.find(old_type_config.schema_type()); + if (parent_types_itr != new_schema_dependency_map.end()) { + schema_delta.schema_types_index_incompatible.reserve( + schema_delta.schema_types_index_incompatible.size() + + parent_types_itr->second.size()); + schema_delta.schema_types_index_incompatible.insert( + parent_types_itr->second.begin(), parent_types_itr->second.end()); + } } + + if (!is_incompatible && !is_index_incompatible && has_property_changed) { + schema_delta.schema_types_changed_fully_compatible.insert( + old_type_config.schema_type()); + } + + // Lastly, remove this type from the map. We know that this type can't + // come up in future iterations through the old schema types because the old + // type config has unique types. + new_type_config_map.erase(old_type_config.schema_type()); + } + + // Any types that are still present in the new_type_config_map are newly added + // types. + schema_delta.schema_types_new.reserve(new_type_config_map.size()); + for (auto& kvp : new_type_config_map) { + schema_delta.schema_types_new.insert(std::move(kvp.first)); } return schema_delta; diff --git a/icing/schema/schema-util.h b/icing/schema/schema-util.h index 7b989a8..fa80b15 100644 --- a/icing/schema/schema-util.h +++ b/icing/schema/schema-util.h @@ -22,6 +22,7 @@ #include <unordered_set> #include "icing/text_classifier/lib3/utils/base/status.h" +#include "icing/text_classifier/lib3/utils/base/statusor.h" #include "icing/proto/schema.pb.h" namespace icing { @@ -32,13 +33,14 @@ class SchemaUtil { using TypeConfigMap = std::unordered_map<std::string, const SchemaTypeConfigProto>; - struct SchemaDelta { - // Whether an indexing config has changed, requiring the index to be - // regenerated. We don't list out all the types that make the index - // incompatible because our index isn't optimized for that. It's much easier - // to reset the entire index and reindex every document. - bool index_incompatible = false; + // Maps from a child type to the parent types that depend on it. + // Ex. type A has a single property of type B + // The dependency map will be { { "B", { "A" } } } + using DependencyMap = + std::unordered_map<std::string_view, + std::unordered_set<std::string_view>>; + struct SchemaDelta { // Which schema types were present in the old schema, but were deleted from // the new schema. std::unordered_set<std::string> schema_types_deleted; @@ -47,10 +49,28 @@ class SchemaUtil { // could invalidate existing Documents of that schema type. std::unordered_set<std::string> schema_types_incompatible; + // Schema types that were added in the new schema. Represented by the + // `schema_type` field in the SchemaTypeConfigProto. + std::unordered_set<std::string> schema_types_new; + + // Schema types that were changed in a way that was backwards compatible and + // didn't invalidate the index. Represented by the `schema_type` field in + // the SchemaTypeConfigProto. + std::unordered_set<std::string> schema_types_changed_fully_compatible; + + // Schema types that were changed in a way that was backwards compatible, + // but invalidated the index. Represented by the `schema_type` field in the + // SchemaTypeConfigProto. + std::unordered_set<std::string> schema_types_index_incompatible; + bool operator==(const SchemaDelta& other) const { - return index_incompatible == other.index_incompatible && - schema_types_deleted == other.schema_types_deleted && - schema_types_incompatible == other.schema_types_incompatible; + return schema_types_deleted == other.schema_types_deleted && + schema_types_incompatible == other.schema_types_incompatible && + schema_types_new == other.schema_types_new && + schema_types_changed_fully_compatible == + other.schema_types_changed_fully_compatible && + schema_types_index_incompatible == + other.schema_types_index_incompatible; } }; @@ -90,10 +110,12 @@ class SchemaUtil { // document properties can be opted out of indexing. // // Returns: + // On success, a dependency map from each child types to all parent types + // that depend on it directly or indirectly. // ALREADY_EXISTS for case 1 and 2 // INVALID_ARGUMENT for 3-13 - // OK otherwise - static libtextclassifier3::Status Validate(const SchemaProto& schema); + static libtextclassifier3::StatusOr<DependencyMap> Validate( + const SchemaProto& schema); // Creates a mapping of schema type -> schema type config proto. The // type_config_map is cleared, and then each schema-type_config_proto pair is @@ -142,7 +164,8 @@ class SchemaUtil { // // Returns a SchemaDelta that captures the aforementioned differences. static const SchemaDelta ComputeCompatibilityDelta( - const SchemaProto& old_schema, const SchemaProto& new_schema); + const SchemaProto& old_schema, const SchemaProto& new_schema, + const DependencyMap& new_schema_dependency_map); // Validates the 'property_name' field. // 1. Can't be an empty string diff --git a/icing/schema/schema-util_test.cc b/icing/schema/schema-util_test.cc index a2fc8d9..26ef4c7 100644 --- a/icing/schema/schema-util_test.cc +++ b/icing/schema/schema-util_test.cc @@ -17,6 +17,7 @@ #include <cstdint> #include <string> #include <string_view> +#include <unordered_set> #include "gmock/gmock.h" #include "gtest/gtest.h" @@ -34,6 +35,7 @@ using ::testing::HasSubstr; // Properties/fields in a schema type constexpr char kEmailType[] = "EmailMessage"; +constexpr char kMessageType[] = "Text"; constexpr char kPersonType[] = "Person"; constexpr PropertyConfigProto_DataType_Code TYPE_DOCUMENT = @@ -63,6 +65,367 @@ constexpr TermMatchType_Code MATCH_UNKNOWN = TermMatchType_Code_UNKNOWN; constexpr TermMatchType_Code MATCH_EXACT = TermMatchType_Code_EXACT_ONLY; constexpr TermMatchType_Code MATCH_PREFIX = TermMatchType_Code_PREFIX; +TEST(SchemaUtilTest, DependencyGraphAlphabeticalOrder) { + // Create a schema with the following dependencies: + // C + // / \ + // A - B E - F + // \ / + // D + SchemaTypeConfigProto type_a = + SchemaTypeConfigBuilder() + .SetType("A") + .AddProperty( + PropertyConfigBuilder() + .SetName("b") + .SetCardinality(CARDINALITY_OPTIONAL) + .SetDataTypeDocument("B", /*index_nested_properties=*/true)) + .Build(); + SchemaTypeConfigProto type_b = + SchemaTypeConfigBuilder() + .SetType("B") + .AddProperty( + PropertyConfigBuilder() + .SetName("c") + .SetCardinality(CARDINALITY_OPTIONAL) + .SetDataTypeDocument("C", /*index_nested_properties=*/true)) + .AddProperty( + PropertyConfigBuilder() + .SetName("d") + .SetCardinality(CARDINALITY_OPTIONAL) + .SetDataTypeDocument("D", /*index_nested_properties=*/true)) + .Build(); + SchemaTypeConfigProto type_c = + SchemaTypeConfigBuilder() + .SetType("C") + .AddProperty( + PropertyConfigBuilder() + .SetName("e") + .SetCardinality(CARDINALITY_OPTIONAL) + .SetDataTypeDocument("E", /*index_nested_properties=*/true)) + .Build(); + SchemaTypeConfigProto type_d = + SchemaTypeConfigBuilder() + .SetType("D") + .AddProperty( + PropertyConfigBuilder() + .SetName("e") + .SetCardinality(CARDINALITY_OPTIONAL) + .SetDataTypeDocument("E", /*index_nested_properties=*/true)) + .Build(); + SchemaTypeConfigProto type_e = + SchemaTypeConfigBuilder() + .SetType("E") + .AddProperty( + PropertyConfigBuilder() + .SetName("f") + .SetCardinality(CARDINALITY_OPTIONAL) + .SetDataTypeDocument("F", /*index_nested_properties=*/true)) + .Build(); + SchemaTypeConfigProto type_f = + SchemaTypeConfigBuilder() + .SetType("F") + .AddProperty(PropertyConfigBuilder() + .SetName("text") + .SetCardinality(CARDINALITY_OPTIONAL) + .SetDataTypeString(MATCH_EXACT, TOKENIZER_PLAIN)) + .Build(); + + // Provide these in alphabetical (also parent-child) order: A, B, C, D, E, F + SchemaProto schema = SchemaBuilder() + .AddType(type_a) + .AddType(type_b) + .AddType(type_c) + .AddType(type_d) + .AddType(type_e) + .AddType(type_f) + .Build(); + ICING_ASSERT_OK_AND_ASSIGN(SchemaUtil::DependencyMap d_map, + SchemaUtil::Validate(schema)); + EXPECT_THAT(d_map, testing::SizeIs(5)); + EXPECT_THAT(d_map["F"], + testing::UnorderedElementsAre("A", "B", "C", "D", "E")); + EXPECT_THAT(d_map["E"], testing::UnorderedElementsAre("A", "B", "C", "D")); + EXPECT_THAT(d_map["D"], testing::UnorderedElementsAre("A", "B")); + EXPECT_THAT(d_map["C"], testing::UnorderedElementsAre("A", "B")); + EXPECT_THAT(d_map["B"], testing::UnorderedElementsAre("A")); +} + +TEST(SchemaUtilTest, DependencyGraphReverseAlphabeticalOrder) { + // Create a schema with the following dependencies: + // C + // / \ + // A - B E - F + // \ / + // D + SchemaTypeConfigProto type_a = + SchemaTypeConfigBuilder() + .SetType("A") + .AddProperty( + PropertyConfigBuilder() + .SetName("b") + .SetCardinality(CARDINALITY_OPTIONAL) + .SetDataTypeDocument("B", /*index_nested_properties=*/true)) + .Build(); + SchemaTypeConfigProto type_b = + SchemaTypeConfigBuilder() + .SetType("B") + .AddProperty( + PropertyConfigBuilder() + .SetName("c") + .SetCardinality(CARDINALITY_OPTIONAL) + .SetDataTypeDocument("C", /*index_nested_properties=*/true)) + .AddProperty( + PropertyConfigBuilder() + .SetName("d") + .SetCardinality(CARDINALITY_OPTIONAL) + .SetDataTypeDocument("D", /*index_nested_properties=*/true)) + .Build(); + SchemaTypeConfigProto type_c = + SchemaTypeConfigBuilder() + .SetType("C") + .AddProperty( + PropertyConfigBuilder() + .SetName("e") + .SetCardinality(CARDINALITY_OPTIONAL) + .SetDataTypeDocument("E", /*index_nested_properties=*/true)) + .Build(); + SchemaTypeConfigProto type_d = + SchemaTypeConfigBuilder() + .SetType("D") + .AddProperty( + PropertyConfigBuilder() + .SetName("e") + .SetCardinality(CARDINALITY_OPTIONAL) + .SetDataTypeDocument("E", /*index_nested_properties=*/true)) + .Build(); + SchemaTypeConfigProto type_e = + SchemaTypeConfigBuilder() + .SetType("E") + .AddProperty( + PropertyConfigBuilder() + .SetName("f") + .SetCardinality(CARDINALITY_OPTIONAL) + .SetDataTypeDocument("F", /*index_nested_properties=*/true)) + .Build(); + SchemaTypeConfigProto type_f = + SchemaTypeConfigBuilder() + .SetType("F") + .AddProperty(PropertyConfigBuilder() + .SetName("text") + .SetCardinality(CARDINALITY_OPTIONAL) + .SetDataTypeString(MATCH_EXACT, TOKENIZER_PLAIN)) + .Build(); + + // Provide these in reverse alphabetical (also child-parent) order: + // F, E, D, C, B, A + SchemaProto schema = SchemaBuilder() + .AddType(type_f) + .AddType(type_e) + .AddType(type_d) + .AddType(type_c) + .AddType(type_b) + .AddType(type_a) + .Build(); + ICING_ASSERT_OK_AND_ASSIGN(SchemaUtil::DependencyMap d_map, + SchemaUtil::Validate(schema)); + EXPECT_THAT(d_map, testing::SizeIs(5)); + EXPECT_THAT(d_map["F"], + testing::UnorderedElementsAre("A", "B", "C", "D", "E")); + EXPECT_THAT(d_map["E"], testing::UnorderedElementsAre("A", "B", "C", "D")); + EXPECT_THAT(d_map["D"], testing::UnorderedElementsAre("A", "B")); + EXPECT_THAT(d_map["C"], testing::UnorderedElementsAre("A", "B")); + EXPECT_THAT(d_map["B"], testing::UnorderedElementsAre("A")); +} + +TEST(SchemaUtilTest, DependencyGraphMixedOrder) { + // Create a schema with the following dependencies: + // C + // / \ + // A - B E - F + // \ / + // D + SchemaTypeConfigProto type_a = + SchemaTypeConfigBuilder() + .SetType("A") + .AddProperty( + PropertyConfigBuilder() + .SetName("b") + .SetCardinality(CARDINALITY_OPTIONAL) + .SetDataTypeDocument("B", /*index_nested_properties=*/true)) + .Build(); + SchemaTypeConfigProto type_b = + SchemaTypeConfigBuilder() + .SetType("B") + .AddProperty( + PropertyConfigBuilder() + .SetName("c") + .SetCardinality(CARDINALITY_OPTIONAL) + .SetDataTypeDocument("C", /*index_nested_properties=*/true)) + .AddProperty( + PropertyConfigBuilder() + .SetName("d") + .SetCardinality(CARDINALITY_OPTIONAL) + .SetDataTypeDocument("D", /*index_nested_properties=*/true)) + .Build(); + SchemaTypeConfigProto type_c = + SchemaTypeConfigBuilder() + .SetType("C") + .AddProperty( + PropertyConfigBuilder() + .SetName("e") + .SetCardinality(CARDINALITY_OPTIONAL) + .SetDataTypeDocument("E", /*index_nested_properties=*/true)) + .Build(); + SchemaTypeConfigProto type_d = + SchemaTypeConfigBuilder() + .SetType("D") + .AddProperty( + PropertyConfigBuilder() + .SetName("e") + .SetCardinality(CARDINALITY_OPTIONAL) + .SetDataTypeDocument("E", /*index_nested_properties=*/true)) + .Build(); + SchemaTypeConfigProto type_e = + SchemaTypeConfigBuilder() + .SetType("E") + .AddProperty( + PropertyConfigBuilder() + .SetName("f") + .SetCardinality(CARDINALITY_OPTIONAL) + .SetDataTypeDocument("F", /*index_nested_properties=*/true)) + .Build(); + SchemaTypeConfigProto type_f = + SchemaTypeConfigBuilder() + .SetType("F") + .AddProperty(PropertyConfigBuilder() + .SetName("text") + .SetCardinality(CARDINALITY_OPTIONAL) + .SetDataTypeString(MATCH_EXACT, TOKENIZER_PLAIN)) + .Build(); + + // Provide these in a random order: C, E, F, A, B, D + SchemaProto schema = SchemaBuilder() + .AddType(type_c) + .AddType(type_e) + .AddType(type_f) + .AddType(type_a) + .AddType(type_b) + .AddType(type_d) + .Build(); + ICING_ASSERT_OK_AND_ASSIGN(SchemaUtil::DependencyMap d_map, + SchemaUtil::Validate(schema)); + EXPECT_THAT(d_map, testing::SizeIs(5)); + EXPECT_THAT(d_map["F"], + testing::UnorderedElementsAre("A", "B", "C", "D", "E")); + EXPECT_THAT(d_map["E"], testing::UnorderedElementsAre("A", "B", "C", "D")); + EXPECT_THAT(d_map["D"], testing::UnorderedElementsAre("A", "B")); + EXPECT_THAT(d_map["C"], testing::UnorderedElementsAre("A", "B")); + EXPECT_THAT(d_map["B"], testing::UnorderedElementsAre("A")); +} + +TEST(SchemaUtilTest, TopLevelCycle) { + // Create a schema with the following dependencies: + // A - B - B - B - B.... + SchemaTypeConfigProto type_a = + SchemaTypeConfigBuilder() + .SetType("A") + .AddProperty( + PropertyConfigBuilder() + .SetName("b") + .SetCardinality(CARDINALITY_OPTIONAL) + .SetDataTypeDocument("B", /*index_nested_properties=*/true)) + .Build(); + SchemaTypeConfigProto type_b = + SchemaTypeConfigBuilder() + .SetType("B") + .AddProperty( + PropertyConfigBuilder() + .SetName("b") + .SetCardinality(CARDINALITY_OPTIONAL) + .SetDataTypeDocument("B", /*index_nested_properties=*/true)) + .Build(); + + SchemaProto schema = SchemaBuilder().AddType(type_a).AddType(type_b).Build(); + EXPECT_THAT(SchemaUtil::Validate(schema), + StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT, + HasSubstr("Infinite loop"))); +} + +TEST(SchemaUtilTest, MultiLevelCycle) { + // Create a schema with the following dependencies: + // A - B - C - A - B - C - A ... + SchemaTypeConfigProto type_a = + SchemaTypeConfigBuilder() + .SetType("A") + .AddProperty( + PropertyConfigBuilder() + .SetName("b") + .SetCardinality(CARDINALITY_OPTIONAL) + .SetDataTypeDocument("B", /*index_nested_properties=*/true)) + .Build(); + SchemaTypeConfigProto type_b = + SchemaTypeConfigBuilder() + .SetType("B") + .AddProperty( + PropertyConfigBuilder() + .SetName("c") + .SetCardinality(CARDINALITY_OPTIONAL) + .SetDataTypeDocument("C", /*index_nested_properties=*/true)) + .Build(); + SchemaTypeConfigProto type_c = + SchemaTypeConfigBuilder() + .SetType("C") + .AddProperty( + PropertyConfigBuilder() + .SetName("a") + .SetCardinality(CARDINALITY_OPTIONAL) + .SetDataTypeDocument("A", /*index_nested_properties=*/true)) + .Build(); + + SchemaProto schema = + SchemaBuilder().AddType(type_a).AddType(type_b).AddType(type_c).Build(); + EXPECT_THAT(SchemaUtil::Validate(schema), + StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT)); +} + +TEST(SchemaUtilTest, NonExistentType) { + // Create a schema with the following dependencies: + // A - B - C - X (does not exist) + SchemaTypeConfigProto type_a = + SchemaTypeConfigBuilder() + .SetType("A") + .AddProperty( + PropertyConfigBuilder() + .SetName("b") + .SetCardinality(CARDINALITY_OPTIONAL) + .SetDataTypeDocument("B", /*index_nested_properties=*/true)) + .Build(); + SchemaTypeConfigProto type_b = + SchemaTypeConfigBuilder() + .SetType("B") + .AddProperty( + PropertyConfigBuilder() + .SetName("c") + .SetCardinality(CARDINALITY_OPTIONAL) + .SetDataTypeDocument("C", /*index_nested_properties=*/true)) + .Build(); + SchemaTypeConfigProto type_c = + SchemaTypeConfigBuilder() + .SetType("C") + .AddProperty( + PropertyConfigBuilder() + .SetName("x") + .SetCardinality(CARDINALITY_OPTIONAL) + .SetDataTypeDocument("X", /*index_nested_properties=*/true)) + .Build(); + + SchemaProto schema = + SchemaBuilder().AddType(type_a).AddType(type_b).AddType(type_c).Build(); + EXPECT_THAT(SchemaUtil::Validate(schema), + StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT)); +} + TEST(SchemaUtilTest, EmptySchemaProtoIsValid) { SchemaProto schema; ICING_ASSERT_OK(SchemaUtil::Validate(schema)); @@ -309,7 +672,7 @@ TEST(SchemaUtilTest, NoMatchingSchemaTypeIsInvalid) { .Build(); ASSERT_THAT(SchemaUtil::Validate(schema), - StatusIs(libtextclassifier3::StatusCode::UNKNOWN, + StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT, HasSubstr("Undefined 'schema_type'"))); } @@ -342,8 +705,10 @@ TEST(SchemaUtilTest, NewOptionalPropertyIsCompatible) { .Build(); SchemaUtil::SchemaDelta schema_delta; - EXPECT_THAT(SchemaUtil::ComputeCompatibilityDelta(old_schema, - new_schema_with_optional), + schema_delta.schema_types_changed_fully_compatible.insert(kEmailType); + SchemaUtil::DependencyMap no_dependencies_map; + EXPECT_THAT(SchemaUtil::ComputeCompatibilityDelta( + old_schema, new_schema_with_optional, no_dependencies_map), Eq(schema_delta)); } @@ -377,8 +742,9 @@ TEST(SchemaUtilTest, NewRequiredPropertyIsIncompatible) { SchemaUtil::SchemaDelta schema_delta; schema_delta.schema_types_incompatible.emplace(kEmailType); - EXPECT_THAT(SchemaUtil::ComputeCompatibilityDelta(old_schema, - new_schema_with_required), + SchemaUtil::DependencyMap no_dependencies_map; + EXPECT_THAT(SchemaUtil::ComputeCompatibilityDelta( + old_schema, new_schema_with_required, no_dependencies_map), Eq(schema_delta)); } @@ -412,7 +778,9 @@ TEST(SchemaUtilTest, NewSchemaMissingPropertyIsIncompatible) { SchemaUtil::SchemaDelta schema_delta; schema_delta.schema_types_incompatible.emplace(kEmailType); - EXPECT_THAT(SchemaUtil::ComputeCompatibilityDelta(old_schema, new_schema), + SchemaUtil::DependencyMap no_dependencies_map; + EXPECT_THAT(SchemaUtil::ComputeCompatibilityDelta(old_schema, new_schema, + no_dependencies_map), Eq(schema_delta)); } @@ -442,16 +810,19 @@ TEST(SchemaUtilTest, CompatibilityOfDifferentCardinalityOk) { // We can't have a new schema be more restrictive, REPEATED->OPTIONAL SchemaUtil::SchemaDelta incompatible_schema_delta; incompatible_schema_delta.schema_types_incompatible.emplace(kEmailType); + SchemaUtil::DependencyMap no_dependencies_map; EXPECT_THAT(SchemaUtil::ComputeCompatibilityDelta( /*old_schema=*/less_restrictive_schema, - /*new_schema=*/more_restrictive_schema), + /*new_schema=*/more_restrictive_schema, no_dependencies_map), Eq(incompatible_schema_delta)); // We can have the new schema be less restrictive, OPTIONAL->REPEATED; SchemaUtil::SchemaDelta compatible_schema_delta; + compatible_schema_delta.schema_types_changed_fully_compatible.insert( + kEmailType); EXPECT_THAT(SchemaUtil::ComputeCompatibilityDelta( /*old_schema=*/more_restrictive_schema, - /*new_schema=*/less_restrictive_schema), + /*new_schema=*/less_restrictive_schema, no_dependencies_map), Eq(compatible_schema_delta)); } @@ -480,7 +851,9 @@ TEST(SchemaUtilTest, DifferentDataTypeIsIncompatible) { SchemaUtil::SchemaDelta schema_delta; schema_delta.schema_types_incompatible.emplace(kEmailType); - EXPECT_THAT(SchemaUtil::ComputeCompatibilityDelta(old_schema, new_schema), + SchemaUtil::DependencyMap no_dependencies_map; + EXPECT_THAT(SchemaUtil::ComputeCompatibilityDelta(old_schema, new_schema, + no_dependencies_map), Eq(schema_delta)); } @@ -495,6 +868,12 @@ TEST(SchemaUtilTest, DifferentSchemaTypeIsIncompatible) { .SetDataType(TYPE_INT) .SetCardinality(CARDINALITY_REPEATED))) .AddType(SchemaTypeConfigBuilder() + .SetType(kMessageType) + .AddProperty(PropertyConfigBuilder() + .SetName("prop") + .SetDataType(TYPE_INT) + .SetCardinality(CARDINALITY_REPEATED))) + .AddType(SchemaTypeConfigBuilder() .SetType(kEmailType) .AddProperty(PropertyConfigBuilder() .SetName("Property") @@ -514,19 +893,31 @@ TEST(SchemaUtilTest, DifferentSchemaTypeIsIncompatible) { .SetDataType(TYPE_INT) .SetCardinality(CARDINALITY_REPEATED))) .AddType(SchemaTypeConfigBuilder() + .SetType(kMessageType) + .AddProperty(PropertyConfigBuilder() + .SetName("prop") + .SetDataType(TYPE_INT) + .SetCardinality(CARDINALITY_REPEATED))) + .AddType(SchemaTypeConfigBuilder() .SetType(kEmailType) - .AddProperty( - PropertyConfigBuilder() - .SetName("Property") - .SetDataTypeDocument( - kEmailType, /*index_nested_properties=*/true) - .SetCardinality(CARDINALITY_REPEATED))) + .AddProperty(PropertyConfigBuilder() + .SetName("Property") + .SetDataTypeDocument( + kMessageType, + /*index_nested_properties=*/true) + .SetCardinality(CARDINALITY_REPEATED))) .Build(); SchemaUtil::SchemaDelta schema_delta; schema_delta.schema_types_incompatible.emplace(kEmailType); - EXPECT_THAT(SchemaUtil::ComputeCompatibilityDelta(old_schema, new_schema), - Eq(schema_delta)); + // kEmailType depends on kMessageType + SchemaUtil::DependencyMap dependencies_map = {{kMessageType, {kEmailType}}}; + SchemaUtil::SchemaDelta actual = SchemaUtil::ComputeCompatibilityDelta( + old_schema, new_schema, dependencies_map); + EXPECT_THAT(actual, Eq(schema_delta)); + EXPECT_THAT(actual.schema_types_incompatible, + testing::ElementsAre(kEmailType)); + EXPECT_THAT(actual.schema_types_deleted, testing::IsEmpty()); } TEST(SchemaUtilTest, ChangingIndexedPropertiesMakesIndexIncompatible) { @@ -555,16 +946,19 @@ TEST(SchemaUtilTest, ChangingIndexedPropertiesMakesIndexIncompatible) { .Build(); SchemaUtil::SchemaDelta schema_delta; - schema_delta.index_incompatible = true; + schema_delta.schema_types_index_incompatible.insert(kPersonType); // New schema gained a new indexed property. + SchemaUtil::DependencyMap no_dependencies_map; EXPECT_THAT(SchemaUtil::ComputeCompatibilityDelta( - schema_with_indexed_property, schema_with_unindexed_property), + schema_with_indexed_property, schema_with_unindexed_property, + no_dependencies_map), Eq(schema_delta)); // New schema lost an indexed property. EXPECT_THAT(SchemaUtil::ComputeCompatibilityDelta( - schema_with_indexed_property, schema_with_unindexed_property), + schema_with_indexed_property, schema_with_unindexed_property, + no_dependencies_map), Eq(schema_delta)); } @@ -599,8 +993,10 @@ TEST(SchemaUtilTest, AddingNewIndexedPropertyMakesIndexIncompatible) { .Build(); SchemaUtil::SchemaDelta schema_delta; - schema_delta.index_incompatible = true; - EXPECT_THAT(SchemaUtil::ComputeCompatibilityDelta(old_schema, new_schema), + schema_delta.schema_types_index_incompatible.insert(kPersonType); + SchemaUtil::DependencyMap no_dependencies_map; + EXPECT_THAT(SchemaUtil::ComputeCompatibilityDelta(old_schema, new_schema, + no_dependencies_map), Eq(schema_delta)); } @@ -637,7 +1033,10 @@ TEST(SchemaUtilTest, AddingTypeIsCompatible) { .Build(); SchemaUtil::SchemaDelta schema_delta; - EXPECT_THAT(SchemaUtil::ComputeCompatibilityDelta(old_schema, new_schema), + schema_delta.schema_types_new.insert(kEmailType); + SchemaUtil::DependencyMap no_dependencies_map; + EXPECT_THAT(SchemaUtil::ComputeCompatibilityDelta(old_schema, new_schema, + no_dependencies_map), Eq(schema_delta)); } @@ -676,10 +1075,105 @@ TEST(SchemaUtilTest, DeletingTypeIsNoted) { SchemaUtil::SchemaDelta schema_delta; schema_delta.schema_types_deleted.emplace(kPersonType); - EXPECT_THAT(SchemaUtil::ComputeCompatibilityDelta(old_schema, new_schema), + SchemaUtil::DependencyMap no_dependencies_map; + EXPECT_THAT(SchemaUtil::ComputeCompatibilityDelta(old_schema, new_schema, + no_dependencies_map), Eq(schema_delta)); } +TEST(SchemaUtilTest, DeletingPropertyAndChangingProperty) { + SchemaProto old_schema = + SchemaBuilder() + .AddType(SchemaTypeConfigBuilder() + .SetType(kEmailType) + .AddProperty(PropertyConfigBuilder() + .SetName("Property1") + .SetDataType(TYPE_STRING) + .SetCardinality(CARDINALITY_OPTIONAL)) + .AddProperty( + PropertyConfigBuilder() + .SetName("Property2") + .SetDataTypeString(MATCH_EXACT, TOKENIZER_PLAIN) + .SetCardinality(CARDINALITY_REQUIRED))) + .Build(); + + // Remove Property2 and make Property1 indexed now. Removing Property2 should + // be incompatible. + SchemaProto new_schema = + SchemaBuilder() + .AddType(SchemaTypeConfigBuilder() + .SetType(kEmailType) + .AddProperty( + PropertyConfigBuilder() + .SetName("Property1") + .SetDataTypeString(MATCH_EXACT, TOKENIZER_PLAIN) + .SetCardinality(CARDINALITY_OPTIONAL))) + .Build(); + + SchemaUtil::SchemaDelta schema_delta; + schema_delta.schema_types_incompatible.emplace(kEmailType); + schema_delta.schema_types_index_incompatible.emplace(kEmailType); + SchemaUtil::DependencyMap no_dependencies_map; + SchemaUtil::SchemaDelta actual = SchemaUtil::ComputeCompatibilityDelta( + old_schema, new_schema, no_dependencies_map); + EXPECT_THAT(actual, Eq(schema_delta)); +} + +TEST(SchemaUtilTest, IndexNestedDocumentsIndexIncompatible) { + // Make two schemas. One that sets index_nested_properties to false and one + // that sets it to true. + SchemaTypeConfigProto email_type_config = + SchemaTypeConfigBuilder() + .SetType(kEmailType) + .AddProperty(PropertyConfigBuilder() + .SetName("subject") + .SetDataTypeString(MATCH_EXACT, TOKENIZER_PLAIN) + .SetCardinality(CARDINALITY_OPTIONAL)) + .Build(); + SchemaProto no_nested_index_schema = + SchemaBuilder() + .AddType(email_type_config) + .AddType(SchemaTypeConfigBuilder() + .SetType(kPersonType) + .AddProperty(PropertyConfigBuilder() + .SetName("emails") + .SetDataTypeDocument( + kEmailType, + /*index_nested_properties=*/false) + .SetCardinality(CARDINALITY_REPEATED))) + .Build(); + + SchemaProto nested_index_schema = + SchemaBuilder() + .AddType(email_type_config) + .AddType(SchemaTypeConfigBuilder() + .SetType(kPersonType) + .AddProperty( + PropertyConfigBuilder() + .SetName("emails") + .SetDataTypeDocument( + kEmailType, /*index_nested_properties=*/true) + .SetCardinality(CARDINALITY_REPEATED))) + .Build(); + + // Going from index_nested_properties=false to index_nested_properties=true + // should make kPersonType index_incompatible. kEmailType should be + // unaffected. + SchemaUtil::SchemaDelta schema_delta; + schema_delta.schema_types_index_incompatible.emplace(kPersonType); + SchemaUtil::DependencyMap dependencies_map = {{kEmailType, {kPersonType}}}; + SchemaUtil::SchemaDelta actual = SchemaUtil::ComputeCompatibilityDelta( + no_nested_index_schema, nested_index_schema, dependencies_map); + EXPECT_THAT(actual, Eq(schema_delta)); + + // Going from index_nested_properties=true to index_nested_properties=false + // should also make kPersonType index_incompatible. kEmailType should be + // unaffected. + actual = SchemaUtil::ComputeCompatibilityDelta( + nested_index_schema, no_nested_index_schema, dependencies_map); + EXPECT_THAT(actual, Eq(schema_delta)); +} + TEST(SchemaUtilTest, ValidateStringIndexingConfigShouldHaveTermMatchType) { SchemaProto schema = SchemaBuilder() diff --git a/icing/store/document-log-creator.cc b/icing/store/document-log-creator.cc new file mode 100644 index 0000000..5e0426e --- /dev/null +++ b/icing/store/document-log-creator.cc @@ -0,0 +1,196 @@ +// Copyright (C) 2021 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "icing/store/document-log-creator.h" + +#include <memory> +#include <string> +#include <utility> + +#include "icing/text_classifier/lib3/utils/base/logging.h" +#include "icing/text_classifier/lib3/utils/base/status.h" +#include "icing/text_classifier/lib3/utils/base/statusor.h" +#include "icing/absl_ports/annotate.h" +#include "icing/absl_ports/canonical_errors.h" +#include "icing/absl_ports/str_cat.h" +#include "icing/file/file-backed-proto-log.h" +#include "icing/file/filesystem.h" +#include "icing/file/portable-file-backed-proto-log.h" +#include "icing/proto/document_wrapper.pb.h" +#include "icing/util/logging.h" +#include "icing/util/status-macros.h" + +namespace icing { +namespace lib { + +namespace { + +// Used in DocumentId mapper to mark a document as deleted +constexpr char kDocumentLogFilename[] = "document_log"; + +std::string DocumentLogFilenameV0() { + // Originally only had this one version, no suffix. + return kDocumentLogFilename; +} + +std::string DocumentLogFilenameV1() { + return absl_ports::StrCat(kDocumentLogFilename, "_v1"); +} + +std::string MakeDocumentLogFilenameV0(const std::string& base_dir) { + return absl_ports::StrCat(base_dir, "/", DocumentLogFilenameV0()); +} + +std::string MakeDocumentLogFilenameV1(const std::string& base_dir) { + return absl_ports::StrCat(base_dir, "/", DocumentLogFilenameV1()); +} + +} // namespace + +std::string DocumentLogCreator::GetDocumentLogFilename() { + // This should always return the latest version of the document log in use. + // The current latest version is V1. + return DocumentLogFilenameV1(); +} + +libtextclassifier3::StatusOr<DocumentLogCreator::CreateResult> +DocumentLogCreator::Create(const Filesystem* filesystem, + const std::string& base_dir) { + bool v0_exists = + filesystem->FileExists(MakeDocumentLogFilenameV0(base_dir).c_str()); + bool v1_exists = + filesystem->FileExists(MakeDocumentLogFilenameV1(base_dir).c_str()); + + bool regen_derived_files = false; + if (v0_exists && !v1_exists) { + ICING_RETURN_IF_ERROR(MigrateFromV0ToV1(filesystem, base_dir)); + + // Need to regenerate derived files since documents may be written to a + // different file offset in the log. + regen_derived_files = true; + } else if (!v1_exists) { + // First time initializing a v1 log. There are no existing derived files at + // this point, so we should generate some. "regenerate" here also means + // "generate for the first time", i.e. we shouldn't expect there to be any + // existing derived files. + regen_derived_files = true; + } + + ICING_ASSIGN_OR_RETURN( + PortableFileBackedProtoLog<DocumentWrapper>::CreateResult + log_create_result, + PortableFileBackedProtoLog<DocumentWrapper>::Create( + filesystem, MakeDocumentLogFilenameV1(base_dir), + PortableFileBackedProtoLog<DocumentWrapper>::Options( + /*compress_in=*/true))); + + CreateResult create_result = {std::move(log_create_result), + regen_derived_files}; + return create_result; +} + +libtextclassifier3::Status DocumentLogCreator::MigrateFromV0ToV1( + const Filesystem* filesystem, const std::string& base_dir) { + ICING_VLOG(1) << "Migrating from v0 to v1 document log."; + + // Our v0 proto log was non-portable, create it so we can read protos out from + // it. + auto v0_create_result_or = FileBackedProtoLog<DocumentWrapper>::Create( + filesystem, MakeDocumentLogFilenameV0(base_dir), + FileBackedProtoLog<DocumentWrapper>::Options( + /*compress_in=*/true)); + if (!v0_create_result_or.ok()) { + return absl_ports::Annotate( + v0_create_result_or.status(), + "Failed to initialize v0 document log while migrating."); + return v0_create_result_or.status(); + } + FileBackedProtoLog<DocumentWrapper>::CreateResult v0_create_result = + std::move(v0_create_result_or).ValueOrDie(); + std::unique_ptr<FileBackedProtoLog<DocumentWrapper>> v0_proto_log = + std::move(v0_create_result.proto_log); + + // Create a v1 portable proto log that we will write our protos to. + auto v1_create_result_or = + PortableFileBackedProtoLog<DocumentWrapper>::Create( + filesystem, MakeDocumentLogFilenameV1(base_dir), + PortableFileBackedProtoLog<DocumentWrapper>::Options( + /*compress_in=*/true)); + if (!v1_create_result_or.ok()) { + return absl_ports::Annotate( + v1_create_result_or.status(), + "Failed to initialize v1 document log while migrating."); + } + PortableFileBackedProtoLog<DocumentWrapper>::CreateResult v1_create_result = + std::move(v1_create_result_or).ValueOrDie(); + std::unique_ptr<PortableFileBackedProtoLog<DocumentWrapper>> v1_proto_log = + std::move(v1_create_result.proto_log); + + // Dummy empty document to be used when copying over deleted documents. + DocumentProto empty_document; + + // Start reading out from the old log and putting them in the new log. + auto iterator = v0_proto_log->GetIterator(); + auto iterator_status = iterator.Advance(); + while (iterator_status.ok()) { + libtextclassifier3::StatusOr<DocumentWrapper> document_wrapper_or = + v0_proto_log->ReadProto(iterator.GetOffset()); + + bool deleted_document = false; + DocumentWrapper document_wrapper; + if (absl_ports::IsNotFound(document_wrapper_or.status())) { + // Proto was erased, we can skip copying this into our new log. + *document_wrapper.mutable_document() = empty_document; + deleted_document = true; + } else if (!document_wrapper_or.ok()) { + // Some real error, pass up + return document_wrapper_or.status(); + } else { + document_wrapper = std::move(document_wrapper_or).ValueOrDie(); + } + + auto offset_or = v1_proto_log->WriteProto(document_wrapper); + if (!offset_or.ok()) { + return absl_ports::Annotate( + offset_or.status(), + "Failed to write proto to v1 document log while migrating."); + } + + // If the original document was deleted, erase the proto we just wrote. + // We do this to maintain the document_ids, i.e. we still want document_id 2 + // to point to a deleted document even though we may not have the document + // contents anymore. DocumentStore guarantees that the document_ids don't + // change unless an Optimize is triggered. + if (deleted_document) { + int64_t offset = offset_or.ValueOrDie(); + auto erased_status = v1_proto_log->EraseProto(offset); + if (!erased_status.ok()) { + return absl_ports::Annotate( + erased_status, + "Failed to erase proto in v1 document log while migrating."); + } + } + + iterator_status = iterator.Advance(); + } + + // Close out our file log pointers. + v0_proto_log.reset(); + v1_proto_log.reset(); + + return libtextclassifier3::Status::OK; +} + +} // namespace lib +} // namespace icing diff --git a/icing/store/document-log-creator.h b/icing/store/document-log-creator.h new file mode 100644 index 0000000..51cf497 --- /dev/null +++ b/icing/store/document-log-creator.h @@ -0,0 +1,77 @@ +// Copyright (C) 2021 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef ICING_STORE_DOCUMENT_LOG_CREATOR_H_ +#define ICING_STORE_DOCUMENT_LOG_CREATOR_H_ + +#include <string> + +#include "icing/text_classifier/lib3/utils/base/status.h" +#include "icing/text_classifier/lib3/utils/base/statusor.h" +#include "icing/file/filesystem.h" +#include "icing/file/portable-file-backed-proto-log.h" +#include "icing/proto/document_wrapper.pb.h" + +namespace icing { +namespace lib { + +// Handles creation of the document log and any underlying migrations that may +// be necessary. +class DocumentLogCreator { + public: + struct CreateResult { + // The create result passed up from the PortableFileBackedProtoLog::Create. + // Contains the document log. + PortableFileBackedProtoLog<DocumentWrapper>::CreateResult log_create_result; + + // Whether the caller needs to also regenerate/generate any derived files + // based off of the initialized document log. + bool regen_derived_files; + }; + + // Creates the document log in the base_dir. Will create one if it doesn't + // already exist. + // + // This also handles any potential migrations from old document log versions. + // At the end of this call, the most up-to-date log will be returned and will + // be usable. + // + // Returns: + // CreateResult on success. + // INTERNAL on any I/O error. + static libtextclassifier3::StatusOr<DocumentLogCreator::CreateResult> Create( + const Filesystem* filesystem, const std::string& base_dir); + + // Returns the filename of the document log, without any directory prefixes. + // Used mainly for testing purposes. + static std::string GetDocumentLogFilename(); + + private: + // Handles migrating a v0 document log (not portable) to a v1 document log + // (portable). This will initialize the log in the beginning, and close it + // when migration is done. Callers will need to reinitialize the log on their + // own. + // + // Returns: + // OK on success. + // INVALID_ARGUMENT if some invalid option was passed to the document log. + // INTERNAL on I/O error. + static libtextclassifier3::Status MigrateFromV0ToV1( + const Filesystem* filesystem, const std::string& base_dir); +}; + +} // namespace lib +} // namespace icing + +#endif // ICING_STORE_DOCUMENT_LOG_CREATOR_H_ diff --git a/icing/store/document-store.cc b/icing/store/document-store.cc index 4e63b90..fc797b4 100644 --- a/icing/store/document-store.cc +++ b/icing/store/document-store.cc @@ -33,6 +33,7 @@ #include "icing/file/file-backed-vector.h" #include "icing/file/filesystem.h" #include "icing/file/memory-mapped-file.h" +#include "icing/file/portable-file-backed-proto-log.h" #include "icing/legacy/core/icing-string-util.h" #include "icing/proto/document.pb.h" #include "icing/proto/document_wrapper.pb.h" @@ -44,6 +45,7 @@ #include "icing/store/document-associated-score-data.h" #include "icing/store/document-filter-data.h" #include "icing/store/document-id.h" +#include "icing/store/document-log-creator.h" #include "icing/store/key-mapper.h" #include "icing/store/namespace-id.h" #include "icing/store/usage-store.h" @@ -93,10 +95,6 @@ std::string MakeDocumentIdMapperFilename(const std::string& base_dir) { return absl_ports::StrCat(base_dir, "/", kDocumentIdMapperFilename); } -std::string MakeDocumentLogFilename(const std::string& base_dir) { - return absl_ports::StrCat(base_dir, "/", kDocumentLogFilename); -} - std::string MakeScoreCacheFilename(const std::string& base_dir) { return absl_ports::StrCat(base_dir, "/", kScoreCacheFilename); } @@ -224,30 +222,36 @@ libtextclassifier3::StatusOr<DocumentStore::CreateResult> DocumentStore::Create( libtextclassifier3::StatusOr<DataLoss> DocumentStore::Initialize( bool force_recovery_and_revalidate_documents, InitializeStatsProto* initialize_stats) { - auto create_result_or = FileBackedProtoLog<DocumentWrapper>::Create( - filesystem_, MakeDocumentLogFilename(base_dir_), - FileBackedProtoLog<DocumentWrapper>::Options( - /*compress_in=*/true)); + auto create_result_or = DocumentLogCreator::Create(filesystem_, base_dir_); + // TODO(b/144458732): Implement a more robust version of TC_ASSIGN_OR_RETURN // that can support error logging. if (!create_result_or.ok()) { ICING_LOG(ERROR) << create_result_or.status().error_message() - << "\nFailed to initialize DocumentLog"; + << "\nFailed to initialize DocumentLog."; return create_result_or.status(); } - FileBackedProtoLog<DocumentWrapper>::CreateResult create_result = + DocumentLogCreator::CreateResult create_result = std::move(create_result_or).ValueOrDie(); - document_log_ = std::move(create_result.proto_log); - if (force_recovery_and_revalidate_documents || - create_result.has_data_loss()) { - if (create_result.has_data_loss() && initialize_stats != nullptr) { + document_log_ = std::move(create_result.log_create_result.proto_log); + + if (create_result.regen_derived_files || + force_recovery_and_revalidate_documents || + create_result.log_create_result.has_data_loss()) { + // We can't rely on any existing derived files. Recreate them from scratch. + // Currently happens if: + // 1) This is a new log and we don't have derived files yet + // 2) Client wanted us to force a regeneration. + // 3) Log has some data loss, can't rely on existing derived data. + if (create_result.log_create_result.has_data_loss() && + initialize_stats != nullptr) { ICING_LOG(WARNING) << "Data loss in document log, regenerating derived files."; initialize_stats->set_document_store_recovery_cause( InitializeStatsProto::DATA_LOSS); - if (create_result.data_loss == DataLoss::PARTIAL) { + if (create_result.log_create_result.data_loss == DataLoss::PARTIAL) { // Ground truth is partially lost. initialize_stats->set_document_store_data_status( InitializeStatsProto::PARTIAL_LOSS); @@ -257,10 +261,16 @@ libtextclassifier3::StatusOr<DataLoss> DocumentStore::Initialize( InitializeStatsProto::COMPLETE_LOSS); } } + std::unique_ptr<Timer> document_recovery_timer = clock_.GetNewTimer(); libtextclassifier3::Status status = RegenerateDerivedFiles(force_recovery_and_revalidate_documents); - if (initialize_stats != nullptr) { + if (initialize_stats != nullptr && + (force_recovery_and_revalidate_documents || + create_result.log_create_result.has_data_loss())) { + // Only consider it a recovery if the client forced a recovery or there + // was data loss. Otherwise, this could just be the first time we're + // initializing and generating derived files. initialize_stats->set_document_store_recovery_latency_ms( document_recovery_timer->GetElapsedMilliseconds()); } @@ -270,7 +280,7 @@ libtextclassifier3::StatusOr<DataLoss> DocumentStore::Initialize( return status; } } else { - if (!InitializeDerivedFiles().ok()) { + if (!InitializeExistingDerivedFiles().ok()) { ICING_VLOG(1) << "Couldn't find derived files or failed to initialize them, " "regenerating derived files for DocumentStore."; @@ -296,10 +306,10 @@ libtextclassifier3::StatusOr<DataLoss> DocumentStore::Initialize( initialize_stats->set_num_documents(document_id_mapper_->num_elements()); } - return create_result.data_loss; + return create_result.log_create_result.data_loss; } -libtextclassifier3::Status DocumentStore::InitializeDerivedFiles() { +libtextclassifier3::Status DocumentStore::InitializeExistingDerivedFiles() { if (!HeaderExists()) { // Without a header, we don't know if things are consistent between each // other so the caller should just regenerate everything from ground diff --git a/icing/store/document-store.h b/icing/store/document-store.h index b0cd1ce..79d99d4 100644 --- a/icing/store/document-store.h +++ b/icing/store/document-store.h @@ -26,6 +26,7 @@ #include "icing/file/file-backed-proto-log.h" #include "icing/file/file-backed-vector.h" #include "icing/file/filesystem.h" +#include "icing/file/portable-file-backed-proto-log.h" #include "icing/proto/document.pb.h" #include "icing/proto/document_wrapper.pb.h" #include "icing/proto/logging.pb.h" @@ -438,7 +439,7 @@ class DocumentStore { // A log used to store all documents, it serves as a ground truth of doc // store. key_mapper_ and document_id_mapper_ can be regenerated from it. - std::unique_ptr<FileBackedProtoLog<DocumentWrapper>> document_log_; + std::unique_ptr<PortableFileBackedProtoLog<DocumentWrapper>> document_log_; // Key (namespace + uri) to DocumentId mapping std::unique_ptr<KeyMapper<DocumentId>> document_key_mapper_; @@ -495,11 +496,35 @@ class DocumentStore { bool force_recovery_and_revalidate_documents, InitializeStatsProto* initialize_stats); + // Initializes a new DocumentStore and sets up any underlying files. + // + // Returns: + // Data loss status on success, effectively always DataLoss::NONE + // INTERNAL on I/O error + libtextclassifier3::StatusOr<DataLoss> InitializeNewStore( + InitializeStatsProto* initialize_stats); + + // Initializes a DocumentStore over an existing directory of files. + // + // stats will be set if non-null + // + // Returns: + // Data loss status on success + // INTERNAL on I/O error + libtextclassifier3::StatusOr<DataLoss> InitializeExistingStore( + bool force_recovery_and_revalidate_documents, + InitializeStatsProto* initialize_stats); + + libtextclassifier3::StatusOr<DataLoss> MigrateFromV0ToV1( + InitializeStatsProto* initialize_stats); + // Creates sub-components and verifies the integrity of each sub-component. + // This assumes that the the underlying files already exist, and will return + // an error if it doesn't find what it's expecting. // // Returns an error if subcomponents failed to initialize successfully. // INTERNAL_ERROR on IO error - libtextclassifier3::Status InitializeDerivedFiles(); + libtextclassifier3::Status InitializeExistingDerivedFiles(); // Re-generates all files derived from the ground truth: the document log. // diff --git a/icing/store/document-store_benchmark.cc b/icing/store/document-store_benchmark.cc index f68e115..ce608fc 100644 --- a/icing/store/document-store_benchmark.cc +++ b/icing/store/document-store_benchmark.cc @@ -168,6 +168,93 @@ void BM_DoesDocumentExistBenchmark(benchmark::State& state) { } BENCHMARK(BM_DoesDocumentExistBenchmark); +void BM_Put(benchmark::State& state) { + Filesystem filesystem; + Clock clock; + + std::string directory = GetTestTempDir() + "/icing"; + DestructibleDirectory ddir(filesystem, directory); + + std::string document_store_dir = directory + "/store"; + std::unique_ptr<SchemaStore> schema_store = + CreateSchemaStore(filesystem, directory, &clock); + + filesystem.CreateDirectoryRecursively(document_store_dir.data()); + ICING_ASSERT_OK_AND_ASSIGN( + DocumentStore::CreateResult create_result, + DocumentStore::Create(&filesystem, document_store_dir, &clock, + schema_store.get())); + std::unique_ptr<DocumentStore> document_store = + std::move(create_result.document_store); + + DocumentProto document = CreateDocument("namespace", "uri"); + + for (auto s : state) { + // It's ok that this is the same document over and over. We'll create a new + // document_id for it and still insert the proto into the underlying log. + benchmark::DoNotOptimize(document_store->Put(document)); + } +} +BENCHMARK(BM_Put); + +void BM_GetSameDocument(benchmark::State& state) { + Filesystem filesystem; + Clock clock; + + std::string directory = GetTestTempDir() + "/icing"; + DestructibleDirectory ddir(filesystem, directory); + + std::string document_store_dir = directory + "/store"; + std::unique_ptr<SchemaStore> schema_store = + CreateSchemaStore(filesystem, directory, &clock); + + filesystem.CreateDirectoryRecursively(document_store_dir.data()); + ICING_ASSERT_OK_AND_ASSIGN( + DocumentStore::CreateResult create_result, + DocumentStore::Create(&filesystem, document_store_dir, &clock, + schema_store.get())); + std::unique_ptr<DocumentStore> document_store = + std::move(create_result.document_store); + + ICING_ASSERT_OK(document_store->Put(CreateDocument("namespace", "uri"))); + + for (auto s : state) { + benchmark::DoNotOptimize(document_store->Get("namespace", "uri")); + } +} +BENCHMARK(BM_GetSameDocument); + +void BM_Delete(benchmark::State& state) { + Filesystem filesystem; + Clock clock; + + std::string directory = GetTestTempDir() + "/icing"; + DestructibleDirectory ddir(filesystem, directory); + + std::string document_store_dir = directory + "/store"; + std::unique_ptr<SchemaStore> schema_store = + CreateSchemaStore(filesystem, directory, &clock); + + filesystem.CreateDirectoryRecursively(document_store_dir.data()); + ICING_ASSERT_OK_AND_ASSIGN( + DocumentStore::CreateResult create_result, + DocumentStore::Create(&filesystem, document_store_dir, &clock, + schema_store.get())); + std::unique_ptr<DocumentStore> document_store = + std::move(create_result.document_store); + + DocumentProto document = CreateDocument("namespace", "uri"); + + for (auto s : state) { + state.PauseTiming(); + ICING_ASSERT_OK(document_store->Put(document)); + state.ResumeTiming(); + + benchmark::DoNotOptimize(document_store->Delete("namespace", "uri")); + } +} +BENCHMARK(BM_Delete); + } // namespace } // namespace lib diff --git a/icing/store/document-store_test.cc b/icing/store/document-store_test.cc index ad3b7c4..b1a6357 100644 --- a/icing/store/document-store_test.cc +++ b/icing/store/document-store_test.cc @@ -15,6 +15,7 @@ #include "icing/store/document-store.h" #include <cstdint> +#include <filesystem> #include <limits> #include <memory> #include <string> @@ -40,6 +41,7 @@ #include "icing/store/corpus-id.h" #include "icing/store/document-filter-data.h" #include "icing/store/document-id.h" +#include "icing/store/document-log-creator.h" #include "icing/store/namespace-id.h" #include "icing/testing/common-matchers.h" #include "icing/testing/fake-clock.h" @@ -105,6 +107,22 @@ UsageReport CreateUsageReport(std::string name_space, std::string uri, return usage_report; } +PortableFileBackedProtoLog<DocumentWrapper>::Header ReadDocumentLogHeader( + Filesystem filesystem, const std::string& file_path) { + PortableFileBackedProtoLog<DocumentWrapper>::Header header; + filesystem.PRead(file_path.c_str(), &header, + sizeof(PortableFileBackedProtoLog<DocumentWrapper>::Header), + /*offset=*/0); + return header; +} + +void WriteDocumentLogHeader( + Filesystem filesystem, const std::string& file_path, + PortableFileBackedProtoLog<DocumentWrapper>::Header& header) { + filesystem.Write(file_path.c_str(), &header, + sizeof(PortableFileBackedProtoLog<DocumentWrapper>::Header)); +} + class DocumentStoreTest : public ::testing::Test { protected: DocumentStoreTest() @@ -452,14 +470,18 @@ TEST_F(DocumentStoreTest, DeleteNonexistentDocumentNotFound) { // Validates that deleting something non-existing won't append anything to // ground truth int64_t document_log_size_before = filesystem_.GetFileSize( - absl_ports::StrCat(document_store_dir_, "/document_log").c_str()); + absl_ports::StrCat(document_store_dir_, "/", + DocumentLogCreator::GetDocumentLogFilename()) + .c_str()); EXPECT_THAT( document_store->Delete("nonexistent_namespace", "nonexistent_uri"), StatusIs(libtextclassifier3::StatusCode::NOT_FOUND)); int64_t document_log_size_after = filesystem_.GetFileSize( - absl_ports::StrCat(document_store_dir_, "/document_log").c_str()); + absl_ports::StrCat(document_store_dir_, "/", + DocumentLogCreator::GetDocumentLogFilename()) + .c_str()); EXPECT_THAT(document_log_size_before, Eq(document_log_size_after)); } @@ -538,13 +560,17 @@ TEST_F(DocumentStoreTest, DeleteByNamespaceNonexistentNamespaceNotFound) { // Validates that deleting something non-existing won't append anything to // ground truth int64_t document_log_size_before = filesystem_.GetFileSize( - absl_ports::StrCat(document_store_dir_, "/document_log").c_str()); + absl_ports::StrCat(document_store_dir_, "/", + DocumentLogCreator::GetDocumentLogFilename()) + .c_str()); EXPECT_THAT(doc_store->DeleteByNamespace("nonexistent_namespace").status, StatusIs(libtextclassifier3::StatusCode::NOT_FOUND)); int64_t document_log_size_after = filesystem_.GetFileSize( - absl_ports::StrCat(document_store_dir_, "/document_log").c_str()); + absl_ports::StrCat(document_store_dir_, "/", + DocumentLogCreator::GetDocumentLogFilename()) + .c_str()); EXPECT_THAT(document_log_size_before, Eq(document_log_size_after)); } @@ -607,7 +633,9 @@ TEST_F(DocumentStoreTest, DeleteByNamespaceRecoversOk) { EXPECT_THAT(group_result.num_docs_deleted, Eq(2)); document_log_size_before = filesystem_.GetFileSize( - absl_ports::StrCat(document_store_dir_, "/document_log").c_str()); + absl_ports::StrCat(document_store_dir_, "/", + DocumentLogCreator::GetDocumentLogFilename()) + .c_str()); } // Destructors should update checksum and persist all data to file. CorruptDocStoreHeaderChecksumFile(); @@ -621,7 +649,9 @@ TEST_F(DocumentStoreTest, DeleteByNamespaceRecoversOk) { // Make sure we didn't add anything to the ground truth after we recovered. int64_t document_log_size_after = filesystem_.GetFileSize( - absl_ports::StrCat(document_store_dir_, "/document_log").c_str()); + absl_ports::StrCat(document_store_dir_, "/", + DocumentLogCreator::GetDocumentLogFilename()) + .c_str()); EXPECT_EQ(document_log_size_before, document_log_size_after); EXPECT_THAT(doc_store->Get(document1.namespace_(), document1.uri()), @@ -730,13 +760,17 @@ TEST_F(DocumentStoreTest, DeleteBySchemaTypeNonexistentSchemaTypeNotFound) { // Validates that deleting something non-existing won't append anything to // ground truth int64_t document_log_size_before = filesystem_.GetFileSize( - absl_ports::StrCat(document_store_dir_, "/document_log").c_str()); + absl_ports::StrCat(document_store_dir_, "/", + DocumentLogCreator::GetDocumentLogFilename()) + .c_str()); EXPECT_THAT(document_store->DeleteBySchemaType("nonexistent_type").status, StatusIs(libtextclassifier3::StatusCode::NOT_FOUND)); int64_t document_log_size_after = filesystem_.GetFileSize( - absl_ports::StrCat(document_store_dir_, "/document_log").c_str()); + absl_ports::StrCat(document_store_dir_, "/", + DocumentLogCreator::GetDocumentLogFilename()) + .c_str()); EXPECT_THAT(document_log_size_before, Eq(document_log_size_after)); } @@ -809,7 +843,9 @@ TEST_F(DocumentStoreTest, DeleteBySchemaTypeRecoversOk) { EXPECT_THAT(group_result.num_docs_deleted, Eq(1)); document_log_size_before = filesystem_.GetFileSize( - absl_ports::StrCat(document_store_dir_, "/document_log").c_str()); + absl_ports::StrCat(document_store_dir_, "/", + DocumentLogCreator::GetDocumentLogFilename()) + .c_str()); } // Destructors should update checksum and persist all data to file. CorruptDocStoreHeaderChecksumFile(); @@ -823,7 +859,9 @@ TEST_F(DocumentStoreTest, DeleteBySchemaTypeRecoversOk) { // Make sure we didn't add anything to the ground truth after we recovered. int64_t document_log_size_after = filesystem_.GetFileSize( - absl_ports::StrCat(document_store_dir_, "/document_log").c_str()); + absl_ports::StrCat(document_store_dir_, "/", + DocumentLogCreator::GetDocumentLogFilename()) + .c_str()); EXPECT_EQ(document_log_size_before, document_log_size_after); EXPECT_THAT(document_store->Get(email_document_id), @@ -901,7 +939,9 @@ TEST_F(DocumentStoreTest, DeletedSchemaTypeFromSchemaStoreRecoversOk) { IsOkAndHolds(EqualsProto(message_document))); document_log_size_before = filesystem_.GetFileSize( - absl_ports::StrCat(document_store_dir_, "/document_log").c_str()); + absl_ports::StrCat(document_store_dir_, "/", + DocumentLogCreator::GetDocumentLogFilename()) + .c_str()); } // Destructors should update checksum and persist all data to file. CorruptDocStoreHeaderChecksumFile(); @@ -923,7 +963,9 @@ TEST_F(DocumentStoreTest, DeletedSchemaTypeFromSchemaStoreRecoversOk) { // Make sure we didn't add anything to the ground truth after we recovered. int64_t document_log_size_after = filesystem_.GetFileSize( - absl_ports::StrCat(document_store_dir_, "/document_log").c_str()); + absl_ports::StrCat(document_store_dir_, "/", + DocumentLogCreator::GetDocumentLogFilename()) + .c_str()); EXPECT_EQ(document_log_size_before, document_log_size_after); EXPECT_THAT(document_store->Get(email_document_id), @@ -968,7 +1010,9 @@ TEST_F(DocumentStoreTest, OptimizeInto) { ICING_ASSERT_OK(doc_store->Put(document2)); ICING_ASSERT_OK(doc_store->Put(document3)); - std::string original_document_log = document_store_dir_ + "/document_log"; + std::string original_document_log = absl_ports::StrCat( + document_store_dir_, "/", DocumentLogCreator::GetDocumentLogFilename()); + int64_t original_size = filesystem_.GetFileSize(original_document_log.c_str()); @@ -979,7 +1023,8 @@ TEST_F(DocumentStoreTest, OptimizeInto) { HasSubstr("directory is the same"))); std::string optimized_dir = document_store_dir_ + "_optimize"; - std::string optimized_document_log = optimized_dir + "/document_log"; + std::string optimized_document_log = + optimized_dir + "/" + DocumentLogCreator::GetDocumentLogFilename(); // Validates that the optimized document log has the same size if nothing is // deleted @@ -1067,8 +1112,8 @@ TEST_F(DocumentStoreTest, ShouldRecoverFromDataLoss) { DocumentProto document = DocumentBuilder().SetKey("namespace", "uri").Build(); const std::string serialized_document = document.SerializeAsString(); - const std::string document_log_file = - absl_ports::StrCat(document_store_dir_, "/document_log"); + const std::string document_log_file = absl_ports::StrCat( + document_store_dir_, "/", DocumentLogCreator::GetDocumentLogFilename()); int64_t file_size = filesystem_.GetFileSize(document_log_file.c_str()); filesystem_.PWrite(document_log_file.c_str(), file_size, serialized_document.data(), serialized_document.size()); @@ -2919,8 +2964,8 @@ TEST_F(DocumentStoreTest, UsageScoresShouldBeAvailableAfterDataLoss) { DocumentProto document = DocumentBuilder().SetKey("namespace", "uri").Build(); const std::string serialized_document = document.SerializeAsString(); - const std::string document_log_file = - absl_ports::StrCat(document_store_dir_, "/document_log"); + const std::string document_log_file = absl_ports::StrCat( + document_store_dir_, "/", DocumentLogCreator::GetDocumentLogFilename()); int64_t file_size = filesystem_.GetFileSize(document_log_file.c_str()); filesystem_.PWrite(document_log_file.c_str(), file_size, serialized_document.data(), serialized_document.size()); @@ -3043,7 +3088,9 @@ TEST_F(DocumentStoreTest, DetectPartialDataLoss) { const std::string serialized_document = document.SerializeAsString(); const std::string document_log_file = - absl_ports::StrCat(document_store_dir_, "/document_log"); + absl_ports::StrCat(document_store_dir_, "/", + DocumentLogCreator::GetDocumentLogFilename()) + .c_str(); int64_t file_size = filesystem_.GetFileSize(document_log_file.c_str()); filesystem_.PWrite(document_log_file.c_str(), file_size, serialized_document.data(), serialized_document.size()); @@ -3060,8 +3107,8 @@ TEST_F(DocumentStoreTest, DetectPartialDataLoss) { TEST_F(DocumentStoreTest, DetectCompleteDataLoss) { int64_t corruptible_offset; - const std::string document_log_file = - absl_ports::StrCat(document_store_dir_, "/document_log"); + const std::string document_log_file = absl_ports::StrCat( + document_store_dir_, "/", DocumentLogCreator::GetDocumentLogFilename()); { // Can put and delete fine. ICING_ASSERT_OK_AND_ASSIGN( @@ -3088,8 +3135,30 @@ TEST_F(DocumentStoreTest, DetectCompleteDataLoss) { // "Corrupt" the persisted content written in the log. We can't recover if // the persisted data was corrupted. std::string corruption = "abc"; - filesystem_.PWrite(document_log_file.c_str(), /*offset=*/corruptible_offset, - corruption.data(), corruption.size()); + filesystem_.PWrite(document_log_file.c_str(), + /*offset=*/corruptible_offset, corruption.data(), + corruption.size()); + + { + // "Corrupt" the content written in the log. Make the corrupt document + // smaller than our original one so we don't accidentally write past our + // file. + DocumentProto document = + DocumentBuilder().SetKey("invalid_namespace", "invalid_uri").Build(); + std::string serialized_document = document.SerializeAsString(); + ASSERT_TRUE(filesystem_.PWrite( + document_log_file.c_str(), corruptible_offset, + serialized_document.data(), serialized_document.size())); + + PortableFileBackedProtoLog<DocumentWrapper>::Header header = + ReadDocumentLogHeader(filesystem_, document_log_file); + + // Set dirty bit to true to reflect that something changed in the log. + header.SetDirtyFlag(true); + header.SetHeaderChecksum(header.CalculateHeaderChecksum()); + + WriteDocumentLogHeader(filesystem_, document_log_file, header); + } // Successfully recover from a data loss issue. ICING_ASSERT_OK_AND_ASSIGN( @@ -3106,8 +3175,8 @@ TEST_F(DocumentStoreTest, DetectCompleteDataLoss) { // the document store header. // // This causes a problem now because this cl changes behavior to not consider an -// InitializeDerivedFiles failure to be a recovery if there is nothing to -// recover because the doocument store is empty. +// InitializeExistingDerivedFiles failure to be a recovery if there is nothing +// to recover because the doocument store is empty. #define DISABLE_BACKWARDS_COMPAT_TEST #ifndef DISABLE_BACKWARDS_COMPAT_TEST TEST_F(DocumentStoreTest, LoadScoreCacheAndInitializeSuccessfully) { @@ -3667,6 +3736,126 @@ TEST_F(DocumentStoreTest, InitializeDontForceRecoveryKeepsInvalidDocument) { } } +TEST_F(DocumentStoreTest, MigrateToPortableFileBackedProtoLog) { + // Set up schema. + SchemaProto schema = + SchemaBuilder() + .AddType(SchemaTypeConfigBuilder() + .SetType("email") + .AddProperty( + PropertyConfigBuilder() + .SetName("subject") + .SetDataTypeString(MATCH_EXACT, TOKENIZER_PLAIN) + .SetCardinality(CARDINALITY_OPTIONAL)) + .AddProperty( + PropertyConfigBuilder() + .SetName("body") + .SetDataTypeString(MATCH_EXACT, TOKENIZER_PLAIN) + .SetCardinality(CARDINALITY_OPTIONAL))) + .Build(); + + std::string schema_store_dir = schema_store_dir_ + "_migrate"; + filesystem_.DeleteDirectoryRecursively(schema_store_dir.c_str()); + filesystem_.CreateDirectoryRecursively(schema_store_dir.c_str()); + ICING_ASSERT_OK_AND_ASSIGN( + std::unique_ptr<SchemaStore> schema_store, + SchemaStore::Create(&filesystem_, schema_store_dir, &fake_clock_)); + + ASSERT_THAT(schema_store->SetSchema(schema), IsOk()); + + // Create dst directory that we'll initialize the DocumentStore over. + std::string document_store_dir = document_store_dir_ + "_migrate"; + ASSERT_THAT( + filesystem_.DeleteDirectoryRecursively(document_store_dir.c_str()), true); + ASSERT_THAT( + filesystem_.CreateDirectoryRecursively(document_store_dir.c_str()), true); + + // Copy the testdata files into our DocumentStore directory + std::string document_store_without_portable_log; + if (IsAndroidX86()) { + document_store_without_portable_log = GetTestFilePath( + "icing/testdata/not_portable_log/" + "icing_search_engine_android_x86/document_dir"); + } else if (IsAndroidArm()) { + document_store_without_portable_log = GetTestFilePath( + "icing/testdata/not_portable_log/" + "icing_search_engine_android_arm/document_dir"); + } else if (IsIosPlatform()) { + document_store_without_portable_log = GetTestFilePath( + "icing/testdata/not_portable_log/" + "icing_search_engine_ios/document_dir"); + } else { + document_store_without_portable_log = GetTestFilePath( + "icing/testdata/not_portable_log/" + "icing_search_engine_linux/document_dir"); + } + + ASSERT_TRUE(filesystem_.CopyDirectory( + document_store_without_portable_log.c_str(), document_store_dir.c_str(), + /*recursive=*/true)); + + // Initialize the DocumentStore over our copied files. + InitializeStatsProto initialize_stats; + ICING_ASSERT_OK_AND_ASSIGN( + DocumentStore::CreateResult create_result, + DocumentStore::Create(&filesystem_, document_store_dir, &fake_clock_, + schema_store.get(), + /*force_recovery_and_revalidate_documents=*/false, + &initialize_stats)); + std::unique_ptr<DocumentStore> document_store = + std::move(create_result.document_store); + + // These are the documents that are stored in the testdata files. Do not + // change unless you're also updating the testdata files. + DocumentProto document1 = DocumentBuilder() + .SetKey("namespace1", "uri1") + .SetSchema("email") + .SetCreationTimestampMs(10) + .AddStringProperty("subject", "foo") + .AddStringProperty("body", "bar") + .Build(); + + DocumentProto document2 = DocumentBuilder() + .SetKey("namespace1", "uri2") + .SetSchema("email") + .SetCreationTimestampMs(20) + .SetScore(321) + .AddStringProperty("body", "baz bat") + .Build(); + + DocumentProto document3 = DocumentBuilder() + .SetKey("namespace2", "uri1") + .SetSchema("email") + .SetCreationTimestampMs(30) + .SetScore(123) + .AddStringProperty("subject", "phoo") + .Build(); + + // Check that we didn't lose anything. A migration also doesn't technically + // count as a recovery. + EXPECT_THAT(create_result.data_loss, Eq(DataLoss::NONE)); + EXPECT_FALSE(initialize_stats.has_document_store_recovery_cause()); + + // Document 1 and 3 were put normally, and document 2 was deleted in our + // testdata files. + // + // Check by namespace, uri + EXPECT_THAT(document_store->Get(document1.namespace_(), document1.uri()), + IsOkAndHolds(EqualsProto(document1))); + EXPECT_THAT(document_store->Get(document2.namespace_(), document2.uri()), + StatusIs(libtextclassifier3::StatusCode::NOT_FOUND)); + EXPECT_THAT(document_store->Get(document3.namespace_(), document3.uri()), + IsOkAndHolds(EqualsProto(document3))); + + // Check by document_id + EXPECT_THAT(document_store->Get(/*document_id=*/0), + IsOkAndHolds(EqualsProto(document1))); + EXPECT_THAT(document_store->Get(/*document_id=*/1), + StatusIs(libtextclassifier3::StatusCode::NOT_FOUND)); + EXPECT_THAT(document_store->Get(/*document_id=*/2), + IsOkAndHolds(EqualsProto(document3))); +} + } // namespace } // namespace lib diff --git a/icing/testdata/not_portable_log/icing_search_engine_android_arm/document_dir/corpus_mapper/key_mapper_dir/key_mapper.h b/icing/testdata/not_portable_log/icing_search_engine_android_arm/document_dir/corpus_mapper/key_mapper_dir/key_mapper.h Binary files differnew file mode 100644 index 0000000..67a5bbd --- /dev/null +++ b/icing/testdata/not_portable_log/icing_search_engine_android_arm/document_dir/corpus_mapper/key_mapper_dir/key_mapper.h diff --git a/icing/testdata/not_portable_log/icing_search_engine_android_arm/document_dir/key_mapper_dir/key_mapper.h b/icing/testdata/not_portable_log/icing_search_engine_android_arm/document_dir/key_mapper_dir/key_mapper.h Binary files differnew file mode 100644 index 0000000..70ec244 --- /dev/null +++ b/icing/testdata/not_portable_log/icing_search_engine_android_arm/document_dir/key_mapper_dir/key_mapper.h diff --git a/icing/testdata/not_portable_log/icing_search_engine_android_arm/document_dir/namespace_mapper/key_mapper_dir/key_mapper.h b/icing/testdata/not_portable_log/icing_search_engine_android_arm/document_dir/namespace_mapper/key_mapper_dir/key_mapper.h Binary files differnew file mode 100644 index 0000000..6177f48 --- /dev/null +++ b/icing/testdata/not_portable_log/icing_search_engine_android_arm/document_dir/namespace_mapper/key_mapper_dir/key_mapper.h diff --git a/icing/testdata/not_portable_log/icing_search_engine_android_arm/index_dir/idx/lite.lexicon.h b/icing/testdata/not_portable_log/icing_search_engine_android_arm/index_dir/idx/lite.lexicon.h Binary files differnew file mode 100644 index 0000000..624444e --- /dev/null +++ b/icing/testdata/not_portable_log/icing_search_engine_android_arm/index_dir/idx/lite.lexicon.h diff --git a/icing/testdata/not_portable_log/icing_search_engine_android_arm/index_dir/idx/main/main-lexicon.h b/icing/testdata/not_portable_log/icing_search_engine_android_arm/index_dir/idx/main/main-lexicon.h Binary files differnew file mode 100644 index 0000000..82d6712 --- /dev/null +++ b/icing/testdata/not_portable_log/icing_search_engine_android_arm/index_dir/idx/main/main-lexicon.h diff --git a/icing/testdata/not_portable_log/icing_search_engine_android_arm/schema_dir/schema_type_mapper/key_mapper_dir/key_mapper.h b/icing/testdata/not_portable_log/icing_search_engine_android_arm/schema_dir/schema_type_mapper/key_mapper_dir/key_mapper.h Binary files differnew file mode 100644 index 0000000..d1b1cc1 --- /dev/null +++ b/icing/testdata/not_portable_log/icing_search_engine_android_arm/schema_dir/schema_type_mapper/key_mapper_dir/key_mapper.h diff --git a/icing/testdata/not_portable_log/icing_search_engine_android_x86/document_dir/corpus_mapper/key_mapper_dir/key_mapper.h b/icing/testdata/not_portable_log/icing_search_engine_android_x86/document_dir/corpus_mapper/key_mapper_dir/key_mapper.h Binary files differnew file mode 100644 index 0000000..67a5bbd --- /dev/null +++ b/icing/testdata/not_portable_log/icing_search_engine_android_x86/document_dir/corpus_mapper/key_mapper_dir/key_mapper.h diff --git a/icing/testdata/not_portable_log/icing_search_engine_android_x86/document_dir/key_mapper_dir/key_mapper.h b/icing/testdata/not_portable_log/icing_search_engine_android_x86/document_dir/key_mapper_dir/key_mapper.h Binary files differnew file mode 100644 index 0000000..70ec244 --- /dev/null +++ b/icing/testdata/not_portable_log/icing_search_engine_android_x86/document_dir/key_mapper_dir/key_mapper.h diff --git a/icing/testdata/not_portable_log/icing_search_engine_android_x86/document_dir/namespace_mapper/key_mapper_dir/key_mapper.h b/icing/testdata/not_portable_log/icing_search_engine_android_x86/document_dir/namespace_mapper/key_mapper_dir/key_mapper.h Binary files differnew file mode 100644 index 0000000..6177f48 --- /dev/null +++ b/icing/testdata/not_portable_log/icing_search_engine_android_x86/document_dir/namespace_mapper/key_mapper_dir/key_mapper.h diff --git a/icing/testdata/not_portable_log/icing_search_engine_android_x86/index_dir/idx/lite.lexicon.h b/icing/testdata/not_portable_log/icing_search_engine_android_x86/index_dir/idx/lite.lexicon.h Binary files differnew file mode 100644 index 0000000..624444e --- /dev/null +++ b/icing/testdata/not_portable_log/icing_search_engine_android_x86/index_dir/idx/lite.lexicon.h diff --git a/icing/testdata/not_portable_log/icing_search_engine_android_x86/index_dir/idx/main/main-lexicon.h b/icing/testdata/not_portable_log/icing_search_engine_android_x86/index_dir/idx/main/main-lexicon.h Binary files differnew file mode 100644 index 0000000..82d6712 --- /dev/null +++ b/icing/testdata/not_portable_log/icing_search_engine_android_x86/index_dir/idx/main/main-lexicon.h diff --git a/icing/testdata/not_portable_log/icing_search_engine_android_x86/schema_dir/schema_type_mapper/key_mapper_dir/key_mapper.h b/icing/testdata/not_portable_log/icing_search_engine_android_x86/schema_dir/schema_type_mapper/key_mapper_dir/key_mapper.h Binary files differnew file mode 100644 index 0000000..d1b1cc1 --- /dev/null +++ b/icing/testdata/not_portable_log/icing_search_engine_android_x86/schema_dir/schema_type_mapper/key_mapper_dir/key_mapper.h diff --git a/icing/testdata/not_portable_log/icing_search_engine_ios/document_dir/corpus_mapper/key_mapper_dir/key_mapper.h b/icing/testdata/not_portable_log/icing_search_engine_ios/document_dir/corpus_mapper/key_mapper_dir/key_mapper.h Binary files differnew file mode 100644 index 0000000..67a5bbd --- /dev/null +++ b/icing/testdata/not_portable_log/icing_search_engine_ios/document_dir/corpus_mapper/key_mapper_dir/key_mapper.h diff --git a/icing/testdata/not_portable_log/icing_search_engine_ios/document_dir/key_mapper_dir/key_mapper.h b/icing/testdata/not_portable_log/icing_search_engine_ios/document_dir/key_mapper_dir/key_mapper.h Binary files differnew file mode 100644 index 0000000..70ec244 --- /dev/null +++ b/icing/testdata/not_portable_log/icing_search_engine_ios/document_dir/key_mapper_dir/key_mapper.h diff --git a/icing/testdata/not_portable_log/icing_search_engine_ios/document_dir/namespace_mapper/key_mapper_dir/key_mapper.h b/icing/testdata/not_portable_log/icing_search_engine_ios/document_dir/namespace_mapper/key_mapper_dir/key_mapper.h Binary files differnew file mode 100644 index 0000000..6177f48 --- /dev/null +++ b/icing/testdata/not_portable_log/icing_search_engine_ios/document_dir/namespace_mapper/key_mapper_dir/key_mapper.h diff --git a/icing/testdata/not_portable_log/icing_search_engine_ios/index_dir/idx/lite.lexicon.h b/icing/testdata/not_portable_log/icing_search_engine_ios/index_dir/idx/lite.lexicon.h Binary files differnew file mode 100644 index 0000000..624444e --- /dev/null +++ b/icing/testdata/not_portable_log/icing_search_engine_ios/index_dir/idx/lite.lexicon.h diff --git a/icing/testdata/not_portable_log/icing_search_engine_ios/index_dir/idx/main/main-lexicon.h b/icing/testdata/not_portable_log/icing_search_engine_ios/index_dir/idx/main/main-lexicon.h Binary files differnew file mode 100644 index 0000000..82d6712 --- /dev/null +++ b/icing/testdata/not_portable_log/icing_search_engine_ios/index_dir/idx/main/main-lexicon.h diff --git a/icing/testdata/not_portable_log/icing_search_engine_ios/schema_dir/schema_type_mapper/key_mapper_dir/key_mapper.h b/icing/testdata/not_portable_log/icing_search_engine_ios/schema_dir/schema_type_mapper/key_mapper_dir/key_mapper.h Binary files differnew file mode 100644 index 0000000..d1b1cc1 --- /dev/null +++ b/icing/testdata/not_portable_log/icing_search_engine_ios/schema_dir/schema_type_mapper/key_mapper_dir/key_mapper.h diff --git a/icing/testdata/not_portable_log/icing_search_engine_linux/document_dir/corpus_mapper/key_mapper_dir/key_mapper.h b/icing/testdata/not_portable_log/icing_search_engine_linux/document_dir/corpus_mapper/key_mapper_dir/key_mapper.h Binary files differnew file mode 100644 index 0000000..67a5bbd --- /dev/null +++ b/icing/testdata/not_portable_log/icing_search_engine_linux/document_dir/corpus_mapper/key_mapper_dir/key_mapper.h diff --git a/icing/testdata/not_portable_log/icing_search_engine_linux/document_dir/key_mapper_dir/key_mapper.h b/icing/testdata/not_portable_log/icing_search_engine_linux/document_dir/key_mapper_dir/key_mapper.h Binary files differnew file mode 100644 index 0000000..70ec244 --- /dev/null +++ b/icing/testdata/not_portable_log/icing_search_engine_linux/document_dir/key_mapper_dir/key_mapper.h diff --git a/icing/testdata/not_portable_log/icing_search_engine_linux/document_dir/namespace_mapper/key_mapper_dir/key_mapper.h b/icing/testdata/not_portable_log/icing_search_engine_linux/document_dir/namespace_mapper/key_mapper_dir/key_mapper.h Binary files differnew file mode 100644 index 0000000..6177f48 --- /dev/null +++ b/icing/testdata/not_portable_log/icing_search_engine_linux/document_dir/namespace_mapper/key_mapper_dir/key_mapper.h diff --git a/icing/testdata/not_portable_log/icing_search_engine_linux/index_dir/idx/lite.lexicon.h b/icing/testdata/not_portable_log/icing_search_engine_linux/index_dir/idx/lite.lexicon.h Binary files differnew file mode 100644 index 0000000..c95dba8 --- /dev/null +++ b/icing/testdata/not_portable_log/icing_search_engine_linux/index_dir/idx/lite.lexicon.h diff --git a/icing/testdata/not_portable_log/icing_search_engine_linux/index_dir/idx/main/main-lexicon.h b/icing/testdata/not_portable_log/icing_search_engine_linux/index_dir/idx/main/main-lexicon.h Binary files differnew file mode 100644 index 0000000..6b87b9c --- /dev/null +++ b/icing/testdata/not_portable_log/icing_search_engine_linux/index_dir/idx/main/main-lexicon.h diff --git a/icing/testdata/not_portable_log/icing_search_engine_linux/schema_dir/schema_type_mapper/key_mapper_dir/key_mapper.h b/icing/testdata/not_portable_log/icing_search_engine_linux/schema_dir/schema_type_mapper/key_mapper_dir/key_mapper.h Binary files differnew file mode 100644 index 0000000..d1b1cc1 --- /dev/null +++ b/icing/testdata/not_portable_log/icing_search_engine_linux/schema_dir/schema_type_mapper/key_mapper_dir/key_mapper.h diff --git a/icing/testing/common-matchers.h b/icing/testing/common-matchers.h index 8d8bdf2..f83fe0a 100644 --- a/icing/testing/common-matchers.h +++ b/icing/testing/common-matchers.h @@ -121,7 +121,6 @@ MATCHER_P(EqualsSetSchemaResult, expected, "") { const SchemaStore::SetSchemaResult& actual = arg; if (actual.success == expected.success && - actual.index_incompatible == expected.index_incompatible && actual.old_schema_type_ids_changed == expected.old_schema_type_ids_changed && actual.schema_types_deleted_by_name == @@ -131,7 +130,12 @@ MATCHER_P(EqualsSetSchemaResult, expected, "") { actual.schema_types_incompatible_by_name == expected.schema_types_incompatible_by_name && actual.schema_types_incompatible_by_id == - expected.schema_types_incompatible_by_id) { + expected.schema_types_incompatible_by_id && + actual.schema_types_new_by_name == expected.schema_types_new_by_name && + actual.schema_types_changed_fully_compatible_by_name == + expected.schema_types_changed_fully_compatible_by_name && + actual.schema_types_index_incompatible_by_name == + expected.schema_types_index_incompatible_by_name) { return true; } @@ -191,37 +195,82 @@ MATCHER_P(EqualsSetSchemaResult, expected, "") { absl_ports::NumberFormatter()), "]"); + // Format schema_types_new_by_name + std::string actual_schema_types_new_by_name = absl_ports::StrCat( + "[", absl_ports::StrJoin(actual.schema_types_new_by_name, ","), "]"); + + std::string expected_schema_types_new_by_name = absl_ports::StrCat( + "[", absl_ports::StrJoin(expected.schema_types_new_by_name, ","), "]"); + + // Format schema_types_changed_fully_compatible_by_name + std::string actual_schema_types_changed_fully_compatible_by_name = + absl_ports::StrCat( + "[", + absl_ports::StrJoin( + actual.schema_types_changed_fully_compatible_by_name, ","), + "]"); + + std::string expected_schema_types_changed_fully_compatible_by_name = + absl_ports::StrCat( + "[", + absl_ports::StrJoin( + expected.schema_types_changed_fully_compatible_by_name, ","), + "]"); + + // Format schema_types_deleted_by_id + std::string actual_schema_types_index_incompatible_by_name = + absl_ports::StrCat( + "[", + absl_ports::StrJoin(actual.schema_types_index_incompatible_by_name, + ","), + "]"); + + std::string expected_schema_types_index_incompatible_by_name = + absl_ports::StrCat( + "[", + absl_ports::StrJoin(expected.schema_types_index_incompatible_by_name, + ","), + "]"); + *result_listener << IcingStringUtil::StringPrintf( "\nExpected {\n" "\tsuccess=%d,\n" - "\tindex_incompatible=%d,\n" "\told_schema_type_ids_changed=%s,\n" "\tschema_types_deleted_by_name=%s,\n" "\tschema_types_deleted_by_id=%s,\n" "\tschema_types_incompatible_by_name=%s,\n" "\tschema_types_incompatible_by_id=%s\n" + "\tschema_types_new_by_name=%s,\n" + "\tschema_types_index_incompatible_by_name=%s,\n" + "\tschema_types_changed_fully_compatible_by_name=%s\n" "}\n" "Actual {\n" "\tsuccess=%d,\n" - "\tindex_incompatible=%d,\n" "\told_schema_type_ids_changed=%s,\n" "\tschema_types_deleted_by_name=%s,\n" "\tschema_types_deleted_by_id=%s,\n" "\tschema_types_incompatible_by_name=%s,\n" "\tschema_types_incompatible_by_id=%s\n" + "\tschema_types_new_by_name=%s,\n" + "\tschema_types_index_incompatible_by_name=%s,\n" + "\tschema_types_changed_fully_compatible_by_name=%s\n" "}\n", - expected.success, expected.index_incompatible, - expected_old_schema_type_ids_changed.c_str(), + expected.success, expected_old_schema_type_ids_changed.c_str(), expected_schema_types_deleted_by_name.c_str(), expected_schema_types_deleted_by_id.c_str(), expected_schema_types_incompatible_by_name.c_str(), - expected_schema_types_incompatible_by_id.c_str(), actual.success, - actual.index_incompatible, actual_old_schema_type_ids_changed.c_str(), + expected_schema_types_incompatible_by_id.c_str(), + expected_schema_types_new_by_name.c_str(), + expected_schema_types_changed_fully_compatible_by_name.c_str(), + expected_schema_types_index_incompatible_by_name.c_str(), actual.success, + actual_old_schema_type_ids_changed.c_str(), actual_schema_types_deleted_by_name.c_str(), actual_schema_types_deleted_by_id.c_str(), actual_schema_types_incompatible_by_name.c_str(), - actual_schema_types_incompatible_by_id.c_str()); - + actual_schema_types_incompatible_by_id.c_str(), + actual_schema_types_new_by_name.c_str(), + actual_schema_types_changed_fully_compatible_by_name.c_str(), + actual_schema_types_index_incompatible_by_name.c_str()); return false; } diff --git a/icing/tokenization/reverse_jni/reverse-jni-language-segmenter.cc b/icing/tokenization/reverse_jni/reverse-jni-language-segmenter.cc index ae2a372..76219b5 100644 --- a/icing/tokenization/reverse_jni/reverse-jni-language-segmenter.cc +++ b/icing/tokenization/reverse_jni/reverse-jni-language-segmenter.cc @@ -99,11 +99,13 @@ class ReverseJniLanguageSegmenterIterator : public LanguageSegmenter::Iterator { return text_.substr(term_start_.utf8_index(), term_length); } - libtextclassifier3::StatusOr<CharacterIterator> CalculateTermStart() { + libtextclassifier3::StatusOr<CharacterIterator> CalculateTermStart() + override { return term_start_; } - libtextclassifier3::StatusOr<CharacterIterator> CalculateTermEndExclusive() { + libtextclassifier3::StatusOr<CharacterIterator> CalculateTermEndExclusive() + override { return term_end_exclusive_; } diff --git a/java/tests/instrumentation/src/com/google/android/icing/IcingSearchEngineTest.java b/java/tests/instrumentation/src/com/google/android/icing/IcingSearchEngineTest.java index 2019033..0cee80c 100644 --- a/java/tests/instrumentation/src/com/google/android/icing/IcingSearchEngineTest.java +++ b/java/tests/instrumentation/src/com/google/android/icing/IcingSearchEngineTest.java @@ -496,12 +496,13 @@ public final class IcingSearchEngineTest { assertStatusOk( icingSearchEngine.setSchema(schema, /*ignoreErrorsAndDeleteDocuments=*/ false).getStatus()); - // String: "我每天走路去上班。" - // ^ ^ ^ ^^ - // UTF16 idx: 0 1 3 5 6 - // Breaks into segments: "我", "每天", "走路", "去", "上班" - String chinese = "我每天走路去上班。"; - assertThat(chinese.length()).isEqualTo(9); + // String: "天是蓝的" + // ^ ^^ ^ + // UTF16 idx: 0 1 2 3 + // Breaks into segments: "天", "是", "蓝", "的" + // "The sky is blue" + String chinese = "天是蓝的"; + assertThat(chinese.length()).isEqualTo(4); DocumentProto emailDocument1 = createEmailDocument("namespace", "uri1").toBuilder() .addProperties(PropertyProto.newBuilder().setName("subject").addStringValues(chinese)) @@ -511,7 +512,7 @@ public final class IcingSearchEngineTest { // Search and request snippet matching but no windowing. SearchSpecProto searchSpec = SearchSpecProto.newBuilder() - .setQuery("每") + .setQuery("是") .setTermMatchType(TermMatchType.Code.PREFIX) .build(); ResultSpecProto resultSpecProto = @@ -550,9 +551,9 @@ public final class IcingSearchEngineTest { int matchStart = matchProto.getExactMatchUtf16Position(); int matchEnd = matchStart + matchProto.getExactMatchUtf16Length(); assertThat(matchStart).isEqualTo(1); - assertThat(matchEnd).isEqualTo(3); + assertThat(matchEnd).isEqualTo(2); String match = content.substring(matchStart, matchEnd); - assertThat(match).isEqualTo("每天"); + assertThat(match).isEqualTo("是"); } @Test diff --git a/proto/icing/proto/document.proto b/proto/icing/proto/document.proto index 9a4e5b9..2e8321b 100644 --- a/proto/icing/proto/document.proto +++ b/proto/icing/proto/document.proto @@ -209,7 +209,7 @@ message DeleteBySchemaTypeResultProto { } // Result of a call to IcingSearchEngine.DeleteByQuery -// Next tag: 3 +// Next tag: 4 message DeleteByQueryResultProto { // Status code can be one of: // OK @@ -224,5 +224,7 @@ message DeleteByQueryResultProto { optional StatusProto status = 1; // Stats for delete execution performance. - optional DeleteStatsProto delete_stats = 2; + optional DeleteByQueryStatsProto delete_by_query_stats = 3; + + reserved 2; } diff --git a/proto/icing/proto/logging.proto b/proto/icing/proto/logging.proto index 29f7f80..4dcfecf 100644 --- a/proto/icing/proto/logging.proto +++ b/proto/icing/proto/logging.proto @@ -181,8 +181,7 @@ message QueryStatsProto { } // Stats of the top-level functions IcingSearchEngine::Delete, -// IcingSearchEngine::DeleteByNamespace, IcingSearchEngine::DeleteBySchemaType, -// IcingSearchEngine::DeleteByQuery. +// IcingSearchEngine::DeleteByNamespace, IcingSearchEngine::DeleteBySchemaType. // Next tag: 4 message DeleteStatsProto { // Overall time used for the function call. @@ -196,8 +195,10 @@ message DeleteStatsProto { // Delete one document. SINGLE = 1; - // Delete by query. - QUERY = 2; + // Delete by query. This value is deprecated. + // IcingSearchEngine::DeleteByQuery will return a DeleteByQueryStatsProto + // rather than a DeleteStatsProto. + DEPRECATED_QUERY = 2 [deprecated = true]; // Delete by namespace. NAMESPACE = 3; @@ -211,3 +212,32 @@ message DeleteStatsProto { // Number of documents deleted by this call. optional int32 num_documents_deleted = 3; } + +// Stats of the top-level functions IcingSearchEngine::DeleteByQuery. +// Next tag: 9 +message DeleteByQueryStatsProto { + // Overall time used for the function call. + optional int32 latency_ms = 1; + + // Number of documents deleted by this call. + optional int32 num_documents_deleted = 2; + + // The UTF-8 length of the query string + optional int32 query_length = 3; + + // Number of terms in the query string. + optional int32 num_terms = 4; + + // Number of namespaces filtered. + optional int32 num_namespaces_filtered = 5; + + // Number of schema types filtered. + optional int32 num_schema_types_filtered = 6; + + // Time used to parse the query, including 2 parts: tokenizing and + // transforming tokens into an iterator tree. + optional int32 parse_query_latency_ms = 7; + + // Time used to delete each document. + optional int32 document_removal_latency_ms = 8; +} diff --git a/proto/icing/proto/schema.proto b/proto/icing/proto/schema.proto index 4188a8c..c611cbf 100644 --- a/proto/icing/proto/schema.proto +++ b/proto/icing/proto/schema.proto @@ -197,7 +197,7 @@ message SchemaProto { } // Result of a call to IcingSearchEngine.SetSchema -// Next tag: 4 +// Next tag: 8 message SetSchemaResultProto { // Status code can be one of: // OK @@ -221,6 +221,21 @@ message SetSchemaResultProto { // documents that fail validation against the new schema types would also be // deleted. repeated string incompatible_schema_types = 3; + + // Schema types that did not exist in the previous schema and were added with + // the new schema type. + repeated string new_schema_types = 4; + + // Schema types that were changed in a way that was backwards compatible and + // didn't invalidate the index. + repeated string fully_compatible_changed_schema_types = 5; + + // Schema types that were changed in a way that was backwards compatible, but + // invalidated the index. + repeated string index_incompatible_changed_schema_types = 6; + + // Overall time used for the function call. + optional int32 latency_ms = 7; } // Result of a call to IcingSearchEngine.GetSchema diff --git a/synced_AOSP_CL_number.txt b/synced_AOSP_CL_number.txt index 35ad6d9..a431b89 100644 --- a/synced_AOSP_CL_number.txt +++ b/synced_AOSP_CL_number.txt @@ -1 +1 @@ -set(synced_AOSP_CL_number=378695940) +set(synced_AOSP_CL_number=380898610) |