diff options
Diffstat (limited to 'icing')
38 files changed, 1853 insertions, 680 deletions
diff --git a/icing/absl_ports/str_join.cc b/icing/absl_ports/str_join.cc new file mode 100644 index 0000000..2d105ca --- /dev/null +++ b/icing/absl_ports/str_join.cc @@ -0,0 +1,41 @@ +// Copyright (C) 2019 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "icing/absl_ports/str_join.h" + +namespace icing { +namespace lib { +namespace absl_ports { + +std::vector<std::string_view> StrSplit(std::string_view text, + std::string_view sep) { + std::vector<std::string_view> substrings; + size_t separator_position = text.find(sep); + size_t current_start = 0; + size_t current_end = separator_position; + while (separator_position != std::string_view::npos) { + substrings.push_back( + text.substr(current_start, current_end - current_start)); + current_start = current_end + sep.length(); + separator_position = text.find(sep, current_start); + current_end = separator_position; + } + current_end = text.length(); + substrings.push_back(text.substr(current_start, current_end - current_start)); + return substrings; +} + +} // namespace absl_ports +} // namespace lib +} // namespace icing diff --git a/icing/absl_ports/str_join.h b/icing/absl_ports/str_join.h index 7c8936a..f66a977 100644 --- a/icing/absl_ports/str_join.h +++ b/icing/absl_ports/str_join.h @@ -17,6 +17,7 @@ #include <string> #include <string_view> +#include <vector> #include "icing/absl_ports/str_cat.h" @@ -104,6 +105,9 @@ std::string StrJoin(const Container& container, std::string_view sep) { return absl_ports::StrJoin(container, sep, DefaultFormatter()); } +std::vector<std::string_view> StrSplit(std::string_view text, + std::string_view sep); + } // namespace absl_ports } // namespace lib } // namespace icing diff --git a/icing/file/file-backed-proto-log.h b/icing/file/file-backed-proto-log.h index 95511ac..aa5a031 100644 --- a/icing/file/file-backed-proto-log.h +++ b/icing/file/file-backed-proto-log.h @@ -168,11 +168,15 @@ class FileBackedProtoLog { // A successfully initialized log. std::unique_ptr<FileBackedProtoLog<ProtoT>> proto_log; - // Whether there was some data loss while initializing from a previous - // state. This can happen if the file is corrupted or some previously added - // data was unpersisted. This may be used to signal that any derived data - // off of the proto log may need to be regenerated. - bool data_loss; + // The data status after initializing from a previous state. Data loss can + // happen if the file is corrupted or some previously added data was + // unpersisted. This may be used to signal that any derived data off of the + // proto log may need to be regenerated. + enum DataStatus { NO_DATA_LOSS, PARTIAL_LOSS, COMPLETE_LOSS } data_status; + + bool has_data_loss() { + return data_status == PARTIAL_LOSS || data_status == COMPLETE_LOSS; + } }; // Factory method to create, initialize, and return a FileBackedProtoLog. Will @@ -182,10 +186,11 @@ class FileBackedProtoLog { // added data was unpersisted, the log will rewind to the last-good state. The // log saves these checkpointed "good" states when PersistToDisk() is called // or the log is safely destructed. If the log rewinds successfully to the - // last-good state, then the returned CreateResult.data_loss indicates - // there was some data loss so that any derived data may know that it - // needs to be updated. If the log re-initializes successfully without any - // data loss, the boolean will be false. + // last-good state, then the returned CreateResult.data_status indicates + // whether it has a data loss and what kind of data loss it is (partial or + // complete) so that any derived data may know that it needs to be updated. If + // the log re-initializes successfully without any data loss, + // CreateResult.data_status will be NO_DATA_LOSS. // // Params: // filesystem: Handles system level calls @@ -511,7 +516,7 @@ FileBackedProtoLog<ProtoT>::InitializeNewFile(const Filesystem* filesystem, std::unique_ptr<FileBackedProtoLog<ProtoT>>( new FileBackedProtoLog<ProtoT>(filesystem, file_path, std::move(header))), - /*data_loss=*/false}; + /*data_status=*/CreateResult::NO_DATA_LOSS}; return create_result; } @@ -561,15 +566,14 @@ FileBackedProtoLog<ProtoT>::InitializeExistingFile(const Filesystem* filesystem, } header->max_proto_size = options.max_proto_size; - bool data_loss = false; + typename CreateResult::DataStatus data_status = CreateResult::NO_DATA_LOSS; ICING_ASSIGN_OR_RETURN(Crc32 calculated_log_checksum, ComputeChecksum(filesystem, file_path, Crc32(), sizeof(Header), file_size)); // Double check that the log checksum is the same as the one that was // persisted last time. If not, we start recovery logic. if (header->log_checksum != calculated_log_checksum.Get()) { - // Need to rewind the proto log since the checksums don't match - data_loss = true; + // Need to rewind the proto log since the checksums don't match. // Worst case, we have to rewind the entire log back to just the header int64_t last_known_good = sizeof(Header); @@ -585,10 +589,12 @@ FileBackedProtoLog<ProtoT>::InitializeExistingFile(const Filesystem* filesystem, // Check if it matches our last rewind state. If so, this becomes our last // good state and we can safely truncate and recover from here. last_known_good = header->rewind_offset; + data_status = CreateResult::PARTIAL_LOSS; } else { // Otherwise, we're going to truncate the entire log and this resets the // checksum to an empty log state. header->log_checksum = 0; + data_status = CreateResult::COMPLETE_LOSS; } if (!filesystem->Truncate(file_path.c_str(), last_known_good)) { @@ -604,7 +610,7 @@ FileBackedProtoLog<ProtoT>::InitializeExistingFile(const Filesystem* filesystem, std::unique_ptr<FileBackedProtoLog<ProtoT>>( new FileBackedProtoLog<ProtoT>(filesystem, file_path, std::move(header))), - data_loss}; + data_status}; return create_result; } diff --git a/icing/file/file-backed-proto-log_test.cc b/icing/file/file-backed-proto-log_test.cc index fad5248..7410d2b 100644 --- a/icing/file/file-backed-proto-log_test.cc +++ b/icing/file/file-backed-proto-log_test.cc @@ -77,7 +77,7 @@ TEST_F(FileBackedProtoLogTest, Initialize) { FileBackedProtoLog<DocumentProto>::Options(compress_, max_proto_size_))); EXPECT_THAT(create_result.proto_log, NotNull()); - EXPECT_FALSE(create_result.data_loss); + EXPECT_FALSE(create_result.has_data_loss()); // Can't recreate the same file with different options. ASSERT_THAT(FileBackedProtoLog<DocumentProto>::Create( @@ -96,7 +96,7 @@ TEST_F(FileBackedProtoLogTest, WriteProtoTooLarge) { FileBackedProtoLog<DocumentProto>::Options(compress_, max_proto_size))); auto proto_log = std::move(create_result.proto_log); - ASSERT_FALSE(create_result.data_loss); + ASSERT_FALSE(create_result.has_data_loss()); DocumentProto document = DocumentBuilder().SetKey("namespace", "uri").Build(); @@ -113,7 +113,7 @@ TEST_F(FileBackedProtoLogTest, ReadProtoWrongKProtoMagic) { FileBackedProtoLog<DocumentProto>::Options(compress_, max_proto_size_))); auto proto_log = std::move(create_result.proto_log); - ASSERT_FALSE(create_result.data_loss); + ASSERT_FALSE(create_result.has_data_loss()); // Write a proto DocumentProto document = DocumentBuilder().SetKey("namespace", "uri").Build(); @@ -147,7 +147,7 @@ TEST_F(FileBackedProtoLogTest, ReadWriteUncompressedProto) { FileBackedProtoLog<DocumentProto>::Options( /*compress_in=*/false, max_proto_size_))); auto proto_log = std::move(create_result.proto_log); - ASSERT_FALSE(create_result.data_loss); + ASSERT_FALSE(create_result.has_data_loss()); // Write the first proto DocumentProto document1 = @@ -194,7 +194,7 @@ TEST_F(FileBackedProtoLogTest, ReadWriteUncompressedProto) { FileBackedProtoLog<DocumentProto>::Options( /*compress_in=*/false, max_proto_size_))); auto recreated_proto_log = std::move(create_result.proto_log); - ASSERT_FALSE(create_result.data_loss); + ASSERT_FALSE(create_result.has_data_loss()); // Write a third proto DocumentProto document3 = @@ -216,7 +216,7 @@ TEST_F(FileBackedProtoLogTest, ReadWriteCompressedProto) { FileBackedProtoLog<DocumentProto>::Options( /*compress_in=*/true, max_proto_size_))); auto proto_log = std::move(create_result.proto_log); - ASSERT_FALSE(create_result.data_loss); + ASSERT_FALSE(create_result.has_data_loss()); // Write the first proto DocumentProto document1 = @@ -263,7 +263,7 @@ TEST_F(FileBackedProtoLogTest, ReadWriteCompressedProto) { FileBackedProtoLog<DocumentProto>::Options( /*compress_in=*/true, max_proto_size_))); auto recreated_proto_log = std::move(create_result.proto_log); - ASSERT_FALSE(create_result.data_loss); + ASSERT_FALSE(create_result.has_data_loss()); // Write a third proto DocumentProto document3 = @@ -283,7 +283,7 @@ TEST_F(FileBackedProtoLogTest, CorruptHeader) { FileBackedProtoLog<DocumentProto>::Options(compress_, max_proto_size_))); auto recreated_proto_log = std::move(create_result.proto_log); - EXPECT_FALSE(create_result.data_loss); + EXPECT_FALSE(create_result.has_data_loss()); int corrupt_offset = offsetof(FileBackedProtoLog<DocumentProto>::Header, rewind_offset); @@ -312,7 +312,7 @@ TEST_F(FileBackedProtoLogTest, CorruptContent) { FileBackedProtoLog<DocumentProto>::Options(compress_, max_proto_size_))); auto proto_log = std::move(create_result.proto_log); - EXPECT_FALSE(create_result.data_loss); + EXPECT_FALSE(create_result.has_data_loss()); DocumentProto document = DocumentBuilder().SetKey("namespace1", "uri1").Build(); @@ -338,7 +338,7 @@ TEST_F(FileBackedProtoLogTest, CorruptContent) { FileBackedProtoLog<DocumentProto>::Options(compress_, max_proto_size_))); auto proto_log = std::move(create_result.proto_log); - ASSERT_TRUE(create_result.data_loss); + ASSERT_TRUE(create_result.has_data_loss()); // Lost everything in the log since the rewind position doesn't help if // there's been data corruption within the persisted region @@ -363,7 +363,7 @@ TEST_F(FileBackedProtoLogTest, PersistToDisk) { FileBackedProtoLog<DocumentProto>::Options(compress_, max_proto_size_))); auto proto_log = std::move(create_result.proto_log); - ASSERT_FALSE(create_result.data_loss); + ASSERT_FALSE(create_result.has_data_loss()); // Write and persist the first proto ICING_ASSERT_OK_AND_ASSIGN(document1_offset, @@ -407,7 +407,7 @@ TEST_F(FileBackedProtoLogTest, PersistToDisk) { FileBackedProtoLog<DocumentProto>::Options(compress_, max_proto_size_))); auto proto_log = std::move(create_result.proto_log); - ASSERT_TRUE(create_result.data_loss); + ASSERT_TRUE(create_result.has_data_loss()); // Check that everything was persisted across instances ASSERT_THAT(proto_log->ReadProto(document1_offset), @@ -433,7 +433,7 @@ TEST_F(FileBackedProtoLogTest, Iterator) { FileBackedProtoLog<DocumentProto>::Options(compress_, max_proto_size_))); auto proto_log = std::move(create_result.proto_log); - ASSERT_FALSE(create_result.data_loss); + ASSERT_FALSE(create_result.has_data_loss()); { // Empty iterator @@ -484,7 +484,7 @@ TEST_F(FileBackedProtoLogTest, ComputeChecksum) { FileBackedProtoLog<DocumentProto>::Options(compress_, max_proto_size_))); auto proto_log = std::move(create_result.proto_log); - ASSERT_FALSE(create_result.data_loss); + ASSERT_FALSE(create_result.has_data_loss()); ICING_EXPECT_OK(proto_log->WriteProto(document)); @@ -502,7 +502,7 @@ TEST_F(FileBackedProtoLogTest, ComputeChecksum) { FileBackedProtoLog<DocumentProto>::Options(compress_, max_proto_size_))); auto proto_log = std::move(create_result.proto_log); - ASSERT_FALSE(create_result.data_loss); + ASSERT_FALSE(create_result.has_data_loss()); // Checksum should be consistent across instances EXPECT_THAT(proto_log->ComputeChecksum(), IsOkAndHolds(Eq(checksum))); @@ -528,7 +528,7 @@ TEST_F(FileBackedProtoLogTest, EraseProtoShouldSetZero) { FileBackedProtoLog<DocumentProto>::Options(compress_, max_proto_size_))); auto proto_log = std::move(create_result.proto_log); - ASSERT_FALSE(create_result.data_loss); + ASSERT_FALSE(create_result.has_data_loss()); // Writes and erases proto ICING_ASSERT_OK_AND_ASSIGN(int64_t document1_offset, @@ -561,7 +561,7 @@ TEST_F(FileBackedProtoLogTest, EraseProtoShouldReturnNotFound) { FileBackedProtoLog<DocumentProto>::Options(compress_, max_proto_size_))); auto proto_log = std::move(create_result.proto_log); - ASSERT_FALSE(create_result.data_loss); + ASSERT_FALSE(create_result.has_data_loss()); // Writes 2 protos ICING_ASSERT_OK_AND_ASSIGN(int64_t document1_offset, @@ -603,7 +603,7 @@ TEST_F(FileBackedProtoLogTest, ChecksumShouldBeCorrectWithErasedProto) { FileBackedProtoLog<DocumentProto>::Options(compress_, max_proto_size_))); auto proto_log = std::move(create_result.proto_log); - ASSERT_FALSE(create_result.data_loss); + ASSERT_FALSE(create_result.has_data_loss()); // Writes 3 protos ICING_ASSERT_OK_AND_ASSIGN(int64_t document1_offset, @@ -631,7 +631,7 @@ TEST_F(FileBackedProtoLogTest, ChecksumShouldBeCorrectWithErasedProto) { FileBackedProtoLog<DocumentProto>::Options(compress_, max_proto_size_))); auto proto_log = std::move(create_result.proto_log); - ASSERT_FALSE(create_result.data_loss); + ASSERT_FALSE(create_result.has_data_loss()); // Erases the 2nd proto that is now before the rewind position. Checksum is // updated. @@ -651,7 +651,7 @@ TEST_F(FileBackedProtoLogTest, ChecksumShouldBeCorrectWithErasedProto) { FileBackedProtoLog<DocumentProto>::Options(compress_, max_proto_size_))); auto proto_log = std::move(create_result.proto_log); - ASSERT_FALSE(create_result.data_loss); + ASSERT_FALSE(create_result.has_data_loss()); // Append a new document which is after the rewind position. ICING_ASSERT_OK(proto_log->WriteProto(document4)); @@ -673,7 +673,7 @@ TEST_F(FileBackedProtoLogTest, ChecksumShouldBeCorrectWithErasedProto) { FileBackedProtoLog<DocumentProto>::Options(compress_, max_proto_size_))); auto proto_log = std::move(create_result.proto_log); - EXPECT_FALSE(create_result.data_loss); + EXPECT_FALSE(create_result.has_data_loss()); } } diff --git a/icing/icing-search-engine-with-icu-file_test.cc b/icing/icing-search-engine-with-icu-file_test.cc index 32ac9e6..1cb8620 100644 --- a/icing/icing-search-engine-with-icu-file_test.cc +++ b/icing/icing-search-engine-with-icu-file_test.cc @@ -63,9 +63,10 @@ SchemaProto CreateMessageSchema() { body->set_property_name("body"); body->set_data_type(PropertyConfigProto::DataType::STRING); body->set_cardinality(PropertyConfigProto::Cardinality::REQUIRED); - body->mutable_indexing_config()->set_term_match_type(TermMatchType::PREFIX); - body->mutable_indexing_config()->set_tokenizer_type( - IndexingConfig::TokenizerType::PLAIN); + body->mutable_string_indexing_config()->set_term_match_type( + TermMatchType::PREFIX); + body->mutable_string_indexing_config()->set_tokenizer_type( + StringIndexingConfig::TokenizerType::PLAIN); return schema; } diff --git a/icing/icing-search-engine.cc b/icing/icing-search-engine.cc index fdec473..08ceafd 100644 --- a/icing/icing-search-engine.cc +++ b/icing/icing-search-engine.cc @@ -35,6 +35,7 @@ #include "icing/legacy/index/icing-filesystem.h" #include "icing/proto/document.pb.h" #include "icing/proto/initialize.pb.h" +#include "icing/proto/logging.pb.h" #include "icing/proto/optimize.pb.h" #include "icing/proto/persist.pb.h" #include "icing/proto/reset.pb.h" @@ -59,6 +60,7 @@ #include "icing/util/crc32.h" #include "icing/util/logging.h" #include "icing/util/status-macros.h" +#include "icing/util/timer.h" #include "unicode/uloc.h" namespace icing { @@ -150,6 +152,10 @@ std::string MakeSchemaDirectoryPath(const std::string& base_dir) { void TransformStatus(const libtextclassifier3::Status& internal_status, StatusProto* status_proto) { StatusProto::Code code; + if (!internal_status.ok()) { + ICING_LOG(WARNING) << "Error: " << internal_status.error_code() + << ", Message: " << internal_status.error_message(); + } switch (internal_status.CanonicalCode()) { case libtextclassifier3::StatusCode::OK: code = StatusProto::OK; @@ -257,45 +263,101 @@ InitializeResultProto IcingSearchEngine::InternalInitialize() { ICING_VLOG(1) << "Initializing IcingSearchEngine in dir: " << options_.base_dir(); + // Measure the latency of the initialization process. + Timer initialize_timer; + InitializeResultProto result_proto; StatusProto* result_status = result_proto.mutable_status(); + NativeInitializeStats* initialize_stats = + result_proto.mutable_native_initialize_stats(); if (initialized_) { // Already initialized. result_status->set_code(StatusProto::OK); + initialize_stats->set_latency_ms(initialize_timer.GetElapsedMilliseconds()); + initialize_stats->set_num_documents(document_store_->num_documents()); return result_proto; } // Releases result / query cache if any result_state_manager_.InvalidateAllResultStates(); - libtextclassifier3::Status status = InitializeMembers(); + libtextclassifier3::Status status = InitializeMembers(initialize_stats); if (!status.ok()) { TransformStatus(status, result_status); + initialize_stats->set_latency_ms(initialize_timer.GetElapsedMilliseconds()); return result_proto; } // Even if each subcomponent initialized fine independently, we need to // check if they're consistent with each other. if (!CheckConsistency().ok()) { - ICING_VLOG(1) - << "IcingSearchEngine in inconsistent state, regenerating all " - "derived data"; - status = RegenerateDerivedFiles(); + // The total checksum doesn't match the stored value, it could be one of the + // following cases: + // 1. Icing is initialized the first time in this directory. + // 2. Non-checksumed changes have been made to some files. + if (index_->last_added_document_id() == kInvalidDocumentId && + document_store_->last_added_document_id() == kInvalidDocumentId && + absl_ports::IsNotFound(schema_store_->GetSchema().status())) { + // First time initialize. Not recovering but creating all the files. + // We need to explicitly clear the recovery-related fields because some + // sub-components may not be able to tell if the storage is being + // initialized the first time or has lost some files. Sub-components may + // already have set these fields in earlier steps. + *initialize_stats = NativeInitializeStats(); + status = RegenerateDerivedFiles(); + } else { + ICING_VLOG(1) + << "IcingSearchEngine in inconsistent state, regenerating all " + "derived data"; + // Total checksum mismatch may not be the root cause of document store + // recovery. Preserve the root cause that was set by the document store. + bool should_log_document_store_recovery_cause = + initialize_stats->document_store_recovery_cause() == + NativeInitializeStats::NONE; + if (should_log_document_store_recovery_cause) { + initialize_stats->set_document_store_recovery_cause( + NativeInitializeStats::TOTAL_CHECKSUM_MISMATCH); + } + initialize_stats->set_index_restoration_cause( + NativeInitializeStats::TOTAL_CHECKSUM_MISMATCH); + status = RegenerateDerivedFiles(initialize_stats, + should_log_document_store_recovery_cause); + } } else { - status = RestoreIndex(); + DocumentId last_stored_document_id = + document_store_->last_added_document_id(); + DocumentId last_indexed_document_id = index_->last_added_document_id(); + if (last_stored_document_id != last_indexed_document_id) { + if (last_stored_document_id == kInvalidDocumentId) { + // Document store is empty but index is not. Reset the index. + status = index_->Reset(); + } else { + // Index is inconsistent with the document store, we need to restore the + // index. + initialize_stats->set_index_restoration_cause( + NativeInitializeStats::INCONSISTENT_WITH_GROUND_TRUTH); + Timer index_restore_timer; + status = RestoreIndexIfNeeded(); + initialize_stats->set_index_restoration_latency_ms( + index_restore_timer.GetElapsedMilliseconds()); + } + } } if (status.ok() || absl_ports::IsDataLoss(status)) { initialized_ = true; } TransformStatus(status, result_status); + initialize_stats->set_latency_ms(initialize_timer.GetElapsedMilliseconds()); return result_proto; } -libtextclassifier3::Status IcingSearchEngine::InitializeMembers() { +libtextclassifier3::Status IcingSearchEngine::InitializeMembers( + NativeInitializeStats* initialize_stats) { + ICING_RETURN_ERROR_IF_NULL(initialize_stats); ICING_RETURN_IF_ERROR(InitializeOptions()); - ICING_RETURN_IF_ERROR(InitializeSchemaStore()); - ICING_RETURN_IF_ERROR(InitializeDocumentStore()); + ICING_RETURN_IF_ERROR(InitializeSchemaStore(initialize_stats)); + ICING_RETURN_IF_ERROR(InitializeDocumentStore(initialize_stats)); // TODO(b/156383798) : Resolve how to specify the locale. language_segmenter_factory::SegmenterOptions segmenter_options( @@ -306,7 +368,7 @@ libtextclassifier3::Status IcingSearchEngine::InitializeMembers() { TC3_ASSIGN_OR_RETURN(normalizer_, normalizer_factory::Create(options_.max_token_length())); - ICING_RETURN_IF_ERROR(InitializeIndex()); + ICING_RETURN_IF_ERROR(InitializeIndex(initialize_stats)); return libtextclassifier3::Status::OK; } @@ -323,7 +385,10 @@ libtextclassifier3::Status IcingSearchEngine::InitializeOptions() { return libtextclassifier3::Status::OK; } -libtextclassifier3::Status IcingSearchEngine::InitializeSchemaStore() { +libtextclassifier3::Status IcingSearchEngine::InitializeSchemaStore( + NativeInitializeStats* initialize_stats) { + ICING_RETURN_ERROR_IF_NULL(initialize_stats); + const std::string schema_store_dir = MakeSchemaDirectoryPath(options_.base_dir()); // Make sure the sub-directory exists @@ -332,12 +397,16 @@ libtextclassifier3::Status IcingSearchEngine::InitializeSchemaStore() { absl_ports::StrCat("Could not create directory: ", schema_store_dir)); } ICING_ASSIGN_OR_RETURN( - schema_store_, SchemaStore::Create(filesystem_.get(), schema_store_dir)); + schema_store_, SchemaStore::Create(filesystem_.get(), schema_store_dir, + initialize_stats)); return libtextclassifier3::Status::OK; } -libtextclassifier3::Status IcingSearchEngine::InitializeDocumentStore() { +libtextclassifier3::Status IcingSearchEngine::InitializeDocumentStore( + NativeInitializeStats* initialize_stats) { + ICING_RETURN_ERROR_IF_NULL(initialize_stats); + const std::string document_dir = MakeDocumentDirectoryPath(options_.base_dir()); // Make sure the sub-directory exists @@ -348,12 +417,15 @@ libtextclassifier3::Status IcingSearchEngine::InitializeDocumentStore() { ICING_ASSIGN_OR_RETURN( document_store_, DocumentStore::Create(filesystem_.get(), document_dir, clock_.get(), - schema_store_.get())); + schema_store_.get(), initialize_stats)); return libtextclassifier3::Status::OK; } -libtextclassifier3::Status IcingSearchEngine::InitializeIndex() { +libtextclassifier3::Status IcingSearchEngine::InitializeIndex( + NativeInitializeStats* initialize_stats) { + ICING_RETURN_ERROR_IF_NULL(initialize_stats); + const std::string index_dir = MakeIndexDirectoryPath(options_.base_dir()); // Make sure the sub-directory exists if (!filesystem_->CreateDirectoryRecursively(index_dir.c_str())) { @@ -371,11 +443,18 @@ libtextclassifier3::Status IcingSearchEngine::InitializeIndex() { absl_ports::StrCat("Could not recreate directory: ", index_dir)); } + initialize_stats->set_index_restoration_cause( + NativeInitializeStats::IO_ERROR); + // Try recreating it from scratch and re-indexing everything. ICING_ASSIGN_OR_RETURN(index_, Index::Create(index_options, filesystem_.get(), icing_filesystem_.get())); - ICING_RETURN_IF_ERROR(RestoreIndex()); + + Timer restore_timer; + ICING_RETURN_IF_ERROR(RestoreIndexIfNeeded()); + initialize_stats->set_index_restoration_latency_ms( + restore_timer.GetElapsedMilliseconds()); } else { // Index was created fine. index_ = std::move(index_or).ValueOrDie(); @@ -414,11 +493,25 @@ libtextclassifier3::Status IcingSearchEngine::CheckConsistency() { return libtextclassifier3::Status::OK; } -libtextclassifier3::Status IcingSearchEngine::RegenerateDerivedFiles() { +libtextclassifier3::Status IcingSearchEngine::RegenerateDerivedFiles( + NativeInitializeStats* initialize_stats, bool log_document_store_stats) { + // Measure the latency of the data recovery. The cause of the recovery should + // be logged by the caller. + Timer timer; ICING_RETURN_IF_ERROR( document_store_->UpdateSchemaStore(schema_store_.get())); + if (initialize_stats != nullptr && log_document_store_stats) { + initialize_stats->set_document_store_recovery_latency_ms( + timer.GetElapsedMilliseconds()); + } + // Restart timer. + timer = Timer(); ICING_RETURN_IF_ERROR(index_->Reset()); - ICING_RETURN_IF_ERROR(RestoreIndex()); + ICING_RETURN_IF_ERROR(RestoreIndexIfNeeded()); + if (initialize_stats != nullptr) { + initialize_stats->set_index_restoration_latency_ms( + timer.GetElapsedMilliseconds()); + } const std::string header_file = MakeHeaderFilename(options_.base_dir().c_str()); @@ -513,7 +606,7 @@ SetSchemaResultProto IcingSearchEngine::SetSchema( return result_proto; } - status = RestoreIndex(); + status = RestoreIndexIfNeeded(); if (!status.ok()) { TransformStatus(status, result_status); return result_proto; @@ -580,8 +673,12 @@ PutResultProto IcingSearchEngine::Put(const DocumentProto& document) { PutResultProto IcingSearchEngine::Put(DocumentProto&& document) { ICING_VLOG(1) << "Writing document to document store"; + Timer put_timer; + PutResultProto result_proto; StatusProto* result_status = result_proto.mutable_status(); + NativePutDocumentStats* put_document_stats = + result_proto.mutable_native_put_document_stats(); // Lock must be acquired before validation because the DocumentStore uses // the schema file to validate, and the schema could be changed in @@ -590,12 +687,14 @@ PutResultProto IcingSearchEngine::Put(DocumentProto&& document) { if (!initialized_) { result_status->set_code(StatusProto::FAILED_PRECONDITION); result_status->set_message("IcingSearchEngine has not been initialized!"); + put_document_stats->set_latency_ms(put_timer.GetElapsedMilliseconds()); return result_proto; } - auto document_id_or = document_store_->Put(document); + auto document_id_or = document_store_->Put(document, put_document_stats); if (!document_id_or.ok()) { TransformStatus(document_id_or.status(), result_status); + put_document_stats->set_latency_ms(put_timer.GetElapsedMilliseconds()); return result_proto; } DocumentId document_id = document_id_or.ValueOrDie(); @@ -605,13 +704,17 @@ PutResultProto IcingSearchEngine::Put(DocumentProto&& document) { index_.get(), CreateIndexProcessorOptions(options_)); if (!index_processor_or.ok()) { TransformStatus(index_processor_or.status(), result_status); + put_document_stats->set_latency_ms(put_timer.GetElapsedMilliseconds()); return result_proto; } std::unique_ptr<IndexProcessor> index_processor = std::move(index_processor_or).ValueOrDie(); - auto status = index_processor->IndexDocument(document, document_id); + auto status = + index_processor->IndexDocument(document, document_id, put_document_stats); + TransformStatus(status, result_status); + put_document_stats->set_latency_ms(put_timer.GetElapsedMilliseconds()); return result_proto; } @@ -889,7 +992,7 @@ OptimizeResultProto IcingSearchEngine::Optimize() { return result_proto; } - libtextclassifier3::Status index_restoration_status = RestoreIndex(); + libtextclassifier3::Status index_restoration_status = RestoreIndexIfNeeded(); if (!index_restoration_status.ok()) { status = absl_ports::Annotate( absl_ports::InternalError( @@ -1300,19 +1403,21 @@ libtextclassifier3::Status IcingSearchEngine::OptimizeDocumentStore() { return libtextclassifier3::Status::OK; } -libtextclassifier3::Status IcingSearchEngine::RestoreIndex() { +libtextclassifier3::Status IcingSearchEngine::RestoreIndexIfNeeded() { DocumentId last_stored_document_id = document_store_->last_added_document_id(); DocumentId last_indexed_document_id = index_->last_added_document_id(); - if (last_stored_document_id == kInvalidDocumentId) { - // Nothing to index. Make sure the index is also empty. - if (last_indexed_document_id != kInvalidDocumentId) { - ICING_RETURN_IF_ERROR(index_->Reset()); - } + if (last_stored_document_id == last_indexed_document_id) { + // No need to recover. return libtextclassifier3::Status::OK; } + if (last_stored_document_id == kInvalidDocumentId) { + // Document store is empty but index is not. Reset the index. + return index_->Reset(); + } + // TruncateTo ensures that the index does not hold any data that is not // present in the ground truth. If the document store lost some documents, // TruncateTo will ensure that the index does not contain any hits from those @@ -1323,7 +1428,7 @@ libtextclassifier3::Status IcingSearchEngine::RestoreIndex() { DocumentId first_document_to_reindex = (last_indexed_document_id != kInvalidDocumentId) ? index_->last_added_document_id() + 1 - : 0; + : kMinDocumentId; ICING_ASSIGN_OR_RETURN( std::unique_ptr<IndexProcessor> index_processor, diff --git a/icing/icing-search-engine.h b/icing/icing-search-engine.h index 58b8df2..70a9c07 100644 --- a/icing/icing-search-engine.h +++ b/icing/icing-search-engine.h @@ -451,10 +451,12 @@ class IcingSearchEngine { // // Returns: // OK on success + // FAILED_PRECONDITION if initialize_stats is null // RESOURCE_EXHAUSTED if the index runs out of storage // NOT_FOUND if some Document's schema type is not in the SchemaStore // INTERNAL on any I/O errors - libtextclassifier3::Status InitializeMembers() + libtextclassifier3::Status InitializeMembers( + NativeInitializeStats* initialize_stats) ICING_EXCLUSIVE_LOCKS_REQUIRED(mutex_); // Do any validation/setup required for the given IcingSearchEngineOptions @@ -470,8 +472,10 @@ class IcingSearchEngine { // // Returns: // OK on success + // FAILED_PRECONDITION if initialize_stats is null // INTERNAL on I/O error - libtextclassifier3::Status InitializeSchemaStore() + libtextclassifier3::Status InitializeSchemaStore( + NativeInitializeStats* initialize_stats) ICING_EXCLUSIVE_LOCKS_REQUIRED(mutex_); // Do any initialization/recovery necessary to create a DocumentStore @@ -479,8 +483,10 @@ class IcingSearchEngine { // // Returns: // OK on success + // FAILED_PRECONDITION if initialize_stats is null // INTERNAL on I/O error - libtextclassifier3::Status InitializeDocumentStore() + libtextclassifier3::Status InitializeDocumentStore( + NativeInitializeStats* initialize_stats) ICING_EXCLUSIVE_LOCKS_REQUIRED(mutex_); // Do any initialization/recovery necessary to create a DocumentStore @@ -488,10 +494,12 @@ class IcingSearchEngine { // // Returns: // OK on success + // FAILED_PRECONDITION if initialize_stats is null // RESOURCE_EXHAUSTED if the index runs out of storage // NOT_FOUND if some Document's schema type is not in the SchemaStore // INTERNAL on I/O error - libtextclassifier3::Status InitializeIndex() + libtextclassifier3::Status InitializeIndex( + NativeInitializeStats* initialize_stats) ICING_EXCLUSIVE_LOCKS_REQUIRED(mutex_); // Many of the internal components rely on other components' derived data. @@ -514,7 +522,9 @@ class IcingSearchEngine { // Returns: // OK on success // INTERNAL_ERROR on any IO errors - libtextclassifier3::Status RegenerateDerivedFiles() + libtextclassifier3::Status RegenerateDerivedFiles( + NativeInitializeStats* initialize_stats = nullptr, + bool log_document_store_stats = false) ICING_EXCLUSIVE_LOCKS_REQUIRED(mutex_); // Optimizes the DocumentStore by removing any unneeded documents (i.e. @@ -543,7 +553,7 @@ class IcingSearchEngine { // RESOURCE_EXHAUSTED if the index fills up before finishing indexing // NOT_FOUND if some Document's schema type is not in the SchemaStore // INTERNAL_ERROR on any IO errors - libtextclassifier3::Status RestoreIndex() + libtextclassifier3::Status RestoreIndexIfNeeded() ICING_EXCLUSIVE_LOCKS_REQUIRED(mutex_); // Computes the combined checksum of the IcingSearchEngine - includes all its diff --git a/icing/icing-search-engine_fuzz_test.cc b/icing/icing-search-engine_fuzz_test.cc index d31f836..1f59c6e 100644 --- a/icing/icing-search-engine_fuzz_test.cc +++ b/icing/icing-search-engine_fuzz_test.cc @@ -44,9 +44,10 @@ SchemaProto SetTypes() { body->set_property_name("body"); body->set_data_type(PropertyConfigProto::DataType::STRING); body->set_cardinality(PropertyConfigProto::Cardinality::REQUIRED); - body->mutable_indexing_config()->set_term_match_type(TermMatchType::PREFIX); - body->mutable_indexing_config()->set_tokenizer_type( - IndexingConfig::TokenizerType::PLAIN); + body->mutable_string_indexing_config()->set_term_match_type( + TermMatchType::PREFIX); + body->mutable_string_indexing_config()->set_tokenizer_type( + StringIndexingConfig::TokenizerType::PLAIN); return schema; } diff --git a/icing/icing-search-engine_test.cc b/icing/icing-search-engine_test.cc index 06e89f2..b642a94 100644 --- a/icing/icing-search-engine_test.cc +++ b/icing/icing-search-engine_test.cc @@ -41,6 +41,7 @@ #include "icing/testing/common-matchers.h" #include "icing/testing/fake-clock.h" #include "icing/testing/jni-test-helpers.h" +#include "icing/testing/random-string.h" #include "icing/testing/snippet-helpers.h" #include "icing/testing/test-data.h" #include "icing/testing/tmp-directory.h" @@ -161,9 +162,10 @@ SchemaProto CreateMessageSchema() { body->set_property_name("body"); body->set_data_type(PropertyConfigProto::DataType::STRING); body->set_cardinality(PropertyConfigProto::Cardinality::REQUIRED); - body->mutable_indexing_config()->set_term_match_type(TermMatchType::PREFIX); - body->mutable_indexing_config()->set_tokenizer_type( - IndexingConfig::TokenizerType::PLAIN); + body->mutable_string_indexing_config()->set_term_match_type( + TermMatchType::PREFIX); + body->mutable_string_indexing_config()->set_tokenizer_type( + StringIndexingConfig::TokenizerType::PLAIN); return schema; } @@ -177,16 +179,18 @@ SchemaProto CreateEmailSchema() { body->set_property_name("body"); body->set_data_type(PropertyConfigProto::DataType::STRING); body->set_cardinality(PropertyConfigProto::Cardinality::REQUIRED); - body->mutable_indexing_config()->set_term_match_type(TermMatchType::PREFIX); - body->mutable_indexing_config()->set_tokenizer_type( - IndexingConfig::TokenizerType::PLAIN); + body->mutable_string_indexing_config()->set_term_match_type( + TermMatchType::PREFIX); + body->mutable_string_indexing_config()->set_tokenizer_type( + StringIndexingConfig::TokenizerType::PLAIN); auto* subj = type->add_properties(); subj->set_property_name("subject"); subj->set_data_type(PropertyConfigProto::DataType::STRING); subj->set_cardinality(PropertyConfigProto::Cardinality::REQUIRED); - subj->mutable_indexing_config()->set_term_match_type(TermMatchType::PREFIX); - subj->mutable_indexing_config()->set_tokenizer_type( - IndexingConfig::TokenizerType::PLAIN); + subj->mutable_string_indexing_config()->set_term_match_type( + TermMatchType::PREFIX); + subj->mutable_string_indexing_config()->set_tokenizer_type( + StringIndexingConfig::TokenizerType::PLAIN); return schema; } @@ -423,10 +427,7 @@ TEST_F(IcingSearchEngineTest, body->set_schema_type("Person"); body->set_data_type(PropertyConfigProto::DataType::DOCUMENT); body->set_cardinality(PropertyConfigProto::Cardinality::REQUIRED); - body->mutable_indexing_config()->set_term_match_type( - TermMatchType::EXACT_ONLY); - body->mutable_indexing_config()->set_tokenizer_type( - IndexingConfig::TokenizerType::PLAIN); + body->mutable_document_indexing_config()->set_index_nested_properties(true); type = schema.add_types(); type->set_schema_type("Person"); @@ -436,10 +437,7 @@ TEST_F(IcingSearchEngineTest, body->set_schema_type("Message"); body->set_data_type(PropertyConfigProto::DataType::DOCUMENT); body->set_cardinality(PropertyConfigProto::Cardinality::REQUIRED); - body->mutable_indexing_config()->set_term_match_type( - TermMatchType::EXACT_ONLY); - body->mutable_indexing_config()->set_tokenizer_type( - IndexingConfig::TokenizerType::PLAIN); + body->mutable_document_indexing_config()->set_index_nested_properties(true); IcingSearchEngine icing(GetDefaultIcingOptions(), GetTestJniCache()); EXPECT_THAT(icing.Initialize().status(), ProtoIsOk()); @@ -735,7 +733,7 @@ TEST_F(IcingSearchEngineTest, SetSchemaTriggersIndexRestorationAndReturnsOk) { SchemaProto schema_with_no_indexed_property = CreateMessageSchema(); schema_with_no_indexed_property.mutable_types(0) ->mutable_properties(0) - ->clear_indexing_config(); + ->clear_string_indexing_config(); EXPECT_THAT(icing.SetSchema(schema_with_no_indexed_property).status(), ProtoIsOk()); @@ -1775,10 +1773,10 @@ TEST_F(IcingSearchEngineTest, DeleteBySchemaType) { property->set_property_name("subject"); property->set_data_type(PropertyConfigProto::DataType::STRING); property->set_cardinality(PropertyConfigProto::Cardinality::OPTIONAL); - property->mutable_indexing_config()->set_term_match_type( + property->mutable_string_indexing_config()->set_term_match_type( TermMatchType::EXACT_ONLY); - property->mutable_indexing_config()->set_tokenizer_type( - IndexingConfig::TokenizerType::PLAIN); + property->mutable_string_indexing_config()->set_tokenizer_type( + StringIndexingConfig::TokenizerType::PLAIN); // Add an message type type = schema.add_types(); type->set_schema_type("message"); @@ -1786,10 +1784,10 @@ TEST_F(IcingSearchEngineTest, DeleteBySchemaType) { property->set_property_name("body"); property->set_data_type(PropertyConfigProto::DataType::STRING); property->set_cardinality(PropertyConfigProto::Cardinality::OPTIONAL); - property->mutable_indexing_config()->set_term_match_type( + property->mutable_string_indexing_config()->set_term_match_type( TermMatchType::EXACT_ONLY); - property->mutable_indexing_config()->set_tokenizer_type( - IndexingConfig::TokenizerType::PLAIN); + property->mutable_string_indexing_config()->set_tokenizer_type( + StringIndexingConfig::TokenizerType::PLAIN); DocumentProto document1 = DocumentBuilder() .SetKey("namespace1", "uri1") @@ -2205,20 +2203,20 @@ TEST_F(IcingSearchEngineTest, SetSchemaShouldWorkAfterOptimization) { new_property2->set_property_name("property2"); new_property2->set_data_type(PropertyConfigProto::DataType::STRING); new_property2->set_cardinality(PropertyConfigProto::Cardinality::OPTIONAL); - new_property2->mutable_indexing_config()->set_term_match_type( + new_property2->mutable_string_indexing_config()->set_term_match_type( TermMatchType::PREFIX); - new_property2->mutable_indexing_config()->set_tokenizer_type( - IndexingConfig::TokenizerType::PLAIN); + new_property2->mutable_string_indexing_config()->set_tokenizer_type( + StringIndexingConfig::TokenizerType::PLAIN); SchemaProto schema3 = SchemaProto(schema2); auto new_property3 = schema3.mutable_types(0)->add_properties(); new_property3->set_property_name("property3"); new_property3->set_data_type(PropertyConfigProto::DataType::STRING); new_property3->set_cardinality(PropertyConfigProto::Cardinality::OPTIONAL); - new_property3->mutable_indexing_config()->set_term_match_type( + new_property3->mutable_string_indexing_config()->set_term_match_type( TermMatchType::PREFIX); - new_property3->mutable_indexing_config()->set_tokenizer_type( - IndexingConfig::TokenizerType::PLAIN); + new_property3->mutable_string_indexing_config()->set_tokenizer_type( + StringIndexingConfig::TokenizerType::PLAIN); { IcingSearchEngine icing(GetDefaultIcingOptions(), GetTestJniCache()); @@ -2463,9 +2461,10 @@ TEST_F(IcingSearchEngineTest, SearchIncludesDocumentsBeforeTtl) { body->set_property_name("body"); body->set_data_type(PropertyConfigProto::DataType::STRING); body->set_cardinality(PropertyConfigProto::Cardinality::REQUIRED); - body->mutable_indexing_config()->set_term_match_type(TermMatchType::PREFIX); - body->mutable_indexing_config()->set_tokenizer_type( - IndexingConfig::TokenizerType::PLAIN); + body->mutable_string_indexing_config()->set_term_match_type( + TermMatchType::PREFIX); + body->mutable_string_indexing_config()->set_tokenizer_type( + StringIndexingConfig::TokenizerType::PLAIN); DocumentProto document = DocumentBuilder() .SetKey("namespace", "uri") @@ -2513,9 +2512,10 @@ TEST_F(IcingSearchEngineTest, SearchDoesntIncludeDocumentsPastTtl) { body->set_property_name("body"); body->set_data_type(PropertyConfigProto::DataType::STRING); body->set_cardinality(PropertyConfigProto::Cardinality::REQUIRED); - body->mutable_indexing_config()->set_term_match_type(TermMatchType::PREFIX); - body->mutable_indexing_config()->set_tokenizer_type( - IndexingConfig::TokenizerType::PLAIN); + body->mutable_string_indexing_config()->set_term_match_type( + TermMatchType::PREFIX); + body->mutable_string_indexing_config()->set_tokenizer_type( + StringIndexingConfig::TokenizerType::PLAIN); DocumentProto document = DocumentBuilder() .SetKey("namespace", "uri") @@ -2613,10 +2613,10 @@ TEST_F(IcingSearchEngineTest, SearchWorksAfterSchemaTypesCompatiblyModified) { property->set_property_name("body"); property->set_data_type(PropertyConfigProto::DataType::STRING); property->set_cardinality(PropertyConfigProto::Cardinality::OPTIONAL); - property->mutable_indexing_config()->set_term_match_type( + property->mutable_string_indexing_config()->set_term_match_type( TermMatchType::PREFIX); - property->mutable_indexing_config()->set_tokenizer_type( - IndexingConfig::TokenizerType::PLAIN); + property->mutable_string_indexing_config()->set_tokenizer_type( + StringIndexingConfig::TokenizerType::PLAIN); EXPECT_THAT(icing.SetSchema(schema).status(), ProtoIsOk()); @@ -2869,10 +2869,10 @@ TEST_F(IcingSearchEngineTest, RecoverFromInconsistentSchemaStore) { property->set_property_name("body"); property->set_data_type(PropertyConfigProto::DataType::STRING); property->set_cardinality(PropertyConfigProto::Cardinality::REQUIRED); - property->mutable_indexing_config()->set_term_match_type( + property->mutable_string_indexing_config()->set_term_match_type( TermMatchType::PREFIX); - property->mutable_indexing_config()->set_tokenizer_type( - IndexingConfig::TokenizerType::PLAIN); + property->mutable_string_indexing_config()->set_tokenizer_type( + StringIndexingConfig::TokenizerType::PLAIN); property = type->add_properties(); property->set_property_name("additional"); @@ -2916,19 +2916,19 @@ TEST_F(IcingSearchEngineTest, RecoverFromInconsistentSchemaStore) { property->set_property_name("body"); property->set_data_type(PropertyConfigProto::DataType::STRING); property->set_cardinality(PropertyConfigProto::Cardinality::REQUIRED); - property->mutable_indexing_config()->set_term_match_type( + property->mutable_string_indexing_config()->set_term_match_type( TermMatchType::PREFIX); - property->mutable_indexing_config()->set_tokenizer_type( - IndexingConfig::TokenizerType::PLAIN); + property->mutable_string_indexing_config()->set_tokenizer_type( + StringIndexingConfig::TokenizerType::PLAIN); property = type->add_properties(); property->set_property_name("additional"); property->set_data_type(PropertyConfigProto::DataType::STRING); property->set_cardinality(PropertyConfigProto::Cardinality::OPTIONAL); - property->mutable_indexing_config()->set_term_match_type( + property->mutable_string_indexing_config()->set_term_match_type( TermMatchType::PREFIX); - property->mutable_indexing_config()->set_tokenizer_type( - IndexingConfig::TokenizerType::PLAIN); + property->mutable_string_indexing_config()->set_tokenizer_type( + StringIndexingConfig::TokenizerType::PLAIN); ICING_ASSERT_OK_AND_ASSIGN( std::unique_ptr<SchemaStore> schema_store, @@ -3705,9 +3705,10 @@ TEST_F(IcingSearchEngineTest, SetSchemaCanDetectPreviousSchemaWasLost) { body->set_property_name("body"); body->set_data_type(PropertyConfigProto::DataType::STRING); body->set_cardinality(PropertyConfigProto::Cardinality::OPTIONAL); - body->mutable_indexing_config()->set_term_match_type(TermMatchType::PREFIX); - body->mutable_indexing_config()->set_tokenizer_type( - IndexingConfig::TokenizerType::PLAIN); + body->mutable_string_indexing_config()->set_term_match_type( + TermMatchType::PREFIX); + body->mutable_string_indexing_config()->set_tokenizer_type( + StringIndexingConfig::TokenizerType::PLAIN); // Make an incompatible schema, a previously OPTIONAL field is REQUIRED SchemaProto incompatible_schema = schema; @@ -4218,10 +4219,10 @@ TEST_F(IcingSearchEngineTest, Hyphens) { prop->set_property_name("foo"); prop->set_data_type(PropertyConfigProto::DataType::STRING); prop->set_cardinality(PropertyConfigProto::Cardinality::REQUIRED); - prop->mutable_indexing_config()->set_term_match_type( + prop->mutable_string_indexing_config()->set_term_match_type( TermMatchType::EXACT_ONLY); - prop->mutable_indexing_config()->set_tokenizer_type( - IndexingConfig::TokenizerType::PLAIN); + prop->mutable_string_indexing_config()->set_tokenizer_type( + StringIndexingConfig::TokenizerType::PLAIN); ASSERT_THAT(icing.SetSchema(schema).status(), ProtoIsOk()); DocumentProto document_one = @@ -4281,7 +4282,7 @@ TEST_F(IcingSearchEngineTest, RestoreIndex) { EXPECT_THAT(icing.Put(document).status(), ProtoIsOk()); } - // 2. Delete the index file to trigger RestoreIndex. + // 2. Delete the index file to trigger RestoreIndexIfNeeded. std::string idx_subdir = GetIndexDir() + "/idx"; filesystem()->DeleteDirectoryRecursively(idx_subdir.c_str()); @@ -4474,7 +4475,7 @@ TEST_F(IcingSearchEngineTest, IndexingDocMergeFailureResets) { EXPECT_THAT(icing.Put(document).status(), ProtoIsOk()); } - // 2. Delete the index file to trigger RestoreIndex. + // 2. Delete the index file to trigger RestoreIndexIfNeeded. std::string idx_subdir = GetIndexDir() + "/idx"; filesystem()->DeleteDirectoryRecursively(idx_subdir.c_str()); @@ -4522,6 +4523,691 @@ TEST_F(IcingSearchEngineTest, IndexingDocMergeFailureResets) { } } +TEST_F(IcingSearchEngineTest, InitializeShouldLogFunctionLatency) { + IcingSearchEngine icing(GetDefaultIcingOptions(), GetTestJniCache()); + InitializeResultProto initialize_result_proto = icing.Initialize(); + EXPECT_THAT(initialize_result_proto.status(), ProtoIsOk()); + EXPECT_THAT(initialize_result_proto.native_initialize_stats().latency_ms(), + Gt(0)); +} + +TEST_F(IcingSearchEngineTest, InitializeShouldLogNumberOfDocuments) { + DocumentProto document1 = DocumentBuilder() + .SetKey("icing", "fake_type/1") + .SetSchema("Message") + .AddStringProperty("body", "message body") + .Build(); + DocumentProto document2 = DocumentBuilder() + .SetKey("icing", "fake_type/2") + .SetSchema("Message") + .AddStringProperty("body", "message body") + .Build(); + + { + // Initialize and put a document. + IcingSearchEngine icing(GetDefaultIcingOptions(), GetTestJniCache()); + InitializeResultProto initialize_result_proto = icing.Initialize(); + EXPECT_THAT(initialize_result_proto.status(), ProtoIsOk()); + EXPECT_THAT( + initialize_result_proto.native_initialize_stats().num_documents(), + Eq(0)); + + ASSERT_THAT(icing.SetSchema(CreateMessageSchema()).status(), ProtoIsOk()); + ASSERT_THAT(icing.Put(document1).status(), ProtoIsOk()); + } + + { + IcingSearchEngine icing(GetDefaultIcingOptions(), GetTestJniCache()); + InitializeResultProto initialize_result_proto = icing.Initialize(); + EXPECT_THAT(initialize_result_proto.status(), ProtoIsOk()); + EXPECT_THAT( + initialize_result_proto.native_initialize_stats().num_documents(), + Eq(1)); + + // Put another document. + ASSERT_THAT(icing.Put(document2).status(), ProtoIsOk()); + } + + { + IcingSearchEngine icing(GetDefaultIcingOptions(), GetTestJniCache()); + InitializeResultProto initialize_result_proto = icing.Initialize(); + EXPECT_THAT(initialize_result_proto.status(), ProtoIsOk()); + EXPECT_THAT( + initialize_result_proto.native_initialize_stats().num_documents(), + Eq(2)); + } +} + +TEST_F(IcingSearchEngineTest, + InitializeShouldNotLogRecoveryCauseForFirstTimeInitialize) { + IcingSearchEngine icing(GetDefaultIcingOptions(), GetTestJniCache()); + InitializeResultProto initialize_result_proto = icing.Initialize(); + EXPECT_THAT(initialize_result_proto.status(), ProtoIsOk()); + EXPECT_THAT(initialize_result_proto.native_initialize_stats() + .document_store_recovery_cause(), + Eq(NativeInitializeStats::NONE)); + EXPECT_THAT(initialize_result_proto.native_initialize_stats() + .document_store_recovery_latency_ms(), + Eq(0)); + EXPECT_THAT(initialize_result_proto.native_initialize_stats() + .document_store_data_status(), + Eq(NativeInitializeStats::NO_DATA_LOSS)); + EXPECT_THAT(initialize_result_proto.native_initialize_stats() + .index_restoration_cause(), + Eq(NativeInitializeStats::NONE)); + EXPECT_THAT(initialize_result_proto.native_initialize_stats() + .index_restoration_latency_ms(), + Eq(0)); + EXPECT_THAT(initialize_result_proto.native_initialize_stats() + .schema_store_recovery_cause(), + Eq(NativeInitializeStats::NONE)); + EXPECT_THAT(initialize_result_proto.native_initialize_stats() + .schema_store_recovery_latency_ms(), + Eq(0)); +} + +TEST_F(IcingSearchEngineTest, InitializeShouldLogRecoveryCausePartialDataLoss) { + DocumentProto document = DocumentBuilder() + .SetKey("icing", "fake_type/0") + .SetSchema("Message") + .AddStringProperty("body", "message body") + .Build(); + + { + // Initialize and put a document. + IcingSearchEngine icing(GetDefaultIcingOptions(), GetTestJniCache()); + ASSERT_THAT(icing.Initialize().status(), ProtoIsOk()); + ASSERT_THAT(icing.SetSchema(CreateMessageSchema()).status(), ProtoIsOk()); + EXPECT_THAT(icing.Put(document).status(), ProtoIsOk()); + } + + { + // Append a non-checksummed document. This will mess up the checksum of the + // proto log, forcing it to rewind and later return a DATA_LOSS error. + const std::string serialized_document = document.SerializeAsString(); + const std::string document_log_file = + absl_ports::StrCat(GetDocumentDir(), "/document_log"); + + int64_t file_size = filesystem()->GetFileSize(document_log_file.c_str()); + filesystem()->PWrite(document_log_file.c_str(), file_size, + serialized_document.data(), + serialized_document.size()); + } + + { + // Document store will rewind to previous checkpoint. The cause should be + // DATA_LOSS and the data status should be PARTIAL_LOSS. + IcingSearchEngine icing(GetDefaultIcingOptions(), GetTestJniCache()); + InitializeResultProto initialize_result_proto = icing.Initialize(); + EXPECT_THAT(initialize_result_proto.status(), ProtoIsOk()); + EXPECT_THAT(initialize_result_proto.native_initialize_stats() + .document_store_recovery_cause(), + Eq(NativeInitializeStats::DATA_LOSS)); + EXPECT_THAT(initialize_result_proto.native_initialize_stats() + .document_store_recovery_latency_ms(), + Gt(0)); + EXPECT_THAT(initialize_result_proto.native_initialize_stats() + .document_store_data_status(), + Eq(NativeInitializeStats::PARTIAL_LOSS)); + EXPECT_THAT(initialize_result_proto.native_initialize_stats() + .index_restoration_cause(), + Eq(NativeInitializeStats::NONE)); + EXPECT_THAT(initialize_result_proto.native_initialize_stats() + .index_restoration_latency_ms(), + Eq(0)); + EXPECT_THAT(initialize_result_proto.native_initialize_stats() + .schema_store_recovery_cause(), + Eq(NativeInitializeStats::NONE)); + EXPECT_THAT(initialize_result_proto.native_initialize_stats() + .schema_store_recovery_latency_ms(), + Eq(0)); + } +} + +TEST_F(IcingSearchEngineTest, + InitializeShouldLogRecoveryCauseCompleteDataLoss) { + DocumentProto document1 = DocumentBuilder() + .SetKey("icing", "fake_type/1") + .SetSchema("Message") + .AddStringProperty("body", kIpsumText) + .Build(); + DocumentProto document2 = DocumentBuilder() + .SetKey("icing", "fake_type/2") + .SetSchema("Message") + .AddStringProperty("body", kIpsumText) + .Build(); + + { + // Initialize and put a document. + IcingSearchEngine icing(GetDefaultIcingOptions(), GetTestJniCache()); + ASSERT_THAT(icing.Initialize().status(), ProtoIsOk()); + ASSERT_THAT(icing.SetSchema(CreateMessageSchema()).status(), ProtoIsOk()); + EXPECT_THAT(icing.Put(document1).status(), ProtoIsOk()); + EXPECT_THAT(icing.Put(document2).status(), ProtoIsOk()); + } + + { + // Modify the document log checksum to trigger a complete document log + // rewind. + const std::string document_log_file = + absl_ports::StrCat(GetDocumentDir(), "/document_log"); + + FileBackedProtoLog<DocumentWrapper>::Header document_log_header; + filesystem()->PRead(document_log_file.c_str(), &document_log_header, + sizeof(FileBackedProtoLog<DocumentWrapper>::Header), + /*offset=*/0); + // Set a garbage checksum. + document_log_header.log_checksum = 10; + document_log_header.header_checksum = + document_log_header.CalculateHeaderChecksum(); + filesystem()->PWrite(document_log_file.c_str(), /*offset=*/0, + &document_log_header, + sizeof(FileBackedProtoLog<DocumentWrapper>::Header)); + } + + { + // Document store will completely rewind. The cause should be DATA_LOSS and + // the data status should be COMPLETE_LOSS. + IcingSearchEngine icing(GetDefaultIcingOptions(), GetTestJniCache()); + InitializeResultProto initialize_result_proto = icing.Initialize(); + EXPECT_THAT(initialize_result_proto.status(), ProtoIsOk()); + EXPECT_THAT(initialize_result_proto.native_initialize_stats() + .document_store_recovery_cause(), + Eq(NativeInitializeStats::DATA_LOSS)); + EXPECT_THAT(initialize_result_proto.native_initialize_stats() + .document_store_recovery_latency_ms(), + Gt(0)); + EXPECT_THAT(initialize_result_proto.native_initialize_stats() + .document_store_data_status(), + Eq(NativeInitializeStats::COMPLETE_LOSS)); + // The complete rewind of ground truth causes the mismatch of total + // checksum, so index should be restored. + EXPECT_THAT(initialize_result_proto.native_initialize_stats() + .index_restoration_cause(), + Eq(NativeInitializeStats::TOTAL_CHECKSUM_MISMATCH)); + // Here we don't check index_restoration_latency_ms because the index + // restoration is super fast when document store is emtpy. We won't get a + // latency that is greater than 1 ms. + EXPECT_THAT(initialize_result_proto.native_initialize_stats() + .schema_store_recovery_cause(), + Eq(NativeInitializeStats::NONE)); + EXPECT_THAT(initialize_result_proto.native_initialize_stats() + .schema_store_recovery_latency_ms(), + Eq(0)); + } +} + +TEST_F(IcingSearchEngineTest, + InitializeShouldLogRecoveryCauseInconsistentWithGroundTruth) { + DocumentProto document = DocumentBuilder() + .SetKey("icing", "fake_type/0") + .SetSchema("Message") + .AddStringProperty("body", "message body") + .Build(); + { + // Initialize and put a document. + IcingSearchEngine icing(GetDefaultIcingOptions(), GetTestJniCache()); + ASSERT_THAT(icing.Initialize().status(), ProtoIsOk()); + ASSERT_THAT(icing.SetSchema(CreateMessageSchema()).status(), ProtoIsOk()); + EXPECT_THAT(icing.Put(document).status(), ProtoIsOk()); + } + + { + // Delete the index file to trigger RestoreIndexIfNeeded. + std::string idx_subdir = GetIndexDir() + "/idx"; + filesystem()->DeleteDirectoryRecursively(idx_subdir.c_str()); + } + + { + // Index is empty but ground truth is not. Index should be restored due to + // the inconsistency. + IcingSearchEngine icing(GetDefaultIcingOptions(), GetTestJniCache()); + InitializeResultProto initialize_result_proto = icing.Initialize(); + EXPECT_THAT(initialize_result_proto.status(), ProtoIsOk()); + EXPECT_THAT(initialize_result_proto.native_initialize_stats() + .index_restoration_cause(), + Eq(NativeInitializeStats::INCONSISTENT_WITH_GROUND_TRUTH)); + EXPECT_THAT(initialize_result_proto.native_initialize_stats() + .index_restoration_latency_ms(), + Gt(0)); + EXPECT_THAT(initialize_result_proto.native_initialize_stats() + .document_store_recovery_cause(), + Eq(NativeInitializeStats::NONE)); + EXPECT_THAT(initialize_result_proto.native_initialize_stats() + .document_store_recovery_latency_ms(), + Eq(0)); + EXPECT_THAT(initialize_result_proto.native_initialize_stats() + .document_store_data_status(), + Eq(NativeInitializeStats::NO_DATA_LOSS)); + EXPECT_THAT(initialize_result_proto.native_initialize_stats() + .schema_store_recovery_cause(), + Eq(NativeInitializeStats::NONE)); + EXPECT_THAT(initialize_result_proto.native_initialize_stats() + .schema_store_recovery_latency_ms(), + Eq(0)); + } +} + +TEST_F(IcingSearchEngineTest, + InitializeShouldLogRecoveryCauseTotalChecksumMismatch) { + { + // Initialize and index some documents. + IcingSearchEngine icing(GetDefaultIcingOptions(), GetTestJniCache()); + ASSERT_THAT(icing.Initialize().status(), ProtoIsOk()); + ASSERT_THAT(icing.SetSchema(CreateMessageSchema()).status(), ProtoIsOk()); + + // We need to index enough documents to make + // DocumentStore::UpdateSchemaStore() run longer than 1 ms. + for (int i = 0; i < 50; ++i) { + DocumentProto document = + DocumentBuilder() + .SetKey("icing", "fake_type/" + std::to_string(i)) + .SetSchema("Message") + .AddStringProperty("body", "message body") + .Build(); + ASSERT_THAT(icing.Put(document).status(), ProtoIsOk()); + } + } + + { + // Change the header's checksum value to a random value. + uint32_t invalid_checksum = 1; + filesystem()->PWrite(GetHeaderFilename().c_str(), + offsetof(IcingSearchEngine::Header, checksum), + &invalid_checksum, sizeof(invalid_checksum)); + } + + { + // Both document store and index should be recovered from checksum mismatch. + IcingSearchEngine icing(GetDefaultIcingOptions(), GetTestJniCache()); + InitializeResultProto initialize_result_proto = icing.Initialize(); + EXPECT_THAT(initialize_result_proto.status(), ProtoIsOk()); + EXPECT_THAT(initialize_result_proto.native_initialize_stats() + .index_restoration_cause(), + Eq(NativeInitializeStats::TOTAL_CHECKSUM_MISMATCH)); + EXPECT_THAT(initialize_result_proto.native_initialize_stats() + .index_restoration_latency_ms(), + Gt(0)); + EXPECT_THAT(initialize_result_proto.native_initialize_stats() + .document_store_recovery_cause(), + Eq(NativeInitializeStats::TOTAL_CHECKSUM_MISMATCH)); + EXPECT_THAT(initialize_result_proto.native_initialize_stats() + .document_store_recovery_latency_ms(), + Gt(0)); + EXPECT_THAT(initialize_result_proto.native_initialize_stats() + .document_store_data_status(), + Eq(NativeInitializeStats::NO_DATA_LOSS)); + EXPECT_THAT(initialize_result_proto.native_initialize_stats() + .schema_store_recovery_cause(), + Eq(NativeInitializeStats::NONE)); + EXPECT_THAT(initialize_result_proto.native_initialize_stats() + .schema_store_recovery_latency_ms(), + Eq(0)); + } +} + +TEST_F(IcingSearchEngineTest, InitializeShouldLogRecoveryCauseIndexIOError) { + { + // Initialize and index some documents. + IcingSearchEngine icing(GetDefaultIcingOptions(), GetTestJniCache()); + ASSERT_THAT(icing.Initialize().status(), ProtoIsOk()); + ASSERT_THAT(icing.SetSchema(CreateMessageSchema()).status(), ProtoIsOk()); + + // We need to index enough documents to make RestoreIndexIfNeeded() run + // longer than 1 ms. + for (int i = 0; i < 50; ++i) { + DocumentProto document = + DocumentBuilder() + .SetKey("icing", "fake_type/" + std::to_string(i)) + .SetSchema("Message") + .AddStringProperty("body", "message body") + .Build(); + ASSERT_THAT(icing.Put(document).status(), ProtoIsOk()); + } + } + + // lambda to fail OpenForWrite on lite index hit buffer once. + bool has_failed_already = false; + auto open_write_lambda = [this, &has_failed_already](const char* filename) { + std::string lite_index_buffer_file_path = + absl_ports::StrCat(GetIndexDir(), "/idx/lite.hb"); + std::string filename_string(filename); + if (!has_failed_already && filename_string == lite_index_buffer_file_path) { + has_failed_already = true; + return -1; + } + return this->filesystem()->OpenForWrite(filename); + }; + + auto mock_icing_filesystem = std::make_unique<IcingMockFilesystem>(); + // This fails Index::Create() once. + ON_CALL(*mock_icing_filesystem, OpenForWrite) + .WillByDefault(open_write_lambda); + + TestIcingSearchEngine icing(GetDefaultIcingOptions(), + std::make_unique<Filesystem>(), + std::move(mock_icing_filesystem), + std::make_unique<FakeClock>(), GetTestJniCache()); + + InitializeResultProto initialize_result_proto = icing.Initialize(); + EXPECT_THAT(initialize_result_proto.status(), ProtoIsOk()); + EXPECT_THAT(initialize_result_proto.native_initialize_stats() + .index_restoration_cause(), + Eq(NativeInitializeStats::IO_ERROR)); + EXPECT_THAT(initialize_result_proto.native_initialize_stats() + .index_restoration_latency_ms(), + Gt(0)); + EXPECT_THAT(initialize_result_proto.native_initialize_stats() + .document_store_recovery_cause(), + Eq(NativeInitializeStats::NONE)); + EXPECT_THAT(initialize_result_proto.native_initialize_stats() + .document_store_recovery_latency_ms(), + Eq(0)); + EXPECT_THAT(initialize_result_proto.native_initialize_stats() + .document_store_data_status(), + Eq(NativeInitializeStats::NO_DATA_LOSS)); + EXPECT_THAT(initialize_result_proto.native_initialize_stats() + .schema_store_recovery_cause(), + Eq(NativeInitializeStats::NONE)); + EXPECT_THAT(initialize_result_proto.native_initialize_stats() + .schema_store_recovery_latency_ms(), + Eq(0)); +} + +TEST_F(IcingSearchEngineTest, InitializeShouldLogRecoveryCauseDocStoreIOError) { + { + // Initialize and index some documents. + IcingSearchEngine icing(GetDefaultIcingOptions(), GetTestJniCache()); + ASSERT_THAT(icing.Initialize().status(), ProtoIsOk()); + ASSERT_THAT(icing.SetSchema(CreateMessageSchema()).status(), ProtoIsOk()); + + // We need to index enough documents to make RestoreIndexIfNeeded() run + // longer than 1 ms. + for (int i = 0; i < 50; ++i) { + DocumentProto document = + DocumentBuilder() + .SetKey("icing", "fake_type/" + std::to_string(i)) + .SetSchema("Message") + .AddStringProperty("body", "message body") + .Build(); + ASSERT_THAT(icing.Put(document).status(), ProtoIsOk()); + } + } + + // lambda to fail Read on document store header once. + bool has_failed_already = false; + auto read_lambda = [this, &has_failed_already](const char* filename, + void* buf, size_t buf_size) { + std::string document_store_header_file_path = + absl_ports::StrCat(GetDocumentDir(), "/document_store_header"); + std::string filename_string(filename); + if (!has_failed_already && + filename_string == document_store_header_file_path) { + has_failed_already = true; + return false; + } + return this->filesystem()->Read(filename, buf, buf_size); + }; + + auto mock_filesystem = std::make_unique<MockFilesystem>(); + // This fails DocumentStore::InitializeDerivedFiles() once. + ON_CALL(*mock_filesystem, Read(A<const char*>(), _, _)) + .WillByDefault(read_lambda); + + TestIcingSearchEngine icing(GetDefaultIcingOptions(), + std::move(mock_filesystem), + std::make_unique<IcingFilesystem>(), + std::make_unique<FakeClock>(), GetTestJniCache()); + + InitializeResultProto initialize_result_proto = icing.Initialize(); + EXPECT_THAT(initialize_result_proto.status(), ProtoIsOk()); + EXPECT_THAT(initialize_result_proto.native_initialize_stats() + .document_store_recovery_cause(), + Eq(NativeInitializeStats::IO_ERROR)); + EXPECT_THAT(initialize_result_proto.native_initialize_stats() + .document_store_recovery_latency_ms(), + Gt(0)); + EXPECT_THAT(initialize_result_proto.native_initialize_stats() + .document_store_data_status(), + Eq(NativeInitializeStats::NO_DATA_LOSS)); + EXPECT_THAT(initialize_result_proto.native_initialize_stats() + .index_restoration_cause(), + Eq(NativeInitializeStats::NONE)); + EXPECT_THAT(initialize_result_proto.native_initialize_stats() + .index_restoration_latency_ms(), + Eq(0)); + EXPECT_THAT(initialize_result_proto.native_initialize_stats() + .schema_store_recovery_cause(), + Eq(NativeInitializeStats::NONE)); + EXPECT_THAT(initialize_result_proto.native_initialize_stats() + .schema_store_recovery_latency_ms(), + Eq(0)); +} + +TEST_F(IcingSearchEngineTest, + InitializeShouldLogRecoveryCauseSchemaStoreIOError) { + { + IcingSearchEngine icing(GetDefaultIcingOptions(), GetTestJniCache()); + ASSERT_THAT(icing.Initialize().status(), ProtoIsOk()); + ASSERT_THAT(icing.SetSchema(CreateMessageSchema()).status(), ProtoIsOk()); + } + + { + // Delete the schema store header file to trigger an I/O error. + std::string schema_store_header_file_path = + GetSchemaDir() + "/schema_store_header"; + filesystem()->DeleteFile(schema_store_header_file_path.c_str()); + } + + { + IcingSearchEngine icing(GetDefaultIcingOptions(), GetTestJniCache()); + InitializeResultProto initialize_result_proto = icing.Initialize(); + EXPECT_THAT(initialize_result_proto.status(), ProtoIsOk()); + EXPECT_THAT(initialize_result_proto.native_initialize_stats() + .schema_store_recovery_cause(), + Eq(NativeInitializeStats::IO_ERROR)); + EXPECT_THAT(initialize_result_proto.native_initialize_stats() + .schema_store_recovery_latency_ms(), + Gt(0)); + EXPECT_THAT(initialize_result_proto.native_initialize_stats() + .document_store_recovery_cause(), + Eq(NativeInitializeStats::NONE)); + EXPECT_THAT(initialize_result_proto.native_initialize_stats() + .document_store_recovery_latency_ms(), + Eq(0)); + EXPECT_THAT(initialize_result_proto.native_initialize_stats() + .document_store_data_status(), + Eq(NativeInitializeStats::NO_DATA_LOSS)); + EXPECT_THAT(initialize_result_proto.native_initialize_stats() + .index_restoration_cause(), + Eq(NativeInitializeStats::NONE)); + EXPECT_THAT(initialize_result_proto.native_initialize_stats() + .index_restoration_latency_ms(), + Eq(0)); + } +} + +TEST_F(IcingSearchEngineTest, InitializeShouldLogNumberOfSchemaTypes) { + { + // Initialize an empty storage. + IcingSearchEngine icing(GetDefaultIcingOptions(), GetTestJniCache()); + InitializeResultProto initialize_result_proto = icing.Initialize(); + EXPECT_THAT(initialize_result_proto.status(), ProtoIsOk()); + // There should be 0 schema types. + EXPECT_THAT( + initialize_result_proto.native_initialize_stats().num_schema_types(), + Eq(0)); + + // Set a schema with one type config. + ASSERT_THAT(icing.SetSchema(CreateMessageSchema()).status(), ProtoIsOk()); + } + + { + IcingSearchEngine icing(GetDefaultIcingOptions(), GetTestJniCache()); + InitializeResultProto initialize_result_proto = icing.Initialize(); + EXPECT_THAT(initialize_result_proto.status(), ProtoIsOk()); + // There should be 1 schema type. + EXPECT_THAT( + initialize_result_proto.native_initialize_stats().num_schema_types(), + Eq(1)); + + // Create and set a schema with two type configs: Email and Message. + SchemaProto schema = CreateEmailSchema(); + + auto type = schema.add_types(); + type->set_schema_type("Message"); + auto body = type->add_properties(); + body->set_property_name("body"); + body->set_data_type(PropertyConfigProto::DataType::STRING); + body->set_cardinality(PropertyConfigProto::Cardinality::REQUIRED); + body->mutable_string_indexing_config()->set_term_match_type( + TermMatchType::PREFIX); + body->mutable_string_indexing_config()->set_tokenizer_type( + StringIndexingConfig::TokenizerType::PLAIN); + + ASSERT_THAT(icing.SetSchema(schema).status(), ProtoIsOk()); + } + + { + IcingSearchEngine icing(GetDefaultIcingOptions(), GetTestJniCache()); + InitializeResultProto initialize_result_proto = icing.Initialize(); + EXPECT_THAT(initialize_result_proto.status(), ProtoIsOk()); + EXPECT_THAT( + initialize_result_proto.native_initialize_stats().num_schema_types(), + Eq(2)); + } +} + +TEST_F(IcingSearchEngineTest, PutDocumentShouldLogFunctionLatency) { + DocumentProto document = DocumentBuilder() + .SetKey("icing", "fake_type/0") + .SetSchema("Message") + .AddStringProperty("body", "message body") + .Build(); + + IcingSearchEngine icing(GetDefaultIcingOptions(), GetTestJniCache()); + ASSERT_THAT(icing.Initialize().status(), ProtoIsOk()); + ASSERT_THAT(icing.SetSchema(CreateMessageSchema()).status(), ProtoIsOk()); + + PutResultProto put_result_proto = icing.Put(document); + EXPECT_THAT(put_result_proto.status(), ProtoIsOk()); + EXPECT_THAT(put_result_proto.native_put_document_stats().latency_ms(), Gt(0)); +} + +TEST_F(IcingSearchEngineTest, PutDocumentShouldLogDocumentStoreStats) { + // Create a large enough document so that document_store_latency_ms can be + // longer than 1 ms. + std::default_random_engine random; + std::string random_string_10000 = + RandomString(kAlNumAlphabet, /*len=*/10000, &random); + DocumentProto document = DocumentBuilder() + .SetKey("icing", "fake_type/0") + .SetSchema("Message") + .AddStringProperty("body", random_string_10000) + .Build(); + + IcingSearchEngine icing(GetDefaultIcingOptions(), GetTestJniCache()); + ASSERT_THAT(icing.Initialize().status(), ProtoIsOk()); + ASSERT_THAT(icing.SetSchema(CreateMessageSchema()).status(), ProtoIsOk()); + + PutResultProto put_result_proto = icing.Put(document); + EXPECT_THAT(put_result_proto.status(), ProtoIsOk()); + EXPECT_THAT( + put_result_proto.native_put_document_stats().document_store_latency_ms(), + Gt(0)); + EXPECT_THAT(put_result_proto.native_put_document_stats().document_size(), + Eq(document.ByteSizeLong())); +} + +TEST_F(IcingSearchEngineTest, PutDocumentShouldLogIndexingStats) { + // Create a large enough document so that index_latency_ms can be longer than + // 1 ms. + DocumentProto document = DocumentBuilder() + .SetKey("icing", "fake_type/0") + .SetSchema("Message") + .AddStringProperty("body", kIpsumText) + .Build(); + + IcingSearchEngine icing(GetDefaultIcingOptions(), GetTestJniCache()); + ASSERT_THAT(icing.Initialize().status(), ProtoIsOk()); + ASSERT_THAT(icing.SetSchema(CreateMessageSchema()).status(), ProtoIsOk()); + + PutResultProto put_result_proto = icing.Put(document); + EXPECT_THAT(put_result_proto.status(), ProtoIsOk()); + EXPECT_THAT(put_result_proto.native_put_document_stats().index_latency_ms(), + Gt(0)); + // No merge should happen. + EXPECT_THAT( + put_result_proto.native_put_document_stats().index_merge_latency_ms(), + Eq(0)); + // Number of tokens should not exceed. + EXPECT_FALSE(put_result_proto.native_put_document_stats() + .tokenization_stats() + .exceeded_max_token_num()); + // kIpsumText has 137 tokens. + EXPECT_THAT(put_result_proto.native_put_document_stats() + .tokenization_stats() + .num_tokens_indexed(), + Eq(137)); +} + +TEST_F(IcingSearchEngineTest, PutDocumentShouldLogWhetherNumTokensExceeds) { + // Create a document with 2 tokens. + DocumentProto document = DocumentBuilder() + .SetKey("icing", "fake_type/0") + .SetSchema("Message") + .AddStringProperty("body", "message body") + .Build(); + + // Create an icing instance with max_tokens_per_doc = 1. + IcingSearchEngineOptions icing_options = GetDefaultIcingOptions(); + icing_options.set_max_tokens_per_doc(1); + IcingSearchEngine icing(icing_options, GetTestJniCache()); + ASSERT_THAT(icing.Initialize().status(), ProtoIsOk()); + ASSERT_THAT(icing.SetSchema(CreateMessageSchema()).status(), ProtoIsOk()); + + PutResultProto put_result_proto = icing.Put(document); + EXPECT_THAT(put_result_proto.status(), ProtoIsOk()); + // Number of tokens(2) exceeds the max allowed value(1). + EXPECT_TRUE(put_result_proto.native_put_document_stats() + .tokenization_stats() + .exceeded_max_token_num()); + EXPECT_THAT(put_result_proto.native_put_document_stats() + .tokenization_stats() + .num_tokens_indexed(), + Eq(1)); +} + +TEST_F(IcingSearchEngineTest, PutDocumentShouldLogIndexMergeLatency) { + // Create 2 large enough documents so that index_merge_latency_ms can be + // longer than 1 ms. + DocumentProto document1 = DocumentBuilder() + .SetKey("icing", "fake_type/1") + .SetSchema("Message") + .AddStringProperty("body", kIpsumText) + .Build(); + DocumentProto document2 = DocumentBuilder() + .SetKey("icing", "fake_type/2") + .SetSchema("Message") + .AddStringProperty("body", kIpsumText) + .Build(); + + // Create an icing instance with index_merge_size = document1's size. + IcingSearchEngineOptions icing_options = GetDefaultIcingOptions(); + icing_options.set_index_merge_size(document1.ByteSizeLong()); + IcingSearchEngine icing(icing_options, GetTestJniCache()); + ASSERT_THAT(icing.Initialize().status(), ProtoIsOk()); + ASSERT_THAT(icing.SetSchema(CreateMessageSchema()).status(), ProtoIsOk()); + EXPECT_THAT(icing.Put(document1).status(), ProtoIsOk()); + + // Putting document2 should trigger an index merge. + PutResultProto put_result_proto = icing.Put(document2); + EXPECT_THAT(put_result_proto.status(), ProtoIsOk()); + EXPECT_THAT( + put_result_proto.native_put_document_stats().index_merge_latency_ms(), + Gt(0)); +} + } // namespace } // namespace lib } // namespace icing diff --git a/icing/index/index-processor.cc b/icing/index/index-processor.cc index 47111ad..9e57993 100644 --- a/icing/index/index-processor.cc +++ b/icing/index/index-processor.cc @@ -37,6 +37,7 @@ #include "icing/tokenization/tokenizer.h" #include "icing/transform/normalizer.h" #include "icing/util/status-macros.h" +#include "icing/util/timer.h" namespace icing { namespace lib { @@ -56,7 +57,10 @@ IndexProcessor::Create(const SchemaStore* schema_store, } libtextclassifier3::Status IndexProcessor::IndexDocument( - const DocumentProto& document, DocumentId document_id) { + const DocumentProto& document, DocumentId document_id, + NativePutDocumentStats* put_document_stats) { + Timer index_timer; + if (index_->last_added_document_id() != kInvalidDocumentId && document_id <= index_->last_added_document_id()) { return absl_ports::InvalidArgumentError(IcingStringUtil::StringPrintf( @@ -80,6 +84,12 @@ libtextclassifier3::Status IndexProcessor::IndexDocument( tokenizer->Tokenize(subcontent)); while (itr->Advance()) { if (++num_tokens > options_.max_tokens_per_document) { + if (put_document_stats != nullptr) { + put_document_stats->mutable_tokenization_stats() + ->set_exceeded_max_token_num(true); + put_document_stats->mutable_tokenization_stats() + ->set_num_tokens_indexed(options_.max_tokens_per_document); + } switch (options_.token_limit_behavior) { case Options::TokenLimitBehavior::kReturnError: return absl_ports::ResourceExhaustedError( @@ -106,10 +116,20 @@ libtextclassifier3::Status IndexProcessor::IndexDocument( } } + if (put_document_stats != nullptr) { + put_document_stats->set_index_latency_ms( + index_timer.GetElapsedMilliseconds()); + put_document_stats->mutable_tokenization_stats()->set_num_tokens_indexed( + num_tokens); + } + // Merge if necessary. if (overall_status.ok() && index_->WantsMerge()) { ICING_VLOG(1) << "Merging the index at docid " << document_id << "."; + + Timer merge_timer; libtextclassifier3::Status merge_status = index_->Merge(); + if (!merge_status.ok()) { ICING_LOG(ERROR) << "Index merging failed. Clearing index."; if (!index_->Reset().ok()) { @@ -123,6 +143,11 @@ libtextclassifier3::Status IndexProcessor::IndexDocument( merge_status.error_code(), merge_status.error_message().c_str())); } } + + if (put_document_stats != nullptr) { + put_document_stats->set_index_merge_latency_ms( + merge_timer.GetElapsedMilliseconds()); + } } return overall_status; diff --git a/icing/index/index-processor.h b/icing/index/index-processor.h index 083efea..91719d0 100644 --- a/icing/index/index-processor.h +++ b/icing/index/index-processor.h @@ -69,6 +69,9 @@ class IndexProcessor { // Indexing a document *may* trigger an index merge. If a merge fails, then // all content in the index will be lost. // + // If put_document_stats is present, the fields related to indexing will be + // populated. + // // Returns: // INVALID_ARGUMENT if document_id is less than the document_id of a // previously indexed document or tokenization fails. @@ -77,8 +80,9 @@ class IndexProcessor { // cleared as a result. // NOT_FOUND if there is no definition for the document's schema type. // INTERNAL_ERROR if any other errors occur - libtextclassifier3::Status IndexDocument(const DocumentProto& document, - DocumentId document_id); + libtextclassifier3::Status IndexDocument( + const DocumentProto& document, DocumentId document_id, + NativePutDocumentStats* put_document_stats = nullptr); private: IndexProcessor(const SchemaStore* schema_store, diff --git a/icing/index/index-processor_benchmark.cc b/icing/index/index-processor_benchmark.cc index a9b298e..584cb9b 100644 --- a/icing/index/index-processor_benchmark.cc +++ b/icing/index/index-processor_benchmark.cc @@ -76,10 +76,10 @@ void CreateFakeTypeConfig(SchemaTypeConfigProto* type_config) { IcingStringUtil::StringPrintf("p%d", i)); // p0 - p9 property->set_data_type(PropertyConfigProto::DataType::STRING); property->set_cardinality(PropertyConfigProto::Cardinality::REQUIRED); - property->mutable_indexing_config()->set_term_match_type( + property->mutable_string_indexing_config()->set_term_match_type( TermMatchType::EXACT_ONLY); - property->mutable_indexing_config()->set_tokenizer_type( - IndexingConfig::TokenizerType::PLAIN); + property->mutable_string_indexing_config()->set_tokenizer_type( + StringIndexingConfig::TokenizerType::PLAIN); } } diff --git a/icing/index/index-processor_test.cc b/icing/index/index-processor_test.cc index 84c822b..e193842 100644 --- a/icing/index/index-processor_test.cc +++ b/icing/index/index-processor_test.cc @@ -166,9 +166,10 @@ class IndexProcessorTest : public Test { prop->set_property_name(std::string(name)); prop->set_data_type(type); prop->set_cardinality(cardinality); - prop->mutable_indexing_config()->set_term_match_type(term_match_type); - prop->mutable_indexing_config()->set_tokenizer_type( - IndexingConfig::TokenizerType::PLAIN); + prop->mutable_string_indexing_config()->set_term_match_type( + term_match_type); + prop->mutable_string_indexing_config()->set_tokenizer_type( + StringIndexingConfig::TokenizerType::PLAIN); } static void AddNonIndexedProperty(std::string_view name, DataType::Code type, @@ -210,6 +211,7 @@ class IndexProcessorTest : public Test { prop->set_data_type(DataType::DOCUMENT); prop->set_cardinality(Cardinality::OPTIONAL); prop->set_schema_type(std::string(kNestedType)); + prop->mutable_document_indexing_config()->set_index_nested_properties(true); // Add nested type type_config = schema.add_types(); diff --git a/icing/index/iterator/doc-hit-info-iterator-section-restrict_test.cc b/icing/index/iterator/doc-hit-info-iterator-section-restrict_test.cc index df79c6d..b29217c 100644 --- a/icing/index/iterator/doc-hit-info-iterator-section-restrict_test.cc +++ b/icing/index/iterator/doc-hit-info-iterator-section-restrict_test.cc @@ -64,10 +64,10 @@ class DocHitInfoIteratorSectionRestrictTest : public ::testing::Test { property->set_property_name(indexed_property_); property->set_data_type(PropertyConfigProto::DataType::STRING); property->set_cardinality(PropertyConfigProto::Cardinality::OPTIONAL); - property->mutable_indexing_config()->set_term_match_type( + property->mutable_string_indexing_config()->set_term_match_type( TermMatchType::EXACT_ONLY); - property->mutable_indexing_config()->set_tokenizer_type( - IndexingConfig::TokenizerType::PLAIN); + property->mutable_string_indexing_config()->set_tokenizer_type( + StringIndexingConfig::TokenizerType::PLAIN); // First and only indexed property, so it gets the first id of 0 indexed_section_id_ = 0; diff --git a/icing/index/main/doc-hit-info-iterator-term-main.cc b/icing/index/main/doc-hit-info-iterator-term-main.cc index b3696b7..a60764d 100644 --- a/icing/index/main/doc-hit-info-iterator-term-main.cc +++ b/icing/index/main/doc-hit-info-iterator-term-main.cc @@ -54,8 +54,12 @@ libtextclassifier3::Status DocHitInfoIteratorTermMain::Advance() { // next posting list in the chain. libtextclassifier3::Status status = RetrieveMoreHits(); if (!status.ok()) { - ICING_LOG(ERROR) << "Failed to retrieve more hits " - << status.error_message(); + if (!absl_ports::IsNotFound(status)) { + // NOT_FOUND is expected to happen (not every term will be in the main + // index!). Other errors are worth logging. + ICING_LOG(ERROR) << "Failed to retrieve more hits " + << status.error_message(); + } return absl_ports::ResourceExhaustedError( "No more DocHitInfos in iterator"); } diff --git a/icing/query/query-processor_test.cc b/icing/query/query-processor_test.cc index 2edc624..16bd120 100644 --- a/icing/query/query-processor_test.cc +++ b/icing/query/query-processor_test.cc @@ -70,10 +70,10 @@ void AddIndexedProperty(SchemaTypeConfigProto* type_config, std::string name) { property_config->set_property_name(name); property_config->set_data_type(PropertyConfigProto::DataType::STRING); property_config->set_cardinality(PropertyConfigProto::Cardinality::OPTIONAL); - property_config->mutable_indexing_config()->set_term_match_type( + property_config->mutable_string_indexing_config()->set_term_match_type( TermMatchType::EXACT_ONLY); - property_config->mutable_indexing_config()->set_tokenizer_type( - IndexingConfig::TokenizerType::PLAIN); + property_config->mutable_string_indexing_config()->set_tokenizer_type( + StringIndexingConfig::TokenizerType::PLAIN); } void AddUnindexedProperty(SchemaTypeConfigProto* type_config, diff --git a/icing/result/result-retriever_test.cc b/icing/result/result-retriever_test.cc index 0d2c2c5..1d1f824 100644 --- a/icing/result/result-retriever_test.cc +++ b/icing/result/result-retriever_test.cc @@ -77,18 +77,18 @@ class ResultRetrieverTest : public testing::Test { prop_config->set_property_name("subject"); prop_config->set_data_type(PropertyConfigProto::DataType::STRING); prop_config->set_cardinality(PropertyConfigProto::Cardinality::OPTIONAL); - prop_config->mutable_indexing_config()->set_term_match_type( + prop_config->mutable_string_indexing_config()->set_term_match_type( TermMatchType::PREFIX); - prop_config->mutable_indexing_config()->set_tokenizer_type( - IndexingConfig::TokenizerType::PLAIN); + prop_config->mutable_string_indexing_config()->set_tokenizer_type( + StringIndexingConfig::TokenizerType::PLAIN); prop_config = type_config->add_properties(); prop_config->set_property_name("body"); prop_config->set_data_type(PropertyConfigProto::DataType::STRING); prop_config->set_cardinality(PropertyConfigProto::Cardinality::OPTIONAL); - prop_config->mutable_indexing_config()->set_term_match_type( + prop_config->mutable_string_indexing_config()->set_term_match_type( TermMatchType::EXACT_ONLY); - prop_config->mutable_indexing_config()->set_tokenizer_type( - IndexingConfig::TokenizerType::PLAIN); + prop_config->mutable_string_indexing_config()->set_tokenizer_type( + StringIndexingConfig::TokenizerType::PLAIN); ASSERT_THAT(schema_store_->SetSchema(schema), IsOk()); } diff --git a/icing/result/snippet-retriever_test.cc b/icing/result/snippet-retriever_test.cc index 676ea92..e552cf2 100644 --- a/icing/result/snippet-retriever_test.cc +++ b/icing/result/snippet-retriever_test.cc @@ -76,18 +76,18 @@ class SnippetRetrieverTest : public testing::Test { prop_config->set_property_name("subject"); prop_config->set_data_type(PropertyConfigProto::DataType::STRING); prop_config->set_cardinality(PropertyConfigProto::Cardinality::OPTIONAL); - prop_config->mutable_indexing_config()->set_term_match_type( + prop_config->mutable_string_indexing_config()->set_term_match_type( TermMatchType::PREFIX); - prop_config->mutable_indexing_config()->set_tokenizer_type( - IndexingConfig::TokenizerType::PLAIN); + prop_config->mutable_string_indexing_config()->set_tokenizer_type( + StringIndexingConfig::TokenizerType::PLAIN); prop_config = type_config->add_properties(); prop_config->set_property_name("body"); prop_config->set_data_type(PropertyConfigProto::DataType::STRING); prop_config->set_cardinality(PropertyConfigProto::Cardinality::OPTIONAL); - prop_config->mutable_indexing_config()->set_term_match_type( + prop_config->mutable_string_indexing_config()->set_term_match_type( TermMatchType::EXACT_ONLY); - prop_config->mutable_indexing_config()->set_tokenizer_type( - IndexingConfig::TokenizerType::PLAIN); + prop_config->mutable_string_indexing_config()->set_tokenizer_type( + StringIndexingConfig::TokenizerType::PLAIN); ICING_ASSERT_OK(schema_store_->SetSchema(schema)); ICING_ASSERT_OK_AND_ASSIGN(normalizer_, normalizer_factory::Create( diff --git a/icing/schema/schema-store.cc b/icing/schema/schema-store.cc index 34ccf22..9173031 100644 --- a/icing/schema/schema-store.cc +++ b/icing/schema/schema-store.cc @@ -39,6 +39,7 @@ #include "icing/util/crc32.h" #include "icing/util/logging.h" #include "icing/util/status-macros.h" +#include "icing/util/timer.h" namespace icing { namespace lib { @@ -103,12 +104,13 @@ std::unordered_set<SchemaTypeId> SchemaTypeIdsChanged( } // namespace libtextclassifier3::StatusOr<std::unique_ptr<SchemaStore>> SchemaStore::Create( - const Filesystem* filesystem, const std::string& base_dir) { + const Filesystem* filesystem, const std::string& base_dir, + NativeInitializeStats* initialize_stats) { ICING_RETURN_ERROR_IF_NULL(filesystem); std::unique_ptr<SchemaStore> schema_store = std::unique_ptr<SchemaStore>(new SchemaStore(filesystem, base_dir)); - ICING_RETURN_IF_ERROR(schema_store->Initialize()); + ICING_RETURN_IF_ERROR(schema_store->Initialize(initialize_stats)); return schema_store; } @@ -125,7 +127,8 @@ SchemaStore::~SchemaStore() { } } -libtextclassifier3::Status SchemaStore::Initialize() { +libtextclassifier3::Status SchemaStore::Initialize( + NativeInitializeStats* initialize_stats) { auto schema_proto_or = GetSchema(); if (absl_ports::IsNotFound(schema_proto_or.status())) { // Don't have an existing schema proto, that's fine @@ -139,10 +142,22 @@ libtextclassifier3::Status SchemaStore::Initialize() { ICING_VLOG(3) << "Couldn't find derived files or failed to initialize them, " "regenerating derived files for SchemaStore."; + Timer regenerate_timer; + if (initialize_stats != nullptr) { + initialize_stats->set_schema_store_recovery_cause( + NativeInitializeStats::IO_ERROR); + } ICING_RETURN_IF_ERROR(RegenerateDerivedFiles()); + if (initialize_stats != nullptr) { + initialize_stats->set_schema_store_recovery_latency_ms( + regenerate_timer.GetElapsedMilliseconds()); + } } initialized_ = true; + if (initialize_stats != nullptr) { + initialize_stats->set_num_schema_types(type_config_map_.size()); + } return libtextclassifier3::Status::OK; } diff --git a/icing/schema/schema-store.h b/icing/schema/schema-store.h index f5c6588..76f36b4 100644 --- a/icing/schema/schema-store.h +++ b/icing/schema/schema-store.h @@ -27,6 +27,7 @@ #include "icing/file/file-backed-proto.h" #include "icing/file/filesystem.h" #include "icing/proto/document.pb.h" +#include "icing/proto/logging.pb.h" #include "icing/proto/schema.pb.h" #include "icing/schema/schema-util.h" #include "icing/schema/section-manager.h" @@ -104,12 +105,16 @@ class SchemaStore { // outlive the created SchemaStore instance. The base_dir must already exist. // There does not need to be an existing schema already. // + // If initialize_stats is present, the fields related to SchemaStore will be + // populated. + // // Returns: // A SchemaStore on success // FAILED_PRECONDITION on any null pointer input // INTERNAL_ERROR on any IO errors static libtextclassifier3::StatusOr<std::unique_ptr<SchemaStore>> Create( - const Filesystem* filesystem, const std::string& base_dir); + const Filesystem* filesystem, const std::string& base_dir, + NativeInitializeStats* initialize_stats = nullptr); // Not copyable SchemaStore(const SchemaStore&) = delete; @@ -229,7 +234,8 @@ class SchemaStore { // Returns: // OK on success // INTERNAL_ERROR on IO error - libtextclassifier3::Status Initialize(); + libtextclassifier3::Status Initialize( + NativeInitializeStats* initialize_stats); // Creates sub-components and verifies the integrity of each sub-component. // diff --git a/icing/schema/schema-store_test.cc b/icing/schema/schema-store_test.cc index 957fd89..4a458b2 100644 --- a/icing/schema/schema-store_test.cc +++ b/icing/schema/schema-store_test.cc @@ -56,10 +56,10 @@ class SchemaStoreTest : public ::testing::Test { property->set_property_name("subject"); property->set_data_type(PropertyConfigProto::DataType::STRING); property->set_cardinality(PropertyConfigProto::Cardinality::OPTIONAL); - property->mutable_indexing_config()->set_term_match_type( + property->mutable_string_indexing_config()->set_term_match_type( TermMatchType::EXACT_ONLY); - property->mutable_indexing_config()->set_tokenizer_type( - IndexingConfig::TokenizerType::PLAIN); + property->mutable_string_indexing_config()->set_tokenizer_type( + StringIndexingConfig::TokenizerType::PLAIN); } void TearDown() override { @@ -444,10 +444,10 @@ TEST_F(SchemaStoreTest, SetSchemaThatRequiresReindexingOk) { // Make a previously unindexed property indexed property = schema.mutable_types(0)->mutable_properties(0); - property->mutable_indexing_config()->set_term_match_type( + property->mutable_string_indexing_config()->set_term_match_type( TermMatchType::EXACT_ONLY); - property->mutable_indexing_config()->set_tokenizer_type( - IndexingConfig::TokenizerType::PLAIN); + property->mutable_string_indexing_config()->set_tokenizer_type( + StringIndexingConfig::TokenizerType::PLAIN); // With a new indexed property, we'll need to reindex result.index_incompatible = true; diff --git a/icing/schema/schema-util.cc b/icing/schema/schema-util.cc index 12f7c4c..a755e88 100644 --- a/icing/schema/schema-util.cc +++ b/icing/schema/schema-util.cc @@ -37,10 +37,6 @@ namespace lib { namespace { -// Data types that can be indexed. This follows rule 11 of SchemaUtil::Validate -static std::unordered_set<PropertyConfigProto::DataType::Code> - kIndexableDataTypes = {PropertyConfigProto::DataType::STRING}; - bool IsCardinalityCompatible(const PropertyConfigProto& old_property, const PropertyConfigProto& new_property) { if (old_property.cardinality() < new_property.cardinality()) { @@ -91,8 +87,8 @@ bool IsPropertyCompatible(const PropertyConfigProto& old_property, IsCardinalityCompatible(old_property, new_property); } -bool IsTermMatchTypeCompatible(const IndexingConfig& old_indexed, - const IndexingConfig& new_indexed) { +bool IsTermMatchTypeCompatible(const StringIndexingConfig& old_indexed, + const StringIndexingConfig& new_indexed) { return old_indexed.term_match_type() == new_indexed.term_match_type() && old_indexed.tokenizer_type() == new_indexed.tokenizer_type(); } @@ -162,9 +158,11 @@ libtextclassifier3::Status SchemaUtil::Validate(const SchemaProto& schema) { ICING_RETURN_IF_ERROR(ValidateCardinality(property_config.cardinality(), schema_type, property_name)); - ICING_RETURN_IF_ERROR( - ValidateIndexingConfig(property_config.indexing_config(), data_type, - schema_type, property_name)); + if (data_type == PropertyConfigProto::DataType::STRING) { + ICING_RETURN_IF_ERROR(ValidateStringIndexingConfig( + property_config.string_indexing_config(), data_type, schema_type, + property_name)); + } } } @@ -239,33 +237,26 @@ libtextclassifier3::Status SchemaUtil::ValidateCardinality( return libtextclassifier3::Status::OK; } -libtextclassifier3::Status SchemaUtil::ValidateIndexingConfig( - const IndexingConfig& config, PropertyConfigProto::DataType::Code data_type, - std::string_view schema_type, std::string_view property_name) { +libtextclassifier3::Status SchemaUtil::ValidateStringIndexingConfig( + const StringIndexingConfig& config, + PropertyConfigProto::DataType::Code data_type, std::string_view schema_type, + std::string_view property_name) { if (config.term_match_type() == TermMatchType::UNKNOWN && - config.tokenizer_type() != IndexingConfig::TokenizerType::NONE) { + config.tokenizer_type() != StringIndexingConfig::TokenizerType::NONE) { // They set a tokenizer type, but no term match type. return absl_ports::InvalidArgumentError(absl_ports::StrCat( - "Indexed property '", schema_type, ".", property_name, + "Indexed string property '", schema_type, ".", property_name, "' cannot have a term match type UNKNOWN")); } if (config.term_match_type() != TermMatchType::UNKNOWN && - config.tokenizer_type() == IndexingConfig::TokenizerType::NONE) { + config.tokenizer_type() == StringIndexingConfig::TokenizerType::NONE) { // They set a term match type, but no tokenizer type return absl_ports::InvalidArgumentError( - absl_ports::StrCat("Indexed property '", property_name, + absl_ports::StrCat("Indexed string property '", property_name, "' cannot have a tokenizer type of NONE")); } - if (config.term_match_type() != TermMatchType::UNKNOWN && - kIndexableDataTypes.find(data_type) == kIndexableDataTypes.end()) { - // They want this section indexed, but it's not an indexable data type. - return absl_ports::InvalidArgumentError(absl_ports::StrCat( - "Cannot index non-string data type for schema property '", schema_type, - ".", property_name, "'")); - } - return libtextclassifier3::Status::OK; } @@ -293,7 +284,7 @@ SchemaUtil::ParsedPropertyConfigs SchemaUtil::ParsePropertyConfigs( // A non-default term_match_type indicates that this property is meant to be // indexed. - if (property_config.indexing_config().term_match_type() != + if (property_config.string_indexing_config().term_match_type() != TermMatchType::UNKNOWN) { parsed_property_configs.num_indexed_properties++; } @@ -368,14 +359,15 @@ const SchemaUtil::SchemaDelta SchemaUtil::ComputeCompatibilityDelta( // A non-default term_match_type indicates that this property is meant to // be indexed. - if (old_property_config.indexing_config().term_match_type() != + if (old_property_config.string_indexing_config().term_match_type() != TermMatchType::UNKNOWN) { ++old_indexed_properties; } // Any change in the indexed property requires a reindexing - if (!IsTermMatchTypeCompatible(old_property_config.indexing_config(), - new_property_config->indexing_config())) { + if (!IsTermMatchTypeCompatible( + old_property_config.string_indexing_config(), + new_property_config->string_indexing_config())) { schema_delta.index_incompatible = true; } } diff --git a/icing/schema/schema-util.h b/icing/schema/schema-util.h index dfa3aa2..ccb2eea 100644 --- a/icing/schema/schema-util.h +++ b/icing/schema/schema-util.h @@ -80,7 +80,7 @@ class SchemaUtil { // 9. PropertyConfigProtos.schema_type's must correspond to a // SchemaTypeConfigProto.schema_type // 10. Property names can only be alphanumeric. - // 11. Only STRING data types are indexed + // 11. Any STRING data types have a valid string_indexing_config // // Returns: // ALREADY_EXISTS for case 1 and 2 @@ -137,6 +137,20 @@ class SchemaUtil { static const SchemaDelta ComputeCompatibilityDelta( const SchemaProto& old_schema, const SchemaProto& new_schema); + // Validates the 'property_name' field. + // 1. Can't be an empty string + // 2. Can only contain alphanumeric characters + // + // NOTE: schema_type is only used for logging. It is not necessary to populate + // it. + // + // RETURNS: + // - OK if property_name is valid + // - INVALID_ARGUMENT if property name is empty or contains an + // non-alphabetic character. + static libtextclassifier3::Status ValidatePropertyName( + std::string_view property_name, std::string_view schema_type = ""); + private: // Validates the 'schema_type' field // @@ -146,16 +160,6 @@ class SchemaUtil { static libtextclassifier3::Status ValidateSchemaType( std::string_view schema_type); - // Validates the 'property_name' field. - // 1. Can't be an empty string - // 2. Can only contain alphanumeric characters - // - // Returns: - // INVALID_ARGUMENT if any of the rules are not followed - // OK on success - static libtextclassifier3::Status ValidatePropertyName( - std::string_view property_name, std::string_view schema_type); - // Validates the 'data_type' field. // // Returns: @@ -174,15 +178,15 @@ class SchemaUtil { PropertyConfigProto::Cardinality::Code cardinality, std::string_view schema_type, std::string_view property_name); - // Checks that the 'indexing_config' satisfies the following rules: + // Checks that the 'string_indexing_config' satisfies the following rules: // 1. Only STRING data types can be indexed // 2. An indexed property must have a valid tokenizer type // // Returns: // INVALID_ARGUMENT if any of the rules are not followed // OK on success - static libtextclassifier3::Status ValidateIndexingConfig( - const IndexingConfig& config, + static libtextclassifier3::Status ValidateStringIndexingConfig( + const StringIndexingConfig& config, PropertyConfigProto::DataType::Code data_type, std::string_view schema_type, std::string_view property_name); }; diff --git a/icing/schema/schema-util_test.cc b/icing/schema/schema-util_test.cc index 6012989..ed3bde7 100644 --- a/icing/schema/schema-util_test.cc +++ b/icing/schema/schema-util_test.cc @@ -486,17 +486,17 @@ TEST_F(SchemaUtilTest, ChangingIndexedPropertiesMakesIndexIncompatible) { schema_delta.index_incompatible = true; // New schema gained a new indexed property. - old_property->mutable_indexing_config()->set_term_match_type( + old_property->mutable_string_indexing_config()->set_term_match_type( TermMatchType::UNKNOWN); - new_property->mutable_indexing_config()->set_term_match_type( + new_property->mutable_string_indexing_config()->set_term_match_type( TermMatchType::EXACT_ONLY); EXPECT_THAT(SchemaUtil::ComputeCompatibilityDelta(old_schema, new_schema), Eq(schema_delta)); // New schema lost an indexed property. - old_property->mutable_indexing_config()->set_term_match_type( + old_property->mutable_string_indexing_config()->set_term_match_type( TermMatchType::EXACT_ONLY); - new_property->mutable_indexing_config()->set_term_match_type( + new_property->mutable_string_indexing_config()->set_term_match_type( TermMatchType::UNKNOWN); EXPECT_THAT(SchemaUtil::ComputeCompatibilityDelta(old_schema, new_schema), Eq(schema_delta)); @@ -527,7 +527,7 @@ TEST_F(SchemaUtilTest, AddingNewIndexedPropertyMakesIndexIncompatible) { new_property->set_property_name("NewIndexedProperty"); new_property->set_data_type(PropertyConfigProto::DataType::STRING); new_property->set_cardinality(PropertyConfigProto::Cardinality::OPTIONAL); - new_property->mutable_indexing_config()->set_term_match_type( + new_property->mutable_string_indexing_config()->set_term_match_type( TermMatchType::EXACT_ONLY); SchemaUtil::SchemaDelta schema_delta; @@ -583,15 +583,15 @@ TEST_F(SchemaUtilTest, ValidateStringIndexingConfigShouldHaveTermMatchType) { prop->set_property_name("Foo"); prop->set_data_type(PropertyConfigProto::DataType::STRING); prop->set_cardinality(PropertyConfigProto::Cardinality::REQUIRED); - prop->mutable_indexing_config()->set_tokenizer_type( - IndexingConfig::TokenizerType::PLAIN); + prop->mutable_string_indexing_config()->set_tokenizer_type( + StringIndexingConfig::TokenizerType::PLAIN); // Error if we don't set a term match type EXPECT_THAT(SchemaUtil::Validate(schema), StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT)); // Passes once we set a term match type - prop->mutable_indexing_config()->set_term_match_type( + prop->mutable_string_indexing_config()->set_term_match_type( TermMatchType::EXACT_ONLY); EXPECT_THAT(SchemaUtil::Validate(schema), IsOk()); } @@ -605,7 +605,7 @@ TEST_F(SchemaUtilTest, ValidateStringIndexingConfigShouldHaveTokenizer) { prop->set_property_name("Foo"); prop->set_data_type(PropertyConfigProto::DataType::STRING); prop->set_cardinality(PropertyConfigProto::Cardinality::REQUIRED); - prop->mutable_indexing_config()->set_term_match_type( + prop->mutable_string_indexing_config()->set_term_match_type( TermMatchType::EXACT_ONLY); // Error if we don't set a tokenizer type @@ -613,190 +613,11 @@ TEST_F(SchemaUtilTest, ValidateStringIndexingConfigShouldHaveTokenizer) { StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT)); // Passes once we set a tokenizer type - prop->mutable_indexing_config()->set_tokenizer_type( - IndexingConfig::TokenizerType::PLAIN); + prop->mutable_string_indexing_config()->set_tokenizer_type( + StringIndexingConfig::TokenizerType::PLAIN); EXPECT_THAT(SchemaUtil::Validate(schema), IsOk()); } -TEST_F(SchemaUtilTest, ValidateIntPropertyShouldntHaveIndexingConfig) { - SchemaProto schema; - auto* type = schema.add_types(); - type->set_schema_type("MyType"); - - auto* prop = type->add_properties(); - prop->set_property_name("IntProperty"); - prop->set_data_type(PropertyConfigProto::DataType::INT64); - prop->set_cardinality(PropertyConfigProto::Cardinality::REQUIRED); - - // Passes if it doesn't have indexing config - EXPECT_THAT(SchemaUtil::Validate(schema), IsOk()); - - // Fails if we try to set an indexing_config.term_match_type - prop->mutable_indexing_config()->set_term_match_type( - TermMatchType::EXACT_ONLY); - EXPECT_THAT(SchemaUtil::Validate(schema), - StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT)); - - // Fails if we try to set an indexing_config.tokenizer_type - prop->mutable_indexing_config()->clear_term_match_type(); - prop->mutable_indexing_config()->set_tokenizer_type( - IndexingConfig::TokenizerType::PLAIN); - EXPECT_THAT(SchemaUtil::Validate(schema), - StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT)); - - // Fails if we try to set an indexing config - prop->mutable_indexing_config()->set_term_match_type( - TermMatchType::EXACT_ONLY); - prop->mutable_indexing_config()->set_tokenizer_type( - IndexingConfig::TokenizerType::PLAIN); - EXPECT_THAT(SchemaUtil::Validate(schema), - StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT)); -} - -TEST_F(SchemaUtilTest, ValidateDoublePropertyShouldntHaveIndexingConfig) { - SchemaProto schema; - auto* type = schema.add_types(); - type->set_schema_type("MyType"); - - auto* prop = type->add_properties(); - prop->set_property_name("DoubleProperty"); - prop->set_data_type(PropertyConfigProto::DataType::DOUBLE); - prop->set_cardinality(PropertyConfigProto::Cardinality::REQUIRED); - - // Passes if it doesn't have indexing config - EXPECT_THAT(SchemaUtil::Validate(schema), IsOk()); - - // Fails if we try to set an indexing_config.term_match_type - prop->mutable_indexing_config()->set_term_match_type( - TermMatchType::EXACT_ONLY); - EXPECT_THAT(SchemaUtil::Validate(schema), - StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT)); - - // Fails if we try to set an indexing_config.tokenizer_type - prop->mutable_indexing_config()->clear_term_match_type(); - prop->mutable_indexing_config()->set_tokenizer_type( - IndexingConfig::TokenizerType::PLAIN); - EXPECT_THAT(SchemaUtil::Validate(schema), - StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT)); - - // Fails if we try to set an indexing config - prop->mutable_indexing_config()->set_term_match_type( - TermMatchType::EXACT_ONLY); - prop->mutable_indexing_config()->set_tokenizer_type( - IndexingConfig::TokenizerType::PLAIN); - EXPECT_THAT(SchemaUtil::Validate(schema), - StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT)); -} - -TEST_F(SchemaUtilTest, ValidateBooleanPropertyShouldntHaveIndexingConfig) { - SchemaProto schema; - auto* type = schema.add_types(); - type->set_schema_type("MyType"); - - auto* prop = type->add_properties(); - prop->set_property_name("BooleanProperty"); - prop->set_data_type(PropertyConfigProto::DataType::BOOLEAN); - prop->set_cardinality(PropertyConfigProto::Cardinality::REQUIRED); - - // Passes if it doesn't have indexing config - EXPECT_THAT(SchemaUtil::Validate(schema), IsOk()); - - // Fails if we try to set an indexing_config.term_match_type - prop->mutable_indexing_config()->set_term_match_type( - TermMatchType::EXACT_ONLY); - EXPECT_THAT(SchemaUtil::Validate(schema), - StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT)); - - // Fails if we try to set an indexing_config.tokenizer_type - prop->mutable_indexing_config()->clear_term_match_type(); - prop->mutable_indexing_config()->set_tokenizer_type( - IndexingConfig::TokenizerType::PLAIN); - EXPECT_THAT(SchemaUtil::Validate(schema), - StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT)); - - // Fails if we try to set an indexing config - prop->mutable_indexing_config()->set_term_match_type( - TermMatchType::EXACT_ONLY); - prop->mutable_indexing_config()->set_tokenizer_type( - IndexingConfig::TokenizerType::PLAIN); - EXPECT_THAT(SchemaUtil::Validate(schema), - StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT)); -} - -TEST_F(SchemaUtilTest, ValidateBytesPropertyShouldntHaveIndexingConfig) { - SchemaProto schema; - auto* type = schema.add_types(); - type->set_schema_type("MyType"); - - auto* prop = type->add_properties(); - prop->set_property_name("BytesProperty"); - prop->set_data_type(PropertyConfigProto::DataType::BYTES); - prop->set_cardinality(PropertyConfigProto::Cardinality::REQUIRED); - - // Passes if it doesn't have indexing config - EXPECT_THAT(SchemaUtil::Validate(schema), IsOk()); - - // Fails if we try to set an indexing_config.term_match_type - prop->mutable_indexing_config()->set_term_match_type( - TermMatchType::EXACT_ONLY); - EXPECT_THAT(SchemaUtil::Validate(schema), - StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT)); - - // Fails if we try to set an indexing_config.tokenizer_type - prop->mutable_indexing_config()->clear_term_match_type(); - prop->mutable_indexing_config()->set_tokenizer_type( - IndexingConfig::TokenizerType::PLAIN); - EXPECT_THAT(SchemaUtil::Validate(schema), - StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT)); - - // Fails if we try to set an indexing config - prop->mutable_indexing_config()->set_term_match_type( - TermMatchType::EXACT_ONLY); - prop->mutable_indexing_config()->set_tokenizer_type( - IndexingConfig::TokenizerType::PLAIN); - EXPECT_THAT(SchemaUtil::Validate(schema), - StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT)); -} - -TEST_F(SchemaUtilTest, ValidateDocumentPropertyShouldntHaveIndexingConfig) { - SchemaProto schema; - auto* type = schema.add_types(); - type->set_schema_type("OtherType"); - - type = schema.add_types(); - type->set_schema_type("MyType"); - - auto* prop = type->add_properties(); - prop->set_property_name("SubType"); - prop->set_schema_type("OtherType"); - prop->set_data_type(PropertyConfigProto::DataType::DOCUMENT); - prop->set_cardinality(PropertyConfigProto::Cardinality::REQUIRED); - - // Passes if it doesn't have indexing config - EXPECT_THAT(SchemaUtil::Validate(schema), IsOk()); - - // Fails if we try to set an indexing_config.term_match_type - prop->mutable_indexing_config()->set_term_match_type( - TermMatchType::EXACT_ONLY); - EXPECT_THAT(SchemaUtil::Validate(schema), - StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT)); - - // Fails if we try to set an indexing_config.tokenizer_type - prop->mutable_indexing_config()->clear_term_match_type(); - prop->mutable_indexing_config()->set_tokenizer_type( - IndexingConfig::TokenizerType::PLAIN); - EXPECT_THAT(SchemaUtil::Validate(schema), - StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT)); - - // Fails if we try to set an indexing config - prop->mutable_indexing_config()->set_term_match_type( - TermMatchType::EXACT_ONLY); - prop->mutable_indexing_config()->set_tokenizer_type( - IndexingConfig::TokenizerType::PLAIN); - EXPECT_THAT(SchemaUtil::Validate(schema), - StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT)); -} - } // namespace } // namespace lib diff --git a/icing/schema/section-manager.cc b/icing/schema/section-manager.cc index 0eed2fe..73aa947 100644 --- a/icing/schema/section-manager.cc +++ b/icing/schema/section-manager.cc @@ -119,24 +119,33 @@ libtextclassifier3::Status AssignSections( for (const auto& property_config : sorted_properties) { if (property_config.data_type() == PropertyConfigProto::DataType::DOCUMENT) { - // Tries to find sections recursively auto nested_type_config_iter = type_config_map.find(property_config.schema_type()); if (nested_type_config_iter == type_config_map.end()) { + // This should never happen because our schema should already be + // validated by this point. return absl_ports::NotFoundError(absl_ports::StrCat( - "type config not found: ", property_config.schema_type())); + "Type config not found: ", property_config.schema_type())); + } + + if (property_config.document_indexing_config() + .index_nested_properties()) { + // Assign any indexed sections recursively + const SchemaTypeConfigProto& nested_type_config = + nested_type_config_iter->second; + ICING_RETURN_IF_ERROR( + AssignSections(nested_type_config, + ConcatenatePath(current_section_path, + property_config.property_name()), + type_config_map, visited_states, metadata_list)); } - const SchemaTypeConfigProto& nested_type_config = - nested_type_config_iter->second; - ICING_RETURN_IF_ERROR( - AssignSections(nested_type_config, - ConcatenatePath(current_section_path, - property_config.property_name()), - type_config_map, visited_states, metadata_list)); } - if (property_config.indexing_config().term_match_type() == - TermMatchType::UNKNOWN) { + // Only index strings currently. + if (property_config.has_data_type() != + PropertyConfigProto::DataType::STRING || + property_config.string_indexing_config().term_match_type() == + TermMatchType::UNKNOWN) { // No need to create section for current property continue; } @@ -155,8 +164,9 @@ libtextclassifier3::Status AssignSections( } // Creates section metadata from property config metadata_list->emplace_back( - new_section_id, property_config.indexing_config().term_match_type(), - property_config.indexing_config().tokenizer_type(), + new_section_id, + property_config.string_indexing_config().term_match_type(), + property_config.string_indexing_config().tokenizer_type(), ConcatenatePath(current_section_path, property_config.property_name())); } return libtextclassifier3::Status::OK; @@ -199,16 +209,6 @@ std::vector<std::string> GetPropertyContent(const PropertyProto& property) { if (!property.string_values().empty()) { std::copy(property.string_values().begin(), property.string_values().end(), std::back_inserter(values)); - } else if (!property.int64_values().empty()) { - std::transform( - property.int64_values().begin(), property.int64_values().end(), - std::back_inserter(values), - [](int64_t i) { return IcingStringUtil::StringPrintf("%" PRId64, i); }); - } else { - std::transform( - property.double_values().begin(), property.double_values().end(), - std::back_inserter(values), - [](double d) { return IcingStringUtil::StringPrintf("%f", d); }); } return values; } @@ -264,9 +264,8 @@ SectionManager::GetSectionContent(const DocumentProto& document, // Property name not found, it could be one of the following 2 cases: // 1. The property is optional and it's not in the document // 2. The property name is invalid - return absl_ports::NotFoundError( - absl_ports::StrCat("Section path ", section_path, - " not found in type config ", document.schema())); + return absl_ports::NotFoundError(absl_ports::StrCat( + "Section path '", section_path, "' not found in document.")); } if (separator_position == std::string::npos) { @@ -275,9 +274,8 @@ SectionManager::GetSectionContent(const DocumentProto& document, if (content.empty()) { // The content of property is explicitly set to empty, we'll treat it as // NOT_FOUND because the index doesn't care about empty strings. - return absl_ports::NotFoundError( - absl_ports::StrCat("Section path ", section_path, - " not found in type config ", document.schema())); + return absl_ports::NotFoundError(absl_ports::StrCat( + "Section path '", section_path, "' content was empty")); } return content; } diff --git a/icing/schema/section-manager_test.cc b/icing/schema/section-manager_test.cc index ad9d07d..1a4d324 100644 --- a/icing/schema/section-manager_test.cc +++ b/icing/schema/section-manager_test.cc @@ -20,6 +20,7 @@ #include "gtest/gtest.h" #include "icing/document-builder.h" #include "icing/file/filesystem.h" +#include "icing/proto/schema.proto.h" #include "icing/proto/schema.pb.h" #include "icing/proto/term.pb.h" #include "icing/schema/schema-util.h" @@ -29,9 +30,12 @@ namespace icing { namespace lib { + using ::testing::ElementsAre; using ::testing::Eq; using ::testing::HasSubstr; +using ::testing::IsEmpty; +using ::testing::SizeIs; // type and property names of EmailMessage constexpr char kTypeEmail[] = "EmailMessage"; @@ -93,16 +97,16 @@ class SectionManagerTest : public ::testing::Test { subject->set_property_name(kPropertySubject); subject->set_data_type(PropertyConfigProto::DataType::STRING); subject->set_cardinality(PropertyConfigProto::Cardinality::REQUIRED); - subject->mutable_indexing_config()->set_term_match_type( + subject->mutable_string_indexing_config()->set_term_match_type( TermMatchType::EXACT_ONLY); - subject->mutable_indexing_config()->set_tokenizer_type( - IndexingConfig::TokenizerType::PLAIN); + subject->mutable_string_indexing_config()->set_tokenizer_type( + StringIndexingConfig::TokenizerType::PLAIN); auto text = type.add_properties(); text->set_property_name(kPropertyText); text->set_data_type(PropertyConfigProto::DataType::STRING); text->set_cardinality(PropertyConfigProto::Cardinality::OPTIONAL); - text->mutable_indexing_config()->set_term_match_type( + text->mutable_string_indexing_config()->set_term_match_type( TermMatchType::UNKNOWN); auto attachment = type.add_properties(); @@ -114,10 +118,10 @@ class SectionManagerTest : public ::testing::Test { recipients->set_property_name(kPropertyRecipients); recipients->set_data_type(PropertyConfigProto::DataType::STRING); recipients->set_cardinality(PropertyConfigProto::Cardinality::REPEATED); - recipients->mutable_indexing_config()->set_term_match_type( + recipients->mutable_string_indexing_config()->set_term_match_type( TermMatchType::EXACT_ONLY); - recipients->mutable_indexing_config()->set_tokenizer_type( - IndexingConfig::TokenizerType::PLAIN); + recipients->mutable_string_indexing_config()->set_tokenizer_type( + StringIndexingConfig::TokenizerType::PLAIN); return type; } @@ -130,7 +134,7 @@ class SectionManagerTest : public ::testing::Test { name->set_property_name(kPropertyName); name->set_data_type(PropertyConfigProto::DataType::STRING); name->set_cardinality(PropertyConfigProto::Cardinality::OPTIONAL); - name->mutable_indexing_config()->set_term_match_type( + name->mutable_string_indexing_config()->set_term_match_type( TermMatchType::EXACT_ONLY); auto emails = type.add_properties(); @@ -138,6 +142,8 @@ class SectionManagerTest : public ::testing::Test { emails->set_data_type(PropertyConfigProto::DataType::DOCUMENT); emails->set_cardinality(PropertyConfigProto::Cardinality::REPEATED); emails->set_schema_type(kTypeEmail); + emails->mutable_document_indexing_config()->set_index_nested_properties( + true); return type; } @@ -166,6 +172,8 @@ TEST_F(SectionManagerTest, CreationWithSchemaInfiniteLoopShouldFail) { property1->set_data_type(PropertyConfigProto::DataType::DOCUMENT); property1->set_schema_type("type2"); // Here we reference type2 property1->set_cardinality(PropertyConfigProto::Cardinality::REQUIRED); + property1->mutable_document_indexing_config()->set_index_nested_properties( + true); SchemaTypeConfigProto type_config2; type_config2.set_schema_type("type2"); @@ -175,6 +183,8 @@ TEST_F(SectionManagerTest, CreationWithSchemaInfiniteLoopShouldFail) { // Here we reference type1, which references type2 causing the infinite loop property2->set_schema_type("type1"); property2->set_cardinality(PropertyConfigProto::Cardinality::REQUIRED); + property2->mutable_document_indexing_config()->set_index_nested_properties( + true); SchemaUtil::TypeConfigMap type_config_map; type_config_map.emplace("type1", type_config1); @@ -194,11 +204,13 @@ TEST_F(SectionManagerTest, CreationWithSchemaSelfReferenceShouldFail) { property1->set_property_name("property1"); property1->set_data_type(PropertyConfigProto::DataType::STRING); property1->set_cardinality(PropertyConfigProto::Cardinality::REQUIRED); - property1->mutable_indexing_config()->set_term_match_type( + property1->mutable_string_indexing_config()->set_term_match_type( TermMatchType::EXACT_ONLY); auto property2 = type_config.add_properties(); property2->set_property_name("property2"); property2->set_data_type(PropertyConfigProto::DataType::DOCUMENT); + property2->mutable_document_indexing_config()->set_index_nested_properties( + true); // Here we're referencing our own type, causing an infinite loop property2->set_schema_type("type"); property2->set_cardinality(PropertyConfigProto::Cardinality::REQUIRED); @@ -222,7 +234,7 @@ TEST_F(SectionManagerTest, CreationWithTooManyPropertiesShouldFail) { property->set_property_name("property" + std::to_string(i)); property->set_data_type(PropertyConfigProto::DataType::STRING); property->set_cardinality(PropertyConfigProto::Cardinality::REQUIRED); - property->mutable_indexing_config()->set_term_match_type( + property->mutable_string_indexing_config()->set_term_match_type( TermMatchType::EXACT_ONLY); } @@ -235,24 +247,6 @@ TEST_F(SectionManagerTest, CreationWithTooManyPropertiesShouldFail) { HasSubstr("Too many properties"))); } -TEST_F(SectionManagerTest, CreationWithUnknownSchemaTypeNameShouldFail) { - SchemaTypeConfigProto type_config; - type_config.set_schema_type("type"); - auto property = type_config.add_properties(); - property->set_property_name("property"); - property->set_data_type(PropertyConfigProto::DataType::DOCUMENT); - property->set_schema_type("unknown_name"); - property->set_cardinality(PropertyConfigProto::Cardinality::REQUIRED); - - SchemaUtil::TypeConfigMap type_config_map; - type_config_map.emplace("type", type_config); - - EXPECT_THAT( - SectionManager::Create(type_config_map, schema_type_mapper_.get()), - StatusIs(libtextclassifier3::StatusCode::NOT_FOUND, - HasSubstr("type config not found"))); -} - TEST_F(SectionManagerTest, GetSectionContent) { ICING_ASSERT_OK_AND_ASSIGN( auto section_manager, @@ -393,5 +387,261 @@ TEST_F(SectionManagerTest, ExtractSections) { EXPECT_THAT(sections[1].content, ElementsAre("the subject", "the subject")); } +TEST_F(SectionManagerTest, + NonStringFieldsWithStringIndexingConfigDontCreateSections) { + // Create a schema for an empty document. + SchemaTypeConfigProto empty_type; + empty_type.set_schema_type("EmptySchema"); + + // Create a schema with all the non-string fields + SchemaTypeConfigProto type_with_non_string_properties; + type_with_non_string_properties.set_schema_type("Schema"); + + // Create an int property with a string_indexing_config + auto int_property = type_with_non_string_properties.add_properties(); + int_property->set_property_name("int"); + int_property->set_data_type(PropertyConfigProto::DataType::INT64); + int_property->set_cardinality(PropertyConfigProto::Cardinality::REQUIRED); + int_property->mutable_string_indexing_config()->set_term_match_type( + TermMatchType::EXACT_ONLY); + int_property->mutable_string_indexing_config()->set_tokenizer_type( + StringIndexingConfig::TokenizerType::PLAIN); + + // Create a double property with a string_indexing_config + auto double_property = type_with_non_string_properties.add_properties(); + double_property->set_property_name("double"); + double_property->set_data_type(PropertyConfigProto::DataType::DOUBLE); + double_property->set_cardinality(PropertyConfigProto::Cardinality::REQUIRED); + double_property->mutable_string_indexing_config()->set_term_match_type( + TermMatchType::EXACT_ONLY); + double_property->mutable_string_indexing_config()->set_tokenizer_type( + StringIndexingConfig::TokenizerType::PLAIN); + + // Create a boolean property with a string_indexing_config + auto boolean_property = type_with_non_string_properties.add_properties(); + boolean_property->set_property_name("boolean"); + boolean_property->set_data_type(PropertyConfigProto::DataType::BOOLEAN); + boolean_property->set_cardinality(PropertyConfigProto::Cardinality::REQUIRED); + boolean_property->mutable_string_indexing_config()->set_term_match_type( + TermMatchType::EXACT_ONLY); + boolean_property->mutable_string_indexing_config()->set_tokenizer_type( + StringIndexingConfig::TokenizerType::PLAIN); + + // Create a bytes property with a string_indexing_config + auto bytes_property = type_with_non_string_properties.add_properties(); + bytes_property->set_property_name("bytes"); + bytes_property->set_data_type(PropertyConfigProto::DataType::BYTES); + bytes_property->set_cardinality(PropertyConfigProto::Cardinality::REQUIRED); + bytes_property->mutable_string_indexing_config()->set_term_match_type( + TermMatchType::EXACT_ONLY); + bytes_property->mutable_string_indexing_config()->set_tokenizer_type( + StringIndexingConfig::TokenizerType::PLAIN); + + // Create a document property with a string_indexing_config + auto document_property = type_with_non_string_properties.add_properties(); + document_property->set_property_name("document"); + document_property->set_data_type(PropertyConfigProto::DataType::DOCUMENT); + document_property->set_schema_type(empty_type.schema_type()); + document_property->set_cardinality( + PropertyConfigProto::Cardinality::REQUIRED); + document_property->mutable_string_indexing_config()->set_term_match_type( + TermMatchType::EXACT_ONLY); + document_property->mutable_string_indexing_config()->set_tokenizer_type( + StringIndexingConfig::TokenizerType::PLAIN); + + // Setup classes to create the section manager + SchemaUtil::TypeConfigMap type_config_map; + type_config_map.emplace(type_with_non_string_properties.schema_type(), + type_with_non_string_properties); + type_config_map.emplace(empty_type.schema_type(), empty_type); + + // KeyMapper uses 3 internal arrays for bookkeeping. Give each one 128KiB so + // the total KeyMapper should get 384KiB + int key_mapper_size = 3 * 128 * 1024; + std::string dir = GetTestTempDir() + "/non_string_fields"; + ICING_ASSERT_OK_AND_ASSIGN( + std::unique_ptr<KeyMapper<SchemaTypeId>> schema_type_mapper, + KeyMapper<SchemaTypeId>::Create(filesystem_, dir, key_mapper_size)); + ICING_ASSERT_OK(schema_type_mapper->Put( + type_with_non_string_properties.schema_type(), /*schema_type_id=*/0)); + ICING_ASSERT_OK(schema_type_mapper->Put(empty_type.schema_type(), + /*schema_type_id=*/1)); + + ICING_ASSERT_OK_AND_ASSIGN( + auto section_manager, + SectionManager::Create(type_config_map, schema_type_mapper.get())); + + // Create an empty document to be nested + DocumentProto empty_document = DocumentBuilder() + .SetKey("icing", "uri1") + .SetSchema(empty_type.schema_type()) + .Build(); + + // Create a document that follows "Schema" + DocumentProto document = + DocumentBuilder() + .SetKey("icing", "uri2") + .SetSchema(type_with_non_string_properties.schema_type()) + .AddInt64Property("int", 1) + .AddDoubleProperty("double", 0.2) + .AddBooleanProperty("boolean", true) + .AddBytesProperty("bytes", "attachment bytes") + .AddDocumentProperty("document", empty_document) + .Build(); + + // Extracts sections from 'Schema' document + ICING_ASSERT_OK_AND_ASSIGN(auto sections, + section_manager->ExtractSections(document)); + EXPECT_THAT(sections.size(), Eq(0)); +} + +TEST_F(SectionManagerTest, AssignSectionsRecursivelyForDocumentFields) { + // Create the inner schema that the document property is. + SchemaTypeConfigProto document_type; + document_type.set_schema_type("DocumentSchema"); + + auto string_property = document_type.add_properties(); + string_property->set_property_name("string"); + string_property->set_data_type(PropertyConfigProto::DataType::STRING); + string_property->set_cardinality(PropertyConfigProto::Cardinality::REQUIRED); + string_property->mutable_string_indexing_config()->set_term_match_type( + TermMatchType::EXACT_ONLY); + string_property->mutable_string_indexing_config()->set_tokenizer_type( + StringIndexingConfig::TokenizerType::PLAIN); + + // Create the outer schema which has the document property. + SchemaTypeConfigProto type; + type.set_schema_type("Schema"); + + auto document_property = type.add_properties(); + document_property->set_property_name("document"); + document_property->set_data_type(PropertyConfigProto::DataType::DOCUMENT); + document_property->set_schema_type(document_type.schema_type()); + document_property->set_cardinality( + PropertyConfigProto::Cardinality::REQUIRED); + + // Opt into recursing into the document fields. + document_property->mutable_document_indexing_config() + ->set_index_nested_properties(true); + + // Create the inner document. + DocumentProto inner_document = DocumentBuilder() + .SetKey("icing", "uri1") + .SetSchema(document_type.schema_type()) + .AddStringProperty("string", "foo") + .Build(); + + // Create the outer document that holds the inner document + DocumentProto outer_document = + DocumentBuilder() + .SetKey("icing", "uri2") + .SetSchema(type.schema_type()) + .AddDocumentProperty("document", inner_document) + .Build(); + + // Setup classes to create the section manager + SchemaUtil::TypeConfigMap type_config_map; + type_config_map.emplace(type.schema_type(), type); + type_config_map.emplace(document_type.schema_type(), document_type); + + // KeyMapper uses 3 internal arrays for bookkeeping. Give each one 128KiB so + // the total KeyMapper should get 384KiB + int key_mapper_size = 3 * 128 * 1024; + std::string dir = GetTestTempDir() + "/recurse_into_document"; + ICING_ASSERT_OK_AND_ASSIGN( + std::unique_ptr<KeyMapper<SchemaTypeId>> schema_type_mapper, + KeyMapper<SchemaTypeId>::Create(filesystem_, dir, key_mapper_size)); + int type_schema_type_id = 0; + int document_type_schema_type_id = 1; + ICING_ASSERT_OK( + schema_type_mapper->Put(type.schema_type(), type_schema_type_id)); + ICING_ASSERT_OK(schema_type_mapper->Put(document_type.schema_type(), + document_type_schema_type_id)); + + ICING_ASSERT_OK_AND_ASSIGN( + auto section_manager, + SectionManager::Create(type_config_map, schema_type_mapper.get())); + + // Extracts sections from 'Schema' document; there should be the 1 string + // property inside the document. + ICING_ASSERT_OK_AND_ASSIGN(std::vector<Section> sections, + section_manager->ExtractSections(outer_document)); + EXPECT_THAT(sections, SizeIs(1)); +} + +TEST_F(SectionManagerTest, DontAssignSectionsRecursivelyForDocumentFields) { + // Create the inner schema that the document property is. + SchemaTypeConfigProto document_type; + document_type.set_schema_type("DocumentSchema"); + + auto string_property = document_type.add_properties(); + string_property->set_property_name("string"); + string_property->set_data_type(PropertyConfigProto::DataType::STRING); + string_property->set_cardinality(PropertyConfigProto::Cardinality::REQUIRED); + string_property->mutable_string_indexing_config()->set_term_match_type( + TermMatchType::EXACT_ONLY); + string_property->mutable_string_indexing_config()->set_tokenizer_type( + StringIndexingConfig::TokenizerType::PLAIN); + + // Create the outer schema which has the document property. + SchemaTypeConfigProto type; + type.set_schema_type("Schema"); + + auto document_property = type.add_properties(); + document_property->set_property_name("document"); + document_property->set_data_type(PropertyConfigProto::DataType::DOCUMENT); + document_property->set_schema_type(document_type.schema_type()); + document_property->set_cardinality( + PropertyConfigProto::Cardinality::REQUIRED); + + // Opt into recursing into the document fields. + document_property->mutable_document_indexing_config() + ->set_index_nested_properties(false); + + // Create the inner document. + DocumentProto inner_document = DocumentBuilder() + .SetKey("icing", "uri1") + .SetSchema(document_type.schema_type()) + .AddStringProperty("string", "foo") + .Build(); + + // Create the outer document that holds the inner document + DocumentProto outer_document = + DocumentBuilder() + .SetKey("icing", "uri2") + .SetSchema(type.schema_type()) + .AddDocumentProperty("document", inner_document) + .Build(); + + // Setup classes to create the section manager + SchemaUtil::TypeConfigMap type_config_map; + type_config_map.emplace(type.schema_type(), type); + type_config_map.emplace(document_type.schema_type(), document_type); + + // KeyMapper uses 3 internal arrays for bookkeeping. Give each one 128KiB so + // the total KeyMapper should get 384KiB + int key_mapper_size = 3 * 128 * 1024; + std::string dir = GetTestTempDir() + "/recurse_into_document"; + ICING_ASSERT_OK_AND_ASSIGN( + std::unique_ptr<KeyMapper<SchemaTypeId>> schema_type_mapper, + KeyMapper<SchemaTypeId>::Create(filesystem_, dir, key_mapper_size)); + int type_schema_type_id = 0; + int document_type_schema_type_id = 1; + ICING_ASSERT_OK( + schema_type_mapper->Put(type.schema_type(), type_schema_type_id)); + ICING_ASSERT_OK(schema_type_mapper->Put(document_type.schema_type(), + document_type_schema_type_id)); + + ICING_ASSERT_OK_AND_ASSIGN( + auto section_manager, + SectionManager::Create(type_config_map, schema_type_mapper.get())); + + // Extracts sections from 'Schema' document; there won't be any since we + // didn't recurse into the document to see the inner string property + ICING_ASSERT_OK_AND_ASSIGN(std::vector<Section> sections, + section_manager->ExtractSections(outer_document)); + EXPECT_THAT(sections, IsEmpty()); +} + } // namespace lib } // namespace icing diff --git a/icing/schema/section.h b/icing/schema/section.h index daf4fd0..7669c97 100644 --- a/icing/schema/section.h +++ b/icing/schema/section.h @@ -54,9 +54,9 @@ struct SectionMetadata { // A unique id of property within a type config SectionId id; - // How content in this section should be tokenized. It is invalid for a - // section to have tokenizer == 'NONE'. - IndexingConfig::TokenizerType::Code tokenizer; + // How strings should be tokenized. It is invalid for a section to have + // tokenizer == 'NONE'. + StringIndexingConfig::TokenizerType::Code tokenizer; // How tokens in this section should be matched. // @@ -71,7 +71,7 @@ struct SectionMetadata { TermMatchType::Code term_match_type = TermMatchType::UNKNOWN; SectionMetadata(SectionId id_in, TermMatchType::Code term_match_type_in, - IndexingConfig::TokenizerType::Code tokenizer, + StringIndexingConfig::TokenizerType::Code tokenizer, std::string&& path_in) : path(std::move(path_in)), id(id_in), diff --git a/icing/store/document-store.cc b/icing/store/document-store.cc index 1e47d59..8ddde14 100644 --- a/icing/store/document-store.cc +++ b/icing/store/document-store.cc @@ -35,6 +35,7 @@ #include "icing/legacy/core/icing-string-util.h" #include "icing/proto/document.pb.h" #include "icing/proto/document_wrapper.pb.h" +#include "icing/proto/logging.pb.h" #include "icing/schema/schema-store.h" #include "icing/store/document-associated-score-data.h" #include "icing/store/document-filter-data.h" @@ -45,6 +46,7 @@ #include "icing/util/crc32.h" #include "icing/util/logging.h" #include "icing/util/status-macros.h" +#include "icing/util/timer.h" namespace icing { namespace lib { @@ -185,8 +187,8 @@ DocumentStore::DocumentStore(const Filesystem* filesystem, document_validator_(schema_store) {} libtextclassifier3::StatusOr<DocumentId> DocumentStore::Put( - const DocumentProto& document) { - return Put(DocumentProto(document)); + const DocumentProto& document, NativePutDocumentStats* put_document_stats) { + return Put(DocumentProto(document), put_document_stats); } DocumentStore::~DocumentStore() { @@ -200,18 +202,20 @@ DocumentStore::~DocumentStore() { libtextclassifier3::StatusOr<std::unique_ptr<DocumentStore>> DocumentStore::Create(const Filesystem* filesystem, const std::string& base_dir, - const Clock* clock, const SchemaStore* schema_store) { + const Clock* clock, const SchemaStore* schema_store, + NativeInitializeStats* initialize_stats) { ICING_RETURN_ERROR_IF_NULL(filesystem); ICING_RETURN_ERROR_IF_NULL(clock); ICING_RETURN_ERROR_IF_NULL(schema_store); auto document_store = std::unique_ptr<DocumentStore>( new DocumentStore(filesystem, base_dir, clock, schema_store)); - ICING_RETURN_IF_ERROR(document_store->Initialize()); + ICING_RETURN_IF_ERROR(document_store->Initialize(initialize_stats)); return document_store; } -libtextclassifier3::Status DocumentStore::Initialize() { +libtextclassifier3::Status DocumentStore::Initialize( + NativeInitializeStats* initialize_stats) { auto create_result_or = FileBackedProtoLog<DocumentWrapper>::Create( filesystem_, MakeDocumentLogFilename(base_dir_), FileBackedProtoLog<DocumentWrapper>::Options( @@ -227,10 +231,30 @@ libtextclassifier3::Status DocumentStore::Initialize() { std::move(create_result_or).ValueOrDie(); document_log_ = std::move(create_result.proto_log); - if (create_result.data_loss) { + if (create_result.has_data_loss()) { ICING_LOG(WARNING) << "Data loss in document log, regenerating derived files."; + if (initialize_stats != nullptr) { + initialize_stats->set_document_store_recovery_cause( + NativeInitializeStats::DATA_LOSS); + + if (create_result.data_status == + FileBackedProtoLog<DocumentWrapper>::CreateResult::PARTIAL_LOSS) { + // Ground truth is partially lost. + initialize_stats->set_document_store_data_status( + NativeInitializeStats::PARTIAL_LOSS); + } else { + // Ground truth is completely lost. + initialize_stats->set_document_store_data_status( + NativeInitializeStats::COMPLETE_LOSS); + } + } + Timer document_recovery_timer; libtextclassifier3::Status status = RegenerateDerivedFiles(); + if (initialize_stats != nullptr) { + initialize_stats->set_document_store_recovery_latency_ms( + document_recovery_timer.GetElapsedMilliseconds()); + } if (!status.ok()) { ICING_LOG(ERROR) << "Failed to regenerate derived files for DocumentStore"; @@ -241,7 +265,16 @@ libtextclassifier3::Status DocumentStore::Initialize() { ICING_VLOG(1) << "Couldn't find derived files or failed to initialize them, " "regenerating derived files for DocumentStore."; + if (initialize_stats != nullptr) { + initialize_stats->set_document_store_recovery_cause( + NativeInitializeStats::IO_ERROR); + } + Timer document_recovery_timer; libtextclassifier3::Status status = RegenerateDerivedFiles(); + if (initialize_stats != nullptr) { + initialize_stats->set_document_store_recovery_latency_ms( + document_recovery_timer.GetElapsedMilliseconds()); + } if (!status.ok()) { ICING_LOG(ERROR) << "Failed to regenerate derived files for DocumentStore"; @@ -251,6 +284,9 @@ libtextclassifier3::Status DocumentStore::Initialize() { } initialized_ = true; + if (initialize_stats != nullptr) { + initialize_stats->set_num_documents(document_id_mapper_->num_elements()); + } return libtextclassifier3::Status::OK; } @@ -689,9 +725,14 @@ libtextclassifier3::Status DocumentStore::UpdateHeader(const Crc32& checksum) { } libtextclassifier3::StatusOr<DocumentId> DocumentStore::Put( - DocumentProto&& document) { + DocumentProto&& document, NativePutDocumentStats* put_document_stats) { + Timer put_timer; ICING_RETURN_IF_ERROR(document_validator_.Validate(document)); + if (put_document_stats != nullptr) { + put_document_stats->set_document_size(document.ByteSizeLong()); + } + // Copy fields needed before they are moved std::string name_space = document.namespace_(); std::string uri = document.uri(); @@ -765,6 +806,11 @@ libtextclassifier3::StatusOr<DocumentId> DocumentStore::Put( } } + if (put_document_stats != nullptr) { + put_document_stats->set_document_store_latency_ms( + put_timer.GetElapsedMilliseconds()); + } + return new_document_id; } diff --git a/icing/store/document-store.h b/icing/store/document-store.h index 5c1b902..d6ffbaa 100644 --- a/icing/store/document-store.h +++ b/icing/store/document-store.h @@ -28,6 +28,7 @@ #include "icing/file/filesystem.h" #include "icing/proto/document.pb.h" #include "icing/proto/document_wrapper.pb.h" +#include "icing/proto/logging.pb.h" #include "icing/schema/schema-store.h" #include "icing/store/document-associated-score-data.h" #include "icing/store/document-filter-data.h" @@ -81,8 +82,11 @@ class DocumentStore { // previously initialized with this directory, it will reload the files saved // by the last instance. // - // Does not take any ownership, and all pointers must refer to valid objects - // that outlive the one constructed. + // If initialize_stats is present, the fields related to DocumentStore will be + // populated. + // + // Does not take any ownership, and all pointers except initialize_stats must + // refer to valid objects that outlive the one constructed. // // TODO(cassiewang): Consider returning a status indicating that derived files // were regenerated. This may be helpful in logs. @@ -93,29 +97,41 @@ class DocumentStore { // INTERNAL_ERROR on IO error static libtextclassifier3::StatusOr<std::unique_ptr<DocumentStore>> Create( const Filesystem* filesystem, const std::string& base_dir, - const Clock* clock, const SchemaStore* schema_store); + const Clock* clock, const SchemaStore* schema_store, + NativeInitializeStats* initialize_stats = nullptr); // Returns the maximum DocumentId that the DocumentStore has assigned. If // there has not been any DocumentIds assigned, i.e. the DocumentStore is // empty, then kInvalidDocumentId is returned. This does not filter out - // DocumentIds of deleted documents. - const DocumentId last_added_document_id() const { + // DocumentIds of deleted or expired documents. + DocumentId last_added_document_id() const { if (document_id_mapper_->num_elements() == 0) { return kInvalidDocumentId; } return document_id_mapper_->num_elements() - 1; } + // Returns the number of documents. The result does not filter out DocumentIds + // of deleted or expired documents. + int num_documents() const { return document_id_mapper_->num_elements(); } + // Puts the document into document store. // + // If put_document_stats is present, the fields related to DocumentStore will + // be populated. + // // Returns: // A newly generated document id on success // FAILED_PRECONDITION if schema hasn't been set yet // NOT_FOUND if the schema_type or a property config of the document doesn't // exist in schema // INTERNAL_ERROR on IO error - libtextclassifier3::StatusOr<DocumentId> Put(const DocumentProto& document); - libtextclassifier3::StatusOr<DocumentId> Put(DocumentProto&& document); + libtextclassifier3::StatusOr<DocumentId> Put( + const DocumentProto& document, + NativePutDocumentStats* put_document_stats = nullptr); + libtextclassifier3::StatusOr<DocumentId> Put( + DocumentProto&& document, + NativePutDocumentStats* put_document_stats = nullptr); // Finds and returns the document identified by the given key (namespace + // uri) @@ -422,7 +438,8 @@ class DocumentStore { // worry about this field. bool initialized_ = false; - libtextclassifier3::Status Initialize(); + libtextclassifier3::Status Initialize( + NativeInitializeStats* initialize_stats); // Creates sub-components and verifies the integrity of each sub-component. // diff --git a/icing/store/document-store_test.cc b/icing/store/document-store_test.cc index 301dbdd..d97ec46 100644 --- a/icing/store/document-store_test.cc +++ b/icing/store/document-store_test.cc @@ -111,19 +111,19 @@ class DocumentStoreTest : public ::testing::Test { subject->set_property_name("subject"); subject->set_data_type(PropertyConfigProto::DataType::STRING); subject->set_cardinality(PropertyConfigProto::Cardinality::OPTIONAL); - subject->mutable_indexing_config()->set_term_match_type( + subject->mutable_string_indexing_config()->set_term_match_type( TermMatchType::EXACT_ONLY); - subject->mutable_indexing_config()->set_tokenizer_type( - IndexingConfig::TokenizerType::PLAIN); + subject->mutable_string_indexing_config()->set_tokenizer_type( + StringIndexingConfig::TokenizerType::PLAIN); auto body = type_config->add_properties(); body->set_property_name("body"); body->set_data_type(PropertyConfigProto::DataType::STRING); body->set_cardinality(PropertyConfigProto::Cardinality::OPTIONAL); - body->mutable_indexing_config()->set_term_match_type( + body->mutable_string_indexing_config()->set_term_match_type( TermMatchType::EXACT_ONLY); - body->mutable_indexing_config()->set_tokenizer_type( - IndexingConfig::TokenizerType::PLAIN); + body->mutable_string_indexing_config()->set_tokenizer_type( + StringIndexingConfig::TokenizerType::PLAIN); ICING_ASSERT_OK_AND_ASSIGN( schema_store_, SchemaStore::Create(&filesystem_, schema_store_dir_)); @@ -1941,10 +1941,10 @@ TEST_F(DocumentStoreTest, UpdateSchemaStoreDeletesInvalidDocuments) { property_config->set_property_name("subject"); property_config->set_data_type(PropertyConfigProto::DataType::STRING); property_config->set_cardinality(PropertyConfigProto::Cardinality::OPTIONAL); - property_config->mutable_indexing_config()->set_term_match_type( + property_config->mutable_string_indexing_config()->set_term_match_type( TermMatchType::EXACT_ONLY); - property_config->mutable_indexing_config()->set_tokenizer_type( - IndexingConfig::TokenizerType::PLAIN); + property_config->mutable_string_indexing_config()->set_tokenizer_type( + StringIndexingConfig::TokenizerType::PLAIN); ICING_ASSERT_OK_AND_ASSIGN( std::unique_ptr<SchemaStore> schema_store, @@ -2168,10 +2168,10 @@ TEST_F(DocumentStoreTest, OptimizedUpdateSchemaStoreDeletesInvalidDocuments) { property_config->set_property_name("subject"); property_config->set_data_type(PropertyConfigProto::DataType::STRING); property_config->set_cardinality(PropertyConfigProto::Cardinality::OPTIONAL); - property_config->mutable_indexing_config()->set_term_match_type( + property_config->mutable_string_indexing_config()->set_term_match_type( TermMatchType::EXACT_ONLY); - property_config->mutable_indexing_config()->set_tokenizer_type( - IndexingConfig::TokenizerType::PLAIN); + property_config->mutable_string_indexing_config()->set_tokenizer_type( + StringIndexingConfig::TokenizerType::PLAIN); ICING_ASSERT_OK_AND_ASSIGN( std::unique_ptr<SchemaStore> schema_store, diff --git a/icing/testing/common-matchers.h b/icing/testing/common-matchers.h index 7e14d0a..31d41fc 100644 --- a/icing/testing/common-matchers.h +++ b/icing/testing/common-matchers.h @@ -32,10 +32,11 @@ namespace lib { // Used to match Token(Token::Type type, std::string_view text) MATCHER_P2(EqualsToken, type, text, "") { + std::string arg_string(arg.text.data(), arg.text.length()); if (arg.type != type || arg.text != text) { *result_listener << IcingStringUtil::StringPrintf( "(Expected: type=%d, text=\"%s\". Actual: type=%d, text=\"%s\")", type, - &text[0], arg.type, arg.text.data()); + text, arg.type, arg_string.c_str()); return false; } return true; diff --git a/icing/testing/schema-generator.h b/icing/testing/schema-generator.h index e733612..863f43f 100644 --- a/icing/testing/schema-generator.h +++ b/icing/testing/schema-generator.h @@ -31,9 +31,11 @@ class ExactStringPropertyGenerator { prop.set_property_name(name.data(), name.length()); prop.set_data_type(PropertyConfigProto::DataType::STRING); prop.set_cardinality(PropertyConfigProto::Cardinality::OPTIONAL); - IndexingConfig* indexing_config = prop.mutable_indexing_config(); - indexing_config->set_term_match_type(TermMatchType::EXACT_ONLY); - indexing_config->set_tokenizer_type(IndexingConfig::TokenizerType::PLAIN); + StringIndexingConfig* string_indexing_config = + prop.mutable_string_indexing_config(); + string_indexing_config->set_term_match_type(TermMatchType::EXACT_ONLY); + string_indexing_config->set_tokenizer_type( + StringIndexingConfig::TokenizerType::PLAIN); return prop; } }; diff --git a/icing/tokenization/icu/icu-language-segmenter.cc b/icing/tokenization/icu/icu-language-segmenter.cc index d43a78d..74d22cd 100644 --- a/icing/tokenization/icu/icu-language-segmenter.cc +++ b/icing/tokenization/icu/icu-language-segmenter.cc @@ -169,7 +169,7 @@ class IcuLanguageSegmenterIterator : public LanguageSegmenter::Iterator { // Returns true on success bool Initialize() { UErrorCode status = U_ZERO_ERROR; - utext_openUTF8(&u_text_, text_.data(), /*length=*/-1, &status); + utext_openUTF8(&u_text_, text_.data(), text_.length(), &status); break_iterator_ = ubrk_open(UBRK_WORD, locale_.data(), /*text=*/nullptr, /*textLength=*/0, &status); ubrk_setUText(break_iterator_, &u_text_, &status); diff --git a/icing/tokenization/plain-tokenizer_test.cc b/icing/tokenization/plain-tokenizer_test.cc index d9db75a..df0981b 100644 --- a/icing/tokenization/plain-tokenizer_test.cc +++ b/icing/tokenization/plain-tokenizer_test.cc @@ -43,10 +43,10 @@ class PlainTokenizerTest : public ::testing::Test { }; TEST_F(PlainTokenizerTest, CreationWithNullPointerShouldFail) { - EXPECT_THAT( - tokenizer_factory::CreateIndexingTokenizer( - IndexingConfig::TokenizerType::PLAIN, /*lang_segmenter=*/nullptr), - StatusIs(libtextclassifier3::StatusCode::FAILED_PRECONDITION)); + EXPECT_THAT(tokenizer_factory::CreateIndexingTokenizer( + StringIndexingConfig::TokenizerType::PLAIN, + /*lang_segmenter=*/nullptr), + StatusIs(libtextclassifier3::StatusCode::FAILED_PRECONDITION)); } TEST_F(PlainTokenizerTest, Simple) { @@ -54,10 +54,10 @@ TEST_F(PlainTokenizerTest, Simple) { ICING_ASSERT_OK_AND_ASSIGN( auto language_segmenter, language_segmenter_factory::Create(std::move(options))); - ICING_ASSERT_OK_AND_ASSIGN( - std::unique_ptr<Tokenizer> plain_tokenizer, - tokenizer_factory::CreateIndexingTokenizer( - IndexingConfig::TokenizerType::PLAIN, language_segmenter.get())); + ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<Tokenizer> plain_tokenizer, + tokenizer_factory::CreateIndexingTokenizer( + StringIndexingConfig::TokenizerType::PLAIN, + language_segmenter.get())); EXPECT_THAT(plain_tokenizer->TokenizeAll(""), IsOkAndHolds(IsEmpty())); @@ -88,10 +88,10 @@ TEST_F(PlainTokenizerTest, Whitespace) { ICING_ASSERT_OK_AND_ASSIGN( auto language_segmenter, language_segmenter_factory::Create(std::move(options))); - ICING_ASSERT_OK_AND_ASSIGN( - std::unique_ptr<Tokenizer> plain_tokenizer, - tokenizer_factory::CreateIndexingTokenizer( - IndexingConfig::TokenizerType::PLAIN, language_segmenter.get())); + ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<Tokenizer> plain_tokenizer, + tokenizer_factory::CreateIndexingTokenizer( + StringIndexingConfig::TokenizerType::PLAIN, + language_segmenter.get())); // There're many unicode characters that are whitespaces, here we choose tabs // to represent others. @@ -116,10 +116,10 @@ TEST_F(PlainTokenizerTest, Punctuation) { ICING_ASSERT_OK_AND_ASSIGN( auto language_segmenter, language_segmenter_factory::Create(std::move(options))); - ICING_ASSERT_OK_AND_ASSIGN( - std::unique_ptr<Tokenizer> plain_tokenizer, - tokenizer_factory::CreateIndexingTokenizer( - IndexingConfig::TokenizerType::PLAIN, language_segmenter.get())); + ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<Tokenizer> plain_tokenizer, + tokenizer_factory::CreateIndexingTokenizer( + StringIndexingConfig::TokenizerType::PLAIN, + language_segmenter.get())); // Half-width punctuation marks are filtered out. EXPECT_THAT(plain_tokenizer->TokenizeAll( @@ -147,10 +147,10 @@ TEST_F(PlainTokenizerTest, SpecialCharacters) { ICING_ASSERT_OK_AND_ASSIGN( auto language_segmenter, language_segmenter_factory::Create(std::move(options))); - ICING_ASSERT_OK_AND_ASSIGN( - std::unique_ptr<Tokenizer> plain_tokenizer, - tokenizer_factory::CreateIndexingTokenizer( - IndexingConfig::TokenizerType::PLAIN, language_segmenter.get())); + ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<Tokenizer> plain_tokenizer, + tokenizer_factory::CreateIndexingTokenizer( + StringIndexingConfig::TokenizerType::PLAIN, + language_segmenter.get())); // Right now we don't have special logic for these characters, just output // them as tokens. @@ -170,10 +170,10 @@ TEST_F(PlainTokenizerTest, CJKT) { ICING_ASSERT_OK_AND_ASSIGN( auto language_segmenter, language_segmenter_factory::Create(std::move(options))); - ICING_ASSERT_OK_AND_ASSIGN( - std::unique_ptr<Tokenizer> plain_tokenizer, - tokenizer_factory::CreateIndexingTokenizer( - IndexingConfig::TokenizerType::PLAIN, language_segmenter.get())); + ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<Tokenizer> plain_tokenizer, + tokenizer_factory::CreateIndexingTokenizer( + StringIndexingConfig::TokenizerType::PLAIN, + language_segmenter.get())); // In plain tokenizer, CJKT characters are handled the same way as non-CJKT // characters, just add these tests as sanity checks. @@ -224,10 +224,10 @@ TEST_F(PlainTokenizerTest, ResetToTokenAfterSimple) { ICING_ASSERT_OK_AND_ASSIGN( auto language_segmenter, language_segmenter_factory::Create(std::move(options))); - ICING_ASSERT_OK_AND_ASSIGN( - std::unique_ptr<Tokenizer> plain_tokenizer, - tokenizer_factory::CreateIndexingTokenizer( - IndexingConfig::TokenizerType::PLAIN, language_segmenter.get())); + ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<Tokenizer> plain_tokenizer, + tokenizer_factory::CreateIndexingTokenizer( + StringIndexingConfig::TokenizerType::PLAIN, + language_segmenter.get())); constexpr std::string_view kText = "f b"; auto iterator = plain_tokenizer->Tokenize(kText).ValueOrDie(); @@ -243,10 +243,10 @@ TEST_F(PlainTokenizerTest, ResetToTokenBeforeSimple) { ICING_ASSERT_OK_AND_ASSIGN( auto language_segmenter, language_segmenter_factory::Create(std::move(options))); - ICING_ASSERT_OK_AND_ASSIGN( - std::unique_ptr<Tokenizer> plain_tokenizer, - tokenizer_factory::CreateIndexingTokenizer( - IndexingConfig::TokenizerType::PLAIN, language_segmenter.get())); + ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<Tokenizer> plain_tokenizer, + tokenizer_factory::CreateIndexingTokenizer( + StringIndexingConfig::TokenizerType::PLAIN, + language_segmenter.get())); constexpr std::string_view kText = "f b"; auto iterator = plain_tokenizer->Tokenize(kText).ValueOrDie(); @@ -262,10 +262,10 @@ TEST_F(PlainTokenizerTest, ResetToTokenAfter) { ICING_ASSERT_OK_AND_ASSIGN( auto language_segmenter, language_segmenter_factory::Create(std::move(options))); - ICING_ASSERT_OK_AND_ASSIGN( - std::unique_ptr<Tokenizer> plain_tokenizer, - tokenizer_factory::CreateIndexingTokenizer( - IndexingConfig::TokenizerType::PLAIN, language_segmenter.get())); + ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<Tokenizer> plain_tokenizer, + tokenizer_factory::CreateIndexingTokenizer( + StringIndexingConfig::TokenizerType::PLAIN, + language_segmenter.get())); constexpr std::string_view kText = " foo . bar baz.. bat "; EXPECT_THAT(plain_tokenizer->TokenizeAll(kText), @@ -312,10 +312,10 @@ TEST_F(PlainTokenizerTest, ResetToTokenBefore) { ICING_ASSERT_OK_AND_ASSIGN( auto language_segmenter, language_segmenter_factory::Create(std::move(options))); - ICING_ASSERT_OK_AND_ASSIGN( - std::unique_ptr<Tokenizer> plain_tokenizer, - tokenizer_factory::CreateIndexingTokenizer( - IndexingConfig::TokenizerType::PLAIN, language_segmenter.get())); + ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<Tokenizer> plain_tokenizer, + tokenizer_factory::CreateIndexingTokenizer( + StringIndexingConfig::TokenizerType::PLAIN, + language_segmenter.get())); constexpr std::string_view kText = " foo . bar baz.. bat "; EXPECT_THAT(plain_tokenizer->TokenizeAll(kText), diff --git a/icing/tokenization/raw-query-tokenizer.cc b/icing/tokenization/raw-query-tokenizer.cc index 8b2edc9..50b25c5 100644 --- a/icing/tokenization/raw-query-tokenizer.cc +++ b/icing/tokenization/raw-query-tokenizer.cc @@ -26,6 +26,9 @@ #include "icing/text_classifier/lib3/utils/base/status.h" #include "icing/text_classifier/lib3/utils/base/statusor.h" #include "icing/absl_ports/canonical_errors.h" +#include "icing/absl_ports/str_join.h" +#include "icing/schema/schema-util.h" +#include "icing/schema/section-manager.h" #include "icing/tokenization/language-segmenter.h" #include "icing/tokenization/token.h" #include "icing/tokenization/tokenizer.h" @@ -70,7 +73,7 @@ constexpr char kColon = ':'; constexpr char kLeftParentheses = '('; constexpr char kRightParentheses = ')'; constexpr char kExclusion = '-'; -constexpr char kOrOperator[] = "OR"; +constexpr std::string_view kOrOperator = "OR"; enum State { // Ready to process any terms @@ -100,10 +103,14 @@ enum State { // When seeing right parentheses CLOSING_PARENTHESES = 8, + PROCESSING_NON_ASCII_ALPHABETIC_TERM = 9, + + PROCESSING_PROPERTY_TERM_APPENDING = 10, + // Valid state count - STATE_COUNT = 9, + STATE_COUNT = 11, - INVALID = 10 + INVALID = 12 }; enum TermType { @@ -111,27 +118,29 @@ enum TermType { WHITESPACE = 0, // A term that consists of unicode alphabetic and numeric characters - ALPHANUMERIC_TERM = 1, + ASCII_ALPHANUMERIC_TERM = 1, + + NON_ASCII_ALPHABETIC_TERM = 2, // "(" - LEFT_PARENTHESES = 2, + LEFT_PARENTHESES = 3, // ")" - RIGHT_PARENTHESES = 3, + RIGHT_PARENTHESES = 4, // "-" - EXCLUSION_OPERATOR = 4, + EXCLUSION_OPERATOR = 5, // "OR" - OR_OPERATOR = 5, + OR_OPERATOR = 6, // ":" - COLON = 6, + COLON = 7, // All the other characters seen that are not the types above - OTHER = 7, + OTHER = 8, - TYPE_COUNT = 8 + TYPE_COUNT = 9 }; enum ActionOrError { @@ -145,6 +154,9 @@ enum ActionOrError { // Ignore / throw away the current term IGNORE = 2, + // Concatenate with next term + CONCATENATE = 3, + // Errors ERROR_UNKNOWN = 100, ERROR_NO_WHITESPACE_AROUND_OR = 101, @@ -154,6 +166,7 @@ enum ActionOrError { ERROR_EXCLUSION_PROPERTY_TOGETHER = 105, ERROR_EXCLUSION_OR_TOGETHER = 106, ERROR_PROPERTY_OR_TOGETHER = 107, + ERROR_NON_ASCII_AS_PROPERTY_NAME = 108, }; std::string_view GetErrorMessage(ActionOrError maybe_error) { @@ -175,6 +188,8 @@ std::string_view GetErrorMessage(ActionOrError maybe_error) { return "Exclusion and OR operators can't be used together"; case ERROR_PROPERTY_OR_TOGETHER: return "Property restriction and OR operators can't be used together"; + case ERROR_NON_ASCII_AS_PROPERTY_NAME: + return "Characters in property name must all be ASCII."; default: return ""; } @@ -186,7 +201,7 @@ std::string_view GetErrorMessage(ActionOrError maybe_error) { // States: // // READY = 0 -// PROCESSING_ALPHANUMERIC_TERM = 1 +// PROCESSING_ASCII_ALPHANUMERIC_TERM = 1 // PROCESSING_EXCLUSION = 2 // PROCESSING_EXCLUSION_TERM = 3 // PROCESSING_PROPERTY_RESTRICT = 4 @@ -194,24 +209,28 @@ std::string_view GetErrorMessage(ActionOrError maybe_error) { // PROCESSING_OR = 6 // OPENING_PARENTHESES = 7 // CLOSING_PARENTHESES = 8 +// PROCESSING_NON_ASCII_ALPHABETIC_TERM = 9 +// PROCESSING_PROPERTY_TERM_APPENDING = 10 // // Actions: // // OUTPUT = a // KEEP = b // IGNORE = c +// CONCAT = d, concatenate the current term and the new term. // -// ======================================================== -// Transition Table || 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | -// =========================================================================== -// WHITESPACE || 0,c | 0,a | 0,c | 0,a | 0,a | 0,a | 0,a | 0,a | 0,a | -// ALPHANUMERIC_TERM || 1,c | 1,a | 3,a | 1,a | 5,a | 1,a |ERROR| 1,a | 1,a | -// LEFT_PARENTHESES || 7,c | 7,a |ERROR| 7,a |ERROR| 7,a | 7,a | 7,a | 7,a | -// RIGHT_PARENTHESES || 8,c | 8,a | 8,c | 8,a | 8,a | 8,a | 8,c | 8,a | 8,a | -// EXCLUSION_OPERATOR || 2,c | 0,a | 2,c | 0,a |ERROR| 0,a |ERROR| 2,a | 2,a | -// OR_OPERATOR || 6,c |ERROR|ERROR|ERROR|ERROR|ERROR|ERROR| 7,b | 6,a | -// COLON || 0,c | 4,b |ERROR|ERROR| 4,b | 0,a |ERROR| 0,a |ERROR| -// OTHER || 0,c | 0,a | 0,c | 0,a | 0,a | 0,a | 0,a | 0,a | 0,a | +// ============================================================================= +// Transition || 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | +// ============================================================================= +// WHITESPACE || 0,c| 0,a| 0,c| 0,a| 0,a| 0,a| 0,a| 0,a| 0,a| 0,a| 0,a| +// ASCII_ALPHA || 1,c| 1,d| 3,a| 1,a| 5,a| 1,a|ERR | 1,a| 1,a| 1,a|10,d| +// NONASCII_ALPHA || 9,c| 9,a| 3,a| 9,a| 5,a| 9,a|ERR | 9,a| 9,a| 9,a|10,d| +// LEFT_PAREN || 7,c| 7,a|ERR | 7,a|ERR | 7,a| 7,a| 7,a| 7,a| 7,a| 7,a| +// RIGHT_PAREN || 8,c| 8,a| 8,c| 8,a| 8,a| 8,a| 8,c| 8,a| 8,a| 8,a| 8,a| +// EXCLUSION_OP || 2,c| 0,a| 2,c| 0,a|ERR | 0,a|ERR | 2,a| 2,a| 0,a| 0,a| +// OR_OPERATOR || 6,c|ERR |ERR |ERR |ERR |ERR |ERR | 7,b| 6,a|ERR |ERR | +// COLON || 0,c| 4,b|ERR |ERR | 4,b|10,d|ERR | 0,a|ERR |ERR |10,d| +// OTHER || 0,c| 0,a| 0,c| 0,a| 0,a| 0,a| 0,a| 0,a| 0,a| 0,a| 0,a| // // Each cell is a rule that consists of 4 things: // [current state] + [next term type] -> [new state] + [action] @@ -234,33 +253,46 @@ std::string_view GetErrorMessage(ActionOrError maybe_error) { // like "+", "&", "@", "#" in indexing and query tokenizers. constexpr State state_transition_rules[STATE_COUNT][TYPE_COUNT] = { /*State: Ready*/ - {READY, PROCESSING_ALPHANUMERIC_TERM, OPENING_PARENTHESES, - CLOSING_PARENTHESES, PROCESSING_EXCLUSION, PROCESSING_OR, READY, READY}, + {READY, PROCESSING_ALPHANUMERIC_TERM, PROCESSING_NON_ASCII_ALPHABETIC_TERM, + OPENING_PARENTHESES, CLOSING_PARENTHESES, PROCESSING_EXCLUSION, + PROCESSING_OR, READY, READY}, /*State: PROCESSING_ALPHANUMERIC_TERM*/ - {READY, PROCESSING_ALPHANUMERIC_TERM, OPENING_PARENTHESES, - CLOSING_PARENTHESES, READY, INVALID, PROCESSING_PROPERTY_RESTRICT, READY}, + {READY, PROCESSING_ALPHANUMERIC_TERM, PROCESSING_NON_ASCII_ALPHABETIC_TERM, + OPENING_PARENTHESES, CLOSING_PARENTHESES, READY, INVALID, + PROCESSING_PROPERTY_RESTRICT, READY}, /*State: PROCESSING_EXCLUSION*/ - {READY, PROCESSING_EXCLUSION_TERM, INVALID, CLOSING_PARENTHESES, - PROCESSING_EXCLUSION, INVALID, INVALID, READY}, + {READY, PROCESSING_EXCLUSION_TERM, PROCESSING_EXCLUSION_TERM, INVALID, + CLOSING_PARENTHESES, PROCESSING_EXCLUSION, INVALID, INVALID, READY}, /*State: PROCESSING_EXCLUSION_TERM*/ - {READY, PROCESSING_ALPHANUMERIC_TERM, OPENING_PARENTHESES, - CLOSING_PARENTHESES, READY, INVALID, INVALID, READY}, + {READY, PROCESSING_ALPHANUMERIC_TERM, PROCESSING_NON_ASCII_ALPHABETIC_TERM, + OPENING_PARENTHESES, CLOSING_PARENTHESES, READY, INVALID, INVALID, READY}, /*State: PROCESSING_PROPERTY_RESTRICT*/ - {READY, PROCESSING_PROPERTY_TERM, INVALID, CLOSING_PARENTHESES, INVALID, - INVALID, PROCESSING_PROPERTY_RESTRICT, READY}, + {READY, PROCESSING_PROPERTY_TERM, PROCESSING_PROPERTY_TERM, INVALID, + CLOSING_PARENTHESES, INVALID, INVALID, PROCESSING_PROPERTY_RESTRICT, + READY}, /*State: PROCESSING_PROPERTY_TERM*/ - {READY, PROCESSING_ALPHANUMERIC_TERM, OPENING_PARENTHESES, - CLOSING_PARENTHESES, READY, INVALID, READY, READY}, + {READY, PROCESSING_ALPHANUMERIC_TERM, PROCESSING_NON_ASCII_ALPHABETIC_TERM, + OPENING_PARENTHESES, CLOSING_PARENTHESES, READY, INVALID, + PROCESSING_PROPERTY_TERM_APPENDING, READY}, /*State: PROCESSING_OR*/ - {READY, INVALID, OPENING_PARENTHESES, CLOSING_PARENTHESES, INVALID, INVALID, - INVALID, READY}, + {READY, INVALID, INVALID, OPENING_PARENTHESES, CLOSING_PARENTHESES, INVALID, + INVALID, INVALID, READY}, /*State: OPENING_PARENTHESES*/ - {READY, PROCESSING_ALPHANUMERIC_TERM, OPENING_PARENTHESES, - CLOSING_PARENTHESES, PROCESSING_EXCLUSION, OPENING_PARENTHESES, READY, - READY}, + {READY, PROCESSING_ALPHANUMERIC_TERM, PROCESSING_NON_ASCII_ALPHABETIC_TERM, + OPENING_PARENTHESES, CLOSING_PARENTHESES, PROCESSING_EXCLUSION, + OPENING_PARENTHESES, READY, READY}, /*State: CLOSING_PARENTHESES*/ - {READY, PROCESSING_ALPHANUMERIC_TERM, OPENING_PARENTHESES, - CLOSING_PARENTHESES, PROCESSING_EXCLUSION, PROCESSING_OR, INVALID, READY}}; + {READY, PROCESSING_ALPHANUMERIC_TERM, PROCESSING_NON_ASCII_ALPHABETIC_TERM, + OPENING_PARENTHESES, CLOSING_PARENTHESES, PROCESSING_EXCLUSION, + PROCESSING_OR, INVALID, READY}, + /*State: PROCESSING_NON_ASCII_ALPHABETIC_TERM*/ + {READY, PROCESSING_ALPHANUMERIC_TERM, PROCESSING_NON_ASCII_ALPHABETIC_TERM, + OPENING_PARENTHESES, CLOSING_PARENTHESES, READY, INVALID, INVALID, READY}, + /*State: PROCESSING_PROPERTY_TERM_APPENDING*/ + {READY, PROCESSING_PROPERTY_TERM_APPENDING, + PROCESSING_PROPERTY_TERM_APPENDING, OPENING_PARENTHESES, + CLOSING_PARENTHESES, READY, INVALID, PROCESSING_PROPERTY_TERM_APPENDING, + READY}}; // We use a 2D array to encode the action rules, // The value of action_rules[state1][term_type1] means "what action we need to @@ -269,62 +301,121 @@ constexpr State state_transition_rules[STATE_COUNT][TYPE_COUNT] = { // NOTE: Please update the state transition table above if this is updated. constexpr ActionOrError action_rules[STATE_COUNT][TYPE_COUNT] = { /*State: Ready*/ - {IGNORE, IGNORE, IGNORE, IGNORE, IGNORE, IGNORE, IGNORE, IGNORE}, + {IGNORE, IGNORE, IGNORE, IGNORE, IGNORE, IGNORE, IGNORE, IGNORE, IGNORE}, /*State: PROCESSING_ALPHANUMERIC_TERM*/ - {OUTPUT, OUTPUT, OUTPUT, OUTPUT, OUTPUT, ERROR_NO_WHITESPACE_AROUND_OR, - KEEP, OUTPUT}, + {OUTPUT, OUTPUT, OUTPUT, OUTPUT, OUTPUT, OUTPUT, + ERROR_NO_WHITESPACE_AROUND_OR, KEEP, OUTPUT}, /*State: PROCESSING_EXCLUSION*/ - {IGNORE, OUTPUT, ERROR_GROUP_AFTER_EXCLUSION, IGNORE, IGNORE, + {IGNORE, OUTPUT, OUTPUT, ERROR_GROUP_AFTER_EXCLUSION, IGNORE, IGNORE, ERROR_EXCLUSION_OR_TOGETHER, ERROR_EXCLUSION_PROPERTY_TOGETHER, IGNORE}, /*State: PROCESSING_EXCLUSION_TERM*/ - {OUTPUT, OUTPUT, OUTPUT, OUTPUT, OUTPUT, ERROR_NO_WHITESPACE_AROUND_OR, - ERROR_EXCLUSION_PROPERTY_TOGETHER, OUTPUT}, + {OUTPUT, OUTPUT, OUTPUT, OUTPUT, OUTPUT, OUTPUT, + ERROR_NO_WHITESPACE_AROUND_OR, ERROR_EXCLUSION_PROPERTY_TOGETHER, OUTPUT}, /*State: PROCESSING_PROPERTY_RESTRICT*/ - {OUTPUT, OUTPUT, ERROR_GROUP_AFTER_PROPERTY_RESTRICTION, OUTPUT, + {OUTPUT, OUTPUT, OUTPUT, ERROR_GROUP_AFTER_PROPERTY_RESTRICTION, OUTPUT, ERROR_EXCLUSION_PROPERTY_TOGETHER, ERROR_PROPERTY_OR_TOGETHER, KEEP, OUTPUT}, /*State: PROCESSING_PROPERTY_TERM*/ - {OUTPUT, OUTPUT, OUTPUT, OUTPUT, OUTPUT, ERROR_NO_WHITESPACE_AROUND_OR, - OUTPUT, OUTPUT}, + {OUTPUT, OUTPUT, OUTPUT, OUTPUT, OUTPUT, OUTPUT, + ERROR_NO_WHITESPACE_AROUND_OR, CONCATENATE, OUTPUT}, /*State: PROCESSING_OR*/ - {OUTPUT, ERROR_NO_WHITESPACE_AROUND_OR, OUTPUT, IGNORE, - ERROR_NO_WHITESPACE_AROUND_OR, ERROR_NO_WHITESPACE_AROUND_OR, - ERROR_NO_WHITESPACE_AROUND_OR, OUTPUT}, + {OUTPUT, ERROR_NO_WHITESPACE_AROUND_OR, ERROR_NO_WHITESPACE_AROUND_OR, + OUTPUT, IGNORE, ERROR_NO_WHITESPACE_AROUND_OR, + ERROR_NO_WHITESPACE_AROUND_OR, ERROR_NO_WHITESPACE_AROUND_OR, OUTPUT}, /*State: OPENING_PARENTHESES*/ - {OUTPUT, OUTPUT, OUTPUT, OUTPUT, OUTPUT, KEEP, OUTPUT, OUTPUT}, + {OUTPUT, OUTPUT, OUTPUT, OUTPUT, OUTPUT, OUTPUT, KEEP, OUTPUT, OUTPUT}, /*State: CLOSING_PARENTHESES*/ + {OUTPUT, OUTPUT, OUTPUT, OUTPUT, OUTPUT, OUTPUT, OUTPUT, + ERROR_GROUP_AS_PROPERTY_NAME, OUTPUT}, + /*State: PROCESSING_NON_ASCII_ALPHABETIC_TERM*/ {OUTPUT, OUTPUT, OUTPUT, OUTPUT, OUTPUT, OUTPUT, - ERROR_GROUP_AS_PROPERTY_NAME, OUTPUT}}; - -// Helper function to get the TermType of the input term. -TermType GetTermType(std::string_view term) { - if (term.length() == 1) { - // Must be an ASCII char - const char& first_term_char = term[0]; - if (first_term_char == kWhitespace) { - return WHITESPACE; - } else if (first_term_char == kColon) { - return COLON; - } else if (first_term_char == kLeftParentheses) { - return LEFT_PARENTHESES; - } else if (first_term_char == kRightParentheses) { - return RIGHT_PARENTHESES; - } else if (first_term_char == kExclusion) { - return EXCLUSION_OPERATOR; - } - } else if (term.length() == 2 && term == kOrOperator) { - return OR_OPERATOR; + ERROR_NO_WHITESPACE_AROUND_OR, ERROR_NON_ASCII_AS_PROPERTY_NAME, OUTPUT}, + /*State: PROCESSING_PROPERTY_TERM_APPENDING*/ + {OUTPUT, CONCATENATE, CONCATENATE, OUTPUT, OUTPUT, OUTPUT, + ERROR_NO_WHITESPACE_AROUND_OR, CONCATENATE, OUTPUT}}; + +// Determines the length of the whitespace term beginning at text[pos] and +// returns a pair with the WHITESPACE TermType and a string_view of the +// whitespace term. +std::pair<TermType, std::string_view> GetWhitespaceTerm(std::string_view text, + size_t pos) { + size_t cur = pos; + while (cur < text.length() && text[cur] == kWhitespace) { + ++cur; } + return std::make_pair(WHITESPACE, text.substr(pos, cur - pos)); +} + +// Determines the length of the potential content term beginning at text[pos] +// and returns a pair with the appropriate TermType and a string_view of the +// content term. +// +// NOTE: The potential content term could multiple content terms (segmentation +// is needed to determine this), a property restrict (depending on other +// neighboring tokens). It could also be multiple content terms surrounding an +// OR operator (segmentation is also needed to determine this). +std::pair<TermType, std::string_view> GetContentTerm(std::string_view text, + size_t pos) { + size_t len = 0; // Checks the first char to see if it's an ASCII term - if (i18n_utils::IsAscii(term[0])) { - if (std::isalnum(term[0])) { - return ALPHANUMERIC_TERM; + TermType type = ASCII_ALPHANUMERIC_TERM; + if (!i18n_utils::IsAscii(text[pos])) { + type = NON_ASCII_ALPHABETIC_TERM; + } else if (std::isalnum(text[pos])) { + type = OTHER; + } + for (size_t cur = pos; cur < text.length() && len == 0; ++cur) { + switch (text[cur]) { + case kLeftParentheses: + [[fallthrough]]; + case kRightParentheses: + [[fallthrough]]; + case kExclusion: + [[fallthrough]]; + case kWhitespace: + [[fallthrough]]; + case kColon: + // If we reach any of our special characters (colon, exclusion or + // parentheses), then we've reached the end of the content term. Set len + // and exit the loop. + len = cur - pos; + break; + default: + break; } - return OTHER; } - // All non-ASCII terms are alphabetic since language segmenter already - // filters out non-ASCII and non-alphabetic terms - return ALPHANUMERIC_TERM; + if (len == 0) { + // If len isn't set, then we must have reached the end of the string. + len = text.length() - pos; + } + return std::make_pair(type, text.substr(pos, len)); +} + +// Determines the type and length of the term beginning at text[pos]. +std::pair<TermType, std::string_view> GetTerm(std::string_view text, + size_t pos) { + switch (text[pos]) { + case kLeftParentheses: + return std::make_pair(LEFT_PARENTHESES, text.substr(pos, 1)); + case kRightParentheses: + return std::make_pair(RIGHT_PARENTHESES, text.substr(pos, 1)); + case kExclusion: + return std::make_pair(EXCLUSION_OPERATOR, text.substr(pos, 1)); + case kWhitespace: + // Get length of whitespace + return GetWhitespaceTerm(text, pos); + case kColon: + return std::make_pair(COLON, text.substr(pos, 1)); + case kOrOperator[0]: + if (text.length() >= pos + kOrOperator.length() && + text.substr(pos, kOrOperator.length()) == kOrOperator) { + return std::make_pair(OR_OPERATOR, + text.substr(pos, kOrOperator.length())); + } + [[fallthrough]]; + default: + return GetContentTerm(text, pos); + } } // Helper function to remove the last token if it's OR operator. This is used to @@ -378,12 +469,18 @@ libtextclassifier3::Status OutputToken(State new_state, TermType current_term_type, std::vector<Token>* tokens) { switch (current_term_type) { - case ALPHANUMERIC_TERM: + case ASCII_ALPHANUMERIC_TERM: + [[fallthrough]]; + case NON_ASCII_ALPHABETIC_TERM: if (new_state == PROCESSING_PROPERTY_TERM) { - // Asserts extra rule 1: property name must be in ASCII - if (!i18n_utils::IsAscii(current_term[0])) { - return absl_ports::InvalidArgumentError( - "Characters in property name must all be ASCII."); + // Asserts extra rule 1: each property name in the property path is a + // valid term. + for (std::string_view property : + absl_ports::StrSplit(current_term, kPropertySeparator)) { + if (!SchemaUtil::ValidatePropertyName(property).ok()) { + return absl_ports::InvalidArgumentError( + GetErrorMessage(ERROR_NON_ASCII_AS_PROPERTY_NAME)); + } } tokens->emplace_back(Token::QUERY_PROPERTY, current_term); } else { @@ -416,13 +513,11 @@ libtextclassifier3::Status OutputToken(State new_state, // Returns: // OK on success // INVALID_ARGUMENT with error message on invalid query syntax -libtextclassifier3::Status ProcessTerm(State* current_state, - std::string_view* current_term, - TermType* current_term_type, - int* unclosed_parentheses_count, - const std::string_view next_term, - TermType next_term_type, - std::vector<Token>* tokens) { +libtextclassifier3::Status ProcessTerm( + State* current_state, std::string_view* current_term, + TermType* current_term_type, int* unclosed_parentheses_count, + const std::string_view next_term, TermType next_term_type, + const LanguageSegmenter* language_segmenter, std::vector<Token>* tokens) { // Asserts extra rule 4: parentheses must appear in pairs. if (next_term_type == LEFT_PARENTHESES) { ++(*unclosed_parentheses_count); @@ -440,8 +535,25 @@ libtextclassifier3::Status ProcessTerm(State* current_state, } switch (action_or_error) { case OUTPUT: - ICING_RETURN_IF_ERROR( - OutputToken(new_state, *current_term, *current_term_type, tokens)); + if (*current_state == PROCESSING_PROPERTY_TERM_APPENDING) { + // We appended multiple terms together in case they actually should have + // been connected by a colon connector. + ICING_ASSIGN_OR_RETURN(std::vector<std::string_view> content_terms, + language_segmenter->GetAllTerms(*current_term)); + for (std::string_view term : content_terms) { + TermType type = ASCII_ALPHANUMERIC_TERM; + if (!i18n_utils::IsAscii(term[0])) { + type = NON_ASCII_ALPHABETIC_TERM; + } else if (!std::isalnum(term[0])) { + // Skip OTHER tokens here. + continue; + } + ICING_RETURN_IF_ERROR(OutputToken(new_state, term, type, tokens)); + } + } else { + ICING_RETURN_IF_ERROR( + OutputToken(new_state, *current_term, *current_term_type, tokens)); + } [[fallthrough]]; case IGNORE: *current_term = next_term; @@ -449,6 +561,11 @@ libtextclassifier3::Status ProcessTerm(State* current_state, break; case KEEP: break; + case CONCATENATE: + *current_term = std::string_view( + current_term->data(), + next_term.data() - current_term->data() + next_term.length()); + break; default: return absl_ports::InvalidArgumentError(GetErrorMessage(ERROR_UNKNOWN)); } @@ -463,56 +580,55 @@ libtextclassifier3::Status ProcessTerm(State* current_state, // A list of tokens on success // INVALID_ARGUMENT with error message on invalid query syntax libtextclassifier3::StatusOr<std::vector<Token>> ProcessTerms( - std::unique_ptr<LanguageSegmenter::Iterator> base_iterator) { + const LanguageSegmenter* language_segmenter, + std::vector<std::pair<TermType, std::string_view>> prescanned_terms) { std::vector<Token> tokens; State current_state = READY; std::string_view current_term; TermType current_term_type; int unclosed_parentheses_count = 0; - while (base_iterator->Advance()) { - const std::string_view next_term = base_iterator->GetTerm(); - size_t colon_position = next_term.find(kColon); - // Since colon ":" is a word connector per ICU's rule - // (https://unicode.org/reports/tr29/#Word_Boundaries), strings like - // "foo:bar" are returned by LanguageSegmenter as one term. Here we're - // trying to find the first colon as it represents property restriction in - // raw query. - if (colon_position == std::string_view::npos) { - // No colon found - ICING_RETURN_IF_ERROR(ProcessTerm(¤t_state, ¤t_term, - ¤t_term_type, - &unclosed_parentheses_count, next_term, - GetTermType(next_term), &tokens)); - } else if (next_term.size() == 1 && next_term[0] == kColon) { - // The whole term is a colon + for (int i = 0; i < prescanned_terms.size(); ++i) { + const std::pair<TermType, std::string_view>& prescanned_term = + prescanned_terms.at(i); + if (prescanned_term.first != ASCII_ALPHANUMERIC_TERM && + prescanned_term.first != NON_ASCII_ALPHABETIC_TERM && + prescanned_term.first != OTHER) { + // This can't be a property restrict. Just pass it in. ICING_RETURN_IF_ERROR( ProcessTerm(¤t_state, ¤t_term, ¤t_term_type, - &unclosed_parentheses_count, next_term, COLON, &tokens)); + &unclosed_parentheses_count, prescanned_term.second, + prescanned_term.first, language_segmenter, &tokens)); } else { - // String before the colon is the property name - std::string_view property_name = next_term.substr(0, colon_position); - ICING_RETURN_IF_ERROR( - ProcessTerm(¤t_state, ¤t_term, ¤t_term_type, - &unclosed_parentheses_count, property_name, - GetTermType(property_name), &tokens)); - ICING_RETURN_IF_ERROR( - ProcessTerm(¤t_state, ¤t_term, ¤t_term_type, - &unclosed_parentheses_count, std::string_view(&kColon, 1), - COLON, &tokens)); - // String after the colon is the term that property restriction is applied - // on. - std::string_view property_term = next_term.substr(colon_position + 1); - ICING_RETURN_IF_ERROR( - ProcessTerm(¤t_state, ¤t_term, ¤t_term_type, - &unclosed_parentheses_count, property_term, - GetTermType(property_term), &tokens)); + // There's no colon after this term. Now, we need to segment this. + ICING_ASSIGN_OR_RETURN( + std::vector<std::string_view> content_terms, + language_segmenter->GetAllTerms(prescanned_term.second)); + for (std::string_view term : content_terms) { + TermType type = ASCII_ALPHANUMERIC_TERM; + if (term == kOrOperator) { + // TODO(tjbarron) Decide whether we should revise this and other + // handled syntax. This is used to allow queries like "term1,OR,term2" + // to succeed. It's not clear if we should allow this or require + // clients to ensure that OR operators are always surrounded by + // whitespace. + type = OR_OPERATOR; + } else if (!i18n_utils::IsAscii(term[0])) { + type = NON_ASCII_ALPHABETIC_TERM; + } else if (!std::isalnum(term[0])) { + type = OTHER; + } + ICING_RETURN_IF_ERROR(ProcessTerm(¤t_state, ¤t_term, + ¤t_term_type, + &unclosed_parentheses_count, term, + type, language_segmenter, &tokens)); + } } } // Adds a fake whitespace at the end to flush the last term. - ICING_RETURN_IF_ERROR( - ProcessTerm(¤t_state, ¤t_term, ¤t_term_type, - &unclosed_parentheses_count, - std::string_view(&kWhitespace, 1), WHITESPACE, &tokens)); + ICING_RETURN_IF_ERROR(ProcessTerm( + ¤t_state, ¤t_term, ¤t_term_type, + &unclosed_parentheses_count, std::string_view(&kWhitespace, 1), + WHITESPACE, language_segmenter, &tokens)); if (unclosed_parentheses_count > 0) { return absl_ports::InvalidArgumentError("Unclosed left parentheses."); } @@ -553,10 +669,16 @@ RawQueryTokenizer::Tokenize(std::string_view text) const { libtextclassifier3::StatusOr<std::vector<Token>> RawQueryTokenizer::TokenizeAll( std::string_view text) const { - ICING_ASSIGN_OR_RETURN( - std::unique_ptr<LanguageSegmenter::Iterator> base_iterator, - language_segmenter_.Segment(text)); - return ProcessTerms(std::move(base_iterator)); + // 1. Prescan all terms in the text, to determine which ones are potentially + // content and which ones are not. + std::vector<std::pair<TermType, std::string_view>> prescanned_terms; + for (size_t pos = 0; pos < text.length();) { + std::pair<TermType, std::string_view> term_pair = GetTerm(text, pos); + pos += term_pair.second.length(); + prescanned_terms.push_back(term_pair); + } + // 2. Process the prescanned terms, segmenting content terms as needed. + return ProcessTerms(&language_segmenter_, std::move(prescanned_terms)); } } // namespace lib diff --git a/icing/tokenization/raw-query-tokenizer_test.cc b/icing/tokenization/raw-query-tokenizer_test.cc index 9b71e8a..d4af9ed 100644 --- a/icing/tokenization/raw-query-tokenizer_test.cc +++ b/icing/tokenization/raw-query-tokenizer_test.cc @@ -59,6 +59,10 @@ TEST_F(RawQueryTokenizerTest, Simple) { EXPECT_THAT(raw_query_tokenizer->TokenizeAll("Hello World!"), IsOkAndHolds(ElementsAre(EqualsToken(Token::REGULAR, "Hello"), EqualsToken(Token::REGULAR, "World")))); + + EXPECT_THAT(raw_query_tokenizer->TokenizeAll("hElLo WORLD"), + IsOkAndHolds(ElementsAre(EqualsToken(Token::REGULAR, "hElLo"), + EqualsToken(Token::REGULAR, "WORLD")))); } TEST_F(RawQueryTokenizerTest, Parentheses) { @@ -293,6 +297,12 @@ TEST_F(RawQueryTokenizerTest, PropertyRestriction) { EqualsToken(Token::REGULAR, "term2")))); EXPECT_THAT( + raw_query_tokenizer->TokenizeAll("property1:今天:天气"), + IsOkAndHolds(ElementsAre(EqualsToken(Token::QUERY_PROPERTY, "property1"), + EqualsToken(Token::REGULAR, "今天"), + EqualsToken(Token::REGULAR, "天气")))); + + EXPECT_THAT( raw_query_tokenizer->TokenizeAll("property1:term1-"), IsOkAndHolds(ElementsAre(EqualsToken(Token::QUERY_PROPERTY, "property1"), EqualsToken(Token::REGULAR, "term1")))); diff --git a/icing/tokenization/tokenizer-factory.cc b/icing/tokenization/tokenizer-factory.cc index 9ebbce5..9b59acf 100644 --- a/icing/tokenization/tokenizer-factory.cc +++ b/icing/tokenization/tokenizer-factory.cc @@ -31,14 +31,14 @@ namespace lib { namespace tokenizer_factory { libtextclassifier3::StatusOr<std::unique_ptr<Tokenizer>> -CreateIndexingTokenizer(IndexingConfig::TokenizerType::Code type, +CreateIndexingTokenizer(StringIndexingConfig::TokenizerType::Code type, const LanguageSegmenter* lang_segmenter) { ICING_RETURN_ERROR_IF_NULL(lang_segmenter); switch (type) { - case IndexingConfig::TokenizerType::PLAIN: + case StringIndexingConfig::TokenizerType::PLAIN: return std::make_unique<PlainTokenizer>(lang_segmenter); - case IndexingConfig::TokenizerType::NONE: + case StringIndexingConfig::TokenizerType::NONE: [[fallthrough]]; default: // This should never happen. diff --git a/icing/tokenization/tokenizer-factory.h b/icing/tokenization/tokenizer-factory.h index f81fd96..8b9226d 100644 --- a/icing/tokenization/tokenizer-factory.h +++ b/icing/tokenization/tokenizer-factory.h @@ -37,7 +37,7 @@ namespace tokenizer_factory { // FAILED_PRECONDITION on any null pointer input // INVALID_ARGUMENT if tokenizer type is invalid libtextclassifier3::StatusOr<std::unique_ptr<Tokenizer>> -CreateIndexingTokenizer(IndexingConfig::TokenizerType::Code type, +CreateIndexingTokenizer(StringIndexingConfig::TokenizerType::Code type, const LanguageSegmenter* lang_segmenter); // All the supported query tokenizer types |