aboutsummaryrefslogtreecommitdiff
path: root/icing
diff options
context:
space:
mode:
authorAlexander Dorokhine <adorokhine@google.com>2020-11-05 15:28:28 -0800
committerAlexander Dorokhine <adorokhine@google.com>2020-11-05 15:29:03 -0800
commite111bb917a63282b8e11012acb8f97cba882f342 (patch)
tree1c59a59fa2c918d9a3f512a0e4b7d4934f6bb07b /icing
parent71b8eddc99c6337ff304a2f3cd0588c42239202f (diff)
downloadicing-e111bb917a63282b8e11012acb8f97cba882f342.tar.gz
Update icing from upstream.
Change-Id: Ia63a77142ec717c0d9a81ec0a5c1267381858200
Diffstat (limited to 'icing')
-rw-r--r--icing/absl_ports/str_join.cc41
-rw-r--r--icing/absl_ports/str_join.h4
-rw-r--r--icing/file/file-backed-proto-log.h34
-rw-r--r--icing/file/file-backed-proto-log_test.cc42
-rw-r--r--icing/icing-search-engine-with-icu-file_test.cc7
-rw-r--r--icing/icing-search-engine.cc163
-rw-r--r--icing/icing-search-engine.h22
-rw-r--r--icing/icing-search-engine_fuzz_test.cc7
-rw-r--r--icing/icing-search-engine_test.cc798
-rw-r--r--icing/index/index-processor.cc27
-rw-r--r--icing/index/index-processor.h8
-rw-r--r--icing/index/index-processor_benchmark.cc6
-rw-r--r--icing/index/index-processor_test.cc8
-rw-r--r--icing/index/iterator/doc-hit-info-iterator-section-restrict_test.cc6
-rw-r--r--icing/index/main/doc-hit-info-iterator-term-main.cc8
-rw-r--r--icing/query/query-processor_test.cc6
-rw-r--r--icing/result/result-retriever_test.cc12
-rw-r--r--icing/result/snippet-retriever_test.cc12
-rw-r--r--icing/schema/schema-store.cc21
-rw-r--r--icing/schema/schema-store.h10
-rw-r--r--icing/schema/schema-store_test.cc12
-rw-r--r--icing/schema/schema-util.cc48
-rw-r--r--icing/schema/schema-util.h32
-rw-r--r--icing/schema/schema-util_test.cc201
-rw-r--r--icing/schema/section-manager.cc56
-rw-r--r--icing/schema/section-manager_test.cc306
-rw-r--r--icing/schema/section.h8
-rw-r--r--icing/store/document-store.cc60
-rw-r--r--icing/store/document-store.h33
-rw-r--r--icing/store/document-store_test.cc24
-rw-r--r--icing/testing/common-matchers.h3
-rw-r--r--icing/testing/schema-generator.h8
-rw-r--r--icing/tokenization/icu/icu-language-segmenter.cc2
-rw-r--r--icing/tokenization/plain-tokenizer_test.cc80
-rw-r--r--icing/tokenization/raw-query-tokenizer.cc400
-rw-r--r--icing/tokenization/raw-query-tokenizer_test.cc10
-rw-r--r--icing/tokenization/tokenizer-factory.cc6
-rw-r--r--icing/tokenization/tokenizer-factory.h2
38 files changed, 1853 insertions, 680 deletions
diff --git a/icing/absl_ports/str_join.cc b/icing/absl_ports/str_join.cc
new file mode 100644
index 0000000..2d105ca
--- /dev/null
+++ b/icing/absl_ports/str_join.cc
@@ -0,0 +1,41 @@
+// Copyright (C) 2019 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "icing/absl_ports/str_join.h"
+
+namespace icing {
+namespace lib {
+namespace absl_ports {
+
+std::vector<std::string_view> StrSplit(std::string_view text,
+ std::string_view sep) {
+ std::vector<std::string_view> substrings;
+ size_t separator_position = text.find(sep);
+ size_t current_start = 0;
+ size_t current_end = separator_position;
+ while (separator_position != std::string_view::npos) {
+ substrings.push_back(
+ text.substr(current_start, current_end - current_start));
+ current_start = current_end + sep.length();
+ separator_position = text.find(sep, current_start);
+ current_end = separator_position;
+ }
+ current_end = text.length();
+ substrings.push_back(text.substr(current_start, current_end - current_start));
+ return substrings;
+}
+
+} // namespace absl_ports
+} // namespace lib
+} // namespace icing
diff --git a/icing/absl_ports/str_join.h b/icing/absl_ports/str_join.h
index 7c8936a..f66a977 100644
--- a/icing/absl_ports/str_join.h
+++ b/icing/absl_ports/str_join.h
@@ -17,6 +17,7 @@
#include <string>
#include <string_view>
+#include <vector>
#include "icing/absl_ports/str_cat.h"
@@ -104,6 +105,9 @@ std::string StrJoin(const Container& container, std::string_view sep) {
return absl_ports::StrJoin(container, sep, DefaultFormatter());
}
+std::vector<std::string_view> StrSplit(std::string_view text,
+ std::string_view sep);
+
} // namespace absl_ports
} // namespace lib
} // namespace icing
diff --git a/icing/file/file-backed-proto-log.h b/icing/file/file-backed-proto-log.h
index 95511ac..aa5a031 100644
--- a/icing/file/file-backed-proto-log.h
+++ b/icing/file/file-backed-proto-log.h
@@ -168,11 +168,15 @@ class FileBackedProtoLog {
// A successfully initialized log.
std::unique_ptr<FileBackedProtoLog<ProtoT>> proto_log;
- // Whether there was some data loss while initializing from a previous
- // state. This can happen if the file is corrupted or some previously added
- // data was unpersisted. This may be used to signal that any derived data
- // off of the proto log may need to be regenerated.
- bool data_loss;
+ // The data status after initializing from a previous state. Data loss can
+ // happen if the file is corrupted or some previously added data was
+ // unpersisted. This may be used to signal that any derived data off of the
+ // proto log may need to be regenerated.
+ enum DataStatus { NO_DATA_LOSS, PARTIAL_LOSS, COMPLETE_LOSS } data_status;
+
+ bool has_data_loss() {
+ return data_status == PARTIAL_LOSS || data_status == COMPLETE_LOSS;
+ }
};
// Factory method to create, initialize, and return a FileBackedProtoLog. Will
@@ -182,10 +186,11 @@ class FileBackedProtoLog {
// added data was unpersisted, the log will rewind to the last-good state. The
// log saves these checkpointed "good" states when PersistToDisk() is called
// or the log is safely destructed. If the log rewinds successfully to the
- // last-good state, then the returned CreateResult.data_loss indicates
- // there was some data loss so that any derived data may know that it
- // needs to be updated. If the log re-initializes successfully without any
- // data loss, the boolean will be false.
+ // last-good state, then the returned CreateResult.data_status indicates
+ // whether it has a data loss and what kind of data loss it is (partial or
+ // complete) so that any derived data may know that it needs to be updated. If
+ // the log re-initializes successfully without any data loss,
+ // CreateResult.data_status will be NO_DATA_LOSS.
//
// Params:
// filesystem: Handles system level calls
@@ -511,7 +516,7 @@ FileBackedProtoLog<ProtoT>::InitializeNewFile(const Filesystem* filesystem,
std::unique_ptr<FileBackedProtoLog<ProtoT>>(
new FileBackedProtoLog<ProtoT>(filesystem, file_path,
std::move(header))),
- /*data_loss=*/false};
+ /*data_status=*/CreateResult::NO_DATA_LOSS};
return create_result;
}
@@ -561,15 +566,14 @@ FileBackedProtoLog<ProtoT>::InitializeExistingFile(const Filesystem* filesystem,
}
header->max_proto_size = options.max_proto_size;
- bool data_loss = false;
+ typename CreateResult::DataStatus data_status = CreateResult::NO_DATA_LOSS;
ICING_ASSIGN_OR_RETURN(Crc32 calculated_log_checksum,
ComputeChecksum(filesystem, file_path, Crc32(),
sizeof(Header), file_size));
// Double check that the log checksum is the same as the one that was
// persisted last time. If not, we start recovery logic.
if (header->log_checksum != calculated_log_checksum.Get()) {
- // Need to rewind the proto log since the checksums don't match
- data_loss = true;
+ // Need to rewind the proto log since the checksums don't match.
// Worst case, we have to rewind the entire log back to just the header
int64_t last_known_good = sizeof(Header);
@@ -585,10 +589,12 @@ FileBackedProtoLog<ProtoT>::InitializeExistingFile(const Filesystem* filesystem,
// Check if it matches our last rewind state. If so, this becomes our last
// good state and we can safely truncate and recover from here.
last_known_good = header->rewind_offset;
+ data_status = CreateResult::PARTIAL_LOSS;
} else {
// Otherwise, we're going to truncate the entire log and this resets the
// checksum to an empty log state.
header->log_checksum = 0;
+ data_status = CreateResult::COMPLETE_LOSS;
}
if (!filesystem->Truncate(file_path.c_str(), last_known_good)) {
@@ -604,7 +610,7 @@ FileBackedProtoLog<ProtoT>::InitializeExistingFile(const Filesystem* filesystem,
std::unique_ptr<FileBackedProtoLog<ProtoT>>(
new FileBackedProtoLog<ProtoT>(filesystem, file_path,
std::move(header))),
- data_loss};
+ data_status};
return create_result;
}
diff --git a/icing/file/file-backed-proto-log_test.cc b/icing/file/file-backed-proto-log_test.cc
index fad5248..7410d2b 100644
--- a/icing/file/file-backed-proto-log_test.cc
+++ b/icing/file/file-backed-proto-log_test.cc
@@ -77,7 +77,7 @@ TEST_F(FileBackedProtoLogTest, Initialize) {
FileBackedProtoLog<DocumentProto>::Options(compress_,
max_proto_size_)));
EXPECT_THAT(create_result.proto_log, NotNull());
- EXPECT_FALSE(create_result.data_loss);
+ EXPECT_FALSE(create_result.has_data_loss());
// Can't recreate the same file with different options.
ASSERT_THAT(FileBackedProtoLog<DocumentProto>::Create(
@@ -96,7 +96,7 @@ TEST_F(FileBackedProtoLogTest, WriteProtoTooLarge) {
FileBackedProtoLog<DocumentProto>::Options(compress_,
max_proto_size)));
auto proto_log = std::move(create_result.proto_log);
- ASSERT_FALSE(create_result.data_loss);
+ ASSERT_FALSE(create_result.has_data_loss());
DocumentProto document = DocumentBuilder().SetKey("namespace", "uri").Build();
@@ -113,7 +113,7 @@ TEST_F(FileBackedProtoLogTest, ReadProtoWrongKProtoMagic) {
FileBackedProtoLog<DocumentProto>::Options(compress_,
max_proto_size_)));
auto proto_log = std::move(create_result.proto_log);
- ASSERT_FALSE(create_result.data_loss);
+ ASSERT_FALSE(create_result.has_data_loss());
// Write a proto
DocumentProto document = DocumentBuilder().SetKey("namespace", "uri").Build();
@@ -147,7 +147,7 @@ TEST_F(FileBackedProtoLogTest, ReadWriteUncompressedProto) {
FileBackedProtoLog<DocumentProto>::Options(
/*compress_in=*/false, max_proto_size_)));
auto proto_log = std::move(create_result.proto_log);
- ASSERT_FALSE(create_result.data_loss);
+ ASSERT_FALSE(create_result.has_data_loss());
// Write the first proto
DocumentProto document1 =
@@ -194,7 +194,7 @@ TEST_F(FileBackedProtoLogTest, ReadWriteUncompressedProto) {
FileBackedProtoLog<DocumentProto>::Options(
/*compress_in=*/false, max_proto_size_)));
auto recreated_proto_log = std::move(create_result.proto_log);
- ASSERT_FALSE(create_result.data_loss);
+ ASSERT_FALSE(create_result.has_data_loss());
// Write a third proto
DocumentProto document3 =
@@ -216,7 +216,7 @@ TEST_F(FileBackedProtoLogTest, ReadWriteCompressedProto) {
FileBackedProtoLog<DocumentProto>::Options(
/*compress_in=*/true, max_proto_size_)));
auto proto_log = std::move(create_result.proto_log);
- ASSERT_FALSE(create_result.data_loss);
+ ASSERT_FALSE(create_result.has_data_loss());
// Write the first proto
DocumentProto document1 =
@@ -263,7 +263,7 @@ TEST_F(FileBackedProtoLogTest, ReadWriteCompressedProto) {
FileBackedProtoLog<DocumentProto>::Options(
/*compress_in=*/true, max_proto_size_)));
auto recreated_proto_log = std::move(create_result.proto_log);
- ASSERT_FALSE(create_result.data_loss);
+ ASSERT_FALSE(create_result.has_data_loss());
// Write a third proto
DocumentProto document3 =
@@ -283,7 +283,7 @@ TEST_F(FileBackedProtoLogTest, CorruptHeader) {
FileBackedProtoLog<DocumentProto>::Options(compress_,
max_proto_size_)));
auto recreated_proto_log = std::move(create_result.proto_log);
- EXPECT_FALSE(create_result.data_loss);
+ EXPECT_FALSE(create_result.has_data_loss());
int corrupt_offset =
offsetof(FileBackedProtoLog<DocumentProto>::Header, rewind_offset);
@@ -312,7 +312,7 @@ TEST_F(FileBackedProtoLogTest, CorruptContent) {
FileBackedProtoLog<DocumentProto>::Options(compress_,
max_proto_size_)));
auto proto_log = std::move(create_result.proto_log);
- EXPECT_FALSE(create_result.data_loss);
+ EXPECT_FALSE(create_result.has_data_loss());
DocumentProto document =
DocumentBuilder().SetKey("namespace1", "uri1").Build();
@@ -338,7 +338,7 @@ TEST_F(FileBackedProtoLogTest, CorruptContent) {
FileBackedProtoLog<DocumentProto>::Options(compress_,
max_proto_size_)));
auto proto_log = std::move(create_result.proto_log);
- ASSERT_TRUE(create_result.data_loss);
+ ASSERT_TRUE(create_result.has_data_loss());
// Lost everything in the log since the rewind position doesn't help if
// there's been data corruption within the persisted region
@@ -363,7 +363,7 @@ TEST_F(FileBackedProtoLogTest, PersistToDisk) {
FileBackedProtoLog<DocumentProto>::Options(compress_,
max_proto_size_)));
auto proto_log = std::move(create_result.proto_log);
- ASSERT_FALSE(create_result.data_loss);
+ ASSERT_FALSE(create_result.has_data_loss());
// Write and persist the first proto
ICING_ASSERT_OK_AND_ASSIGN(document1_offset,
@@ -407,7 +407,7 @@ TEST_F(FileBackedProtoLogTest, PersistToDisk) {
FileBackedProtoLog<DocumentProto>::Options(compress_,
max_proto_size_)));
auto proto_log = std::move(create_result.proto_log);
- ASSERT_TRUE(create_result.data_loss);
+ ASSERT_TRUE(create_result.has_data_loss());
// Check that everything was persisted across instances
ASSERT_THAT(proto_log->ReadProto(document1_offset),
@@ -433,7 +433,7 @@ TEST_F(FileBackedProtoLogTest, Iterator) {
FileBackedProtoLog<DocumentProto>::Options(compress_,
max_proto_size_)));
auto proto_log = std::move(create_result.proto_log);
- ASSERT_FALSE(create_result.data_loss);
+ ASSERT_FALSE(create_result.has_data_loss());
{
// Empty iterator
@@ -484,7 +484,7 @@ TEST_F(FileBackedProtoLogTest, ComputeChecksum) {
FileBackedProtoLog<DocumentProto>::Options(compress_,
max_proto_size_)));
auto proto_log = std::move(create_result.proto_log);
- ASSERT_FALSE(create_result.data_loss);
+ ASSERT_FALSE(create_result.has_data_loss());
ICING_EXPECT_OK(proto_log->WriteProto(document));
@@ -502,7 +502,7 @@ TEST_F(FileBackedProtoLogTest, ComputeChecksum) {
FileBackedProtoLog<DocumentProto>::Options(compress_,
max_proto_size_)));
auto proto_log = std::move(create_result.proto_log);
- ASSERT_FALSE(create_result.data_loss);
+ ASSERT_FALSE(create_result.has_data_loss());
// Checksum should be consistent across instances
EXPECT_THAT(proto_log->ComputeChecksum(), IsOkAndHolds(Eq(checksum)));
@@ -528,7 +528,7 @@ TEST_F(FileBackedProtoLogTest, EraseProtoShouldSetZero) {
FileBackedProtoLog<DocumentProto>::Options(compress_,
max_proto_size_)));
auto proto_log = std::move(create_result.proto_log);
- ASSERT_FALSE(create_result.data_loss);
+ ASSERT_FALSE(create_result.has_data_loss());
// Writes and erases proto
ICING_ASSERT_OK_AND_ASSIGN(int64_t document1_offset,
@@ -561,7 +561,7 @@ TEST_F(FileBackedProtoLogTest, EraseProtoShouldReturnNotFound) {
FileBackedProtoLog<DocumentProto>::Options(compress_,
max_proto_size_)));
auto proto_log = std::move(create_result.proto_log);
- ASSERT_FALSE(create_result.data_loss);
+ ASSERT_FALSE(create_result.has_data_loss());
// Writes 2 protos
ICING_ASSERT_OK_AND_ASSIGN(int64_t document1_offset,
@@ -603,7 +603,7 @@ TEST_F(FileBackedProtoLogTest, ChecksumShouldBeCorrectWithErasedProto) {
FileBackedProtoLog<DocumentProto>::Options(compress_,
max_proto_size_)));
auto proto_log = std::move(create_result.proto_log);
- ASSERT_FALSE(create_result.data_loss);
+ ASSERT_FALSE(create_result.has_data_loss());
// Writes 3 protos
ICING_ASSERT_OK_AND_ASSIGN(int64_t document1_offset,
@@ -631,7 +631,7 @@ TEST_F(FileBackedProtoLogTest, ChecksumShouldBeCorrectWithErasedProto) {
FileBackedProtoLog<DocumentProto>::Options(compress_,
max_proto_size_)));
auto proto_log = std::move(create_result.proto_log);
- ASSERT_FALSE(create_result.data_loss);
+ ASSERT_FALSE(create_result.has_data_loss());
// Erases the 2nd proto that is now before the rewind position. Checksum is
// updated.
@@ -651,7 +651,7 @@ TEST_F(FileBackedProtoLogTest, ChecksumShouldBeCorrectWithErasedProto) {
FileBackedProtoLog<DocumentProto>::Options(compress_,
max_proto_size_)));
auto proto_log = std::move(create_result.proto_log);
- ASSERT_FALSE(create_result.data_loss);
+ ASSERT_FALSE(create_result.has_data_loss());
// Append a new document which is after the rewind position.
ICING_ASSERT_OK(proto_log->WriteProto(document4));
@@ -673,7 +673,7 @@ TEST_F(FileBackedProtoLogTest, ChecksumShouldBeCorrectWithErasedProto) {
FileBackedProtoLog<DocumentProto>::Options(compress_,
max_proto_size_)));
auto proto_log = std::move(create_result.proto_log);
- EXPECT_FALSE(create_result.data_loss);
+ EXPECT_FALSE(create_result.has_data_loss());
}
}
diff --git a/icing/icing-search-engine-with-icu-file_test.cc b/icing/icing-search-engine-with-icu-file_test.cc
index 32ac9e6..1cb8620 100644
--- a/icing/icing-search-engine-with-icu-file_test.cc
+++ b/icing/icing-search-engine-with-icu-file_test.cc
@@ -63,9 +63,10 @@ SchemaProto CreateMessageSchema() {
body->set_property_name("body");
body->set_data_type(PropertyConfigProto::DataType::STRING);
body->set_cardinality(PropertyConfigProto::Cardinality::REQUIRED);
- body->mutable_indexing_config()->set_term_match_type(TermMatchType::PREFIX);
- body->mutable_indexing_config()->set_tokenizer_type(
- IndexingConfig::TokenizerType::PLAIN);
+ body->mutable_string_indexing_config()->set_term_match_type(
+ TermMatchType::PREFIX);
+ body->mutable_string_indexing_config()->set_tokenizer_type(
+ StringIndexingConfig::TokenizerType::PLAIN);
return schema;
}
diff --git a/icing/icing-search-engine.cc b/icing/icing-search-engine.cc
index fdec473..08ceafd 100644
--- a/icing/icing-search-engine.cc
+++ b/icing/icing-search-engine.cc
@@ -35,6 +35,7 @@
#include "icing/legacy/index/icing-filesystem.h"
#include "icing/proto/document.pb.h"
#include "icing/proto/initialize.pb.h"
+#include "icing/proto/logging.pb.h"
#include "icing/proto/optimize.pb.h"
#include "icing/proto/persist.pb.h"
#include "icing/proto/reset.pb.h"
@@ -59,6 +60,7 @@
#include "icing/util/crc32.h"
#include "icing/util/logging.h"
#include "icing/util/status-macros.h"
+#include "icing/util/timer.h"
#include "unicode/uloc.h"
namespace icing {
@@ -150,6 +152,10 @@ std::string MakeSchemaDirectoryPath(const std::string& base_dir) {
void TransformStatus(const libtextclassifier3::Status& internal_status,
StatusProto* status_proto) {
StatusProto::Code code;
+ if (!internal_status.ok()) {
+ ICING_LOG(WARNING) << "Error: " << internal_status.error_code()
+ << ", Message: " << internal_status.error_message();
+ }
switch (internal_status.CanonicalCode()) {
case libtextclassifier3::StatusCode::OK:
code = StatusProto::OK;
@@ -257,45 +263,101 @@ InitializeResultProto IcingSearchEngine::InternalInitialize() {
ICING_VLOG(1) << "Initializing IcingSearchEngine in dir: "
<< options_.base_dir();
+ // Measure the latency of the initialization process.
+ Timer initialize_timer;
+
InitializeResultProto result_proto;
StatusProto* result_status = result_proto.mutable_status();
+ NativeInitializeStats* initialize_stats =
+ result_proto.mutable_native_initialize_stats();
if (initialized_) {
// Already initialized.
result_status->set_code(StatusProto::OK);
+ initialize_stats->set_latency_ms(initialize_timer.GetElapsedMilliseconds());
+ initialize_stats->set_num_documents(document_store_->num_documents());
return result_proto;
}
// Releases result / query cache if any
result_state_manager_.InvalidateAllResultStates();
- libtextclassifier3::Status status = InitializeMembers();
+ libtextclassifier3::Status status = InitializeMembers(initialize_stats);
if (!status.ok()) {
TransformStatus(status, result_status);
+ initialize_stats->set_latency_ms(initialize_timer.GetElapsedMilliseconds());
return result_proto;
}
// Even if each subcomponent initialized fine independently, we need to
// check if they're consistent with each other.
if (!CheckConsistency().ok()) {
- ICING_VLOG(1)
- << "IcingSearchEngine in inconsistent state, regenerating all "
- "derived data";
- status = RegenerateDerivedFiles();
+ // The total checksum doesn't match the stored value, it could be one of the
+ // following cases:
+ // 1. Icing is initialized the first time in this directory.
+ // 2. Non-checksumed changes have been made to some files.
+ if (index_->last_added_document_id() == kInvalidDocumentId &&
+ document_store_->last_added_document_id() == kInvalidDocumentId &&
+ absl_ports::IsNotFound(schema_store_->GetSchema().status())) {
+ // First time initialize. Not recovering but creating all the files.
+ // We need to explicitly clear the recovery-related fields because some
+ // sub-components may not be able to tell if the storage is being
+ // initialized the first time or has lost some files. Sub-components may
+ // already have set these fields in earlier steps.
+ *initialize_stats = NativeInitializeStats();
+ status = RegenerateDerivedFiles();
+ } else {
+ ICING_VLOG(1)
+ << "IcingSearchEngine in inconsistent state, regenerating all "
+ "derived data";
+ // Total checksum mismatch may not be the root cause of document store
+ // recovery. Preserve the root cause that was set by the document store.
+ bool should_log_document_store_recovery_cause =
+ initialize_stats->document_store_recovery_cause() ==
+ NativeInitializeStats::NONE;
+ if (should_log_document_store_recovery_cause) {
+ initialize_stats->set_document_store_recovery_cause(
+ NativeInitializeStats::TOTAL_CHECKSUM_MISMATCH);
+ }
+ initialize_stats->set_index_restoration_cause(
+ NativeInitializeStats::TOTAL_CHECKSUM_MISMATCH);
+ status = RegenerateDerivedFiles(initialize_stats,
+ should_log_document_store_recovery_cause);
+ }
} else {
- status = RestoreIndex();
+ DocumentId last_stored_document_id =
+ document_store_->last_added_document_id();
+ DocumentId last_indexed_document_id = index_->last_added_document_id();
+ if (last_stored_document_id != last_indexed_document_id) {
+ if (last_stored_document_id == kInvalidDocumentId) {
+ // Document store is empty but index is not. Reset the index.
+ status = index_->Reset();
+ } else {
+ // Index is inconsistent with the document store, we need to restore the
+ // index.
+ initialize_stats->set_index_restoration_cause(
+ NativeInitializeStats::INCONSISTENT_WITH_GROUND_TRUTH);
+ Timer index_restore_timer;
+ status = RestoreIndexIfNeeded();
+ initialize_stats->set_index_restoration_latency_ms(
+ index_restore_timer.GetElapsedMilliseconds());
+ }
+ }
}
if (status.ok() || absl_ports::IsDataLoss(status)) {
initialized_ = true;
}
TransformStatus(status, result_status);
+ initialize_stats->set_latency_ms(initialize_timer.GetElapsedMilliseconds());
return result_proto;
}
-libtextclassifier3::Status IcingSearchEngine::InitializeMembers() {
+libtextclassifier3::Status IcingSearchEngine::InitializeMembers(
+ NativeInitializeStats* initialize_stats) {
+ ICING_RETURN_ERROR_IF_NULL(initialize_stats);
ICING_RETURN_IF_ERROR(InitializeOptions());
- ICING_RETURN_IF_ERROR(InitializeSchemaStore());
- ICING_RETURN_IF_ERROR(InitializeDocumentStore());
+ ICING_RETURN_IF_ERROR(InitializeSchemaStore(initialize_stats));
+ ICING_RETURN_IF_ERROR(InitializeDocumentStore(initialize_stats));
// TODO(b/156383798) : Resolve how to specify the locale.
language_segmenter_factory::SegmenterOptions segmenter_options(
@@ -306,7 +368,7 @@ libtextclassifier3::Status IcingSearchEngine::InitializeMembers() {
TC3_ASSIGN_OR_RETURN(normalizer_,
normalizer_factory::Create(options_.max_token_length()));
- ICING_RETURN_IF_ERROR(InitializeIndex());
+ ICING_RETURN_IF_ERROR(InitializeIndex(initialize_stats));
return libtextclassifier3::Status::OK;
}
@@ -323,7 +385,10 @@ libtextclassifier3::Status IcingSearchEngine::InitializeOptions() {
return libtextclassifier3::Status::OK;
}
-libtextclassifier3::Status IcingSearchEngine::InitializeSchemaStore() {
+libtextclassifier3::Status IcingSearchEngine::InitializeSchemaStore(
+ NativeInitializeStats* initialize_stats) {
+ ICING_RETURN_ERROR_IF_NULL(initialize_stats);
+
const std::string schema_store_dir =
MakeSchemaDirectoryPath(options_.base_dir());
// Make sure the sub-directory exists
@@ -332,12 +397,16 @@ libtextclassifier3::Status IcingSearchEngine::InitializeSchemaStore() {
absl_ports::StrCat("Could not create directory: ", schema_store_dir));
}
ICING_ASSIGN_OR_RETURN(
- schema_store_, SchemaStore::Create(filesystem_.get(), schema_store_dir));
+ schema_store_, SchemaStore::Create(filesystem_.get(), schema_store_dir,
+ initialize_stats));
return libtextclassifier3::Status::OK;
}
-libtextclassifier3::Status IcingSearchEngine::InitializeDocumentStore() {
+libtextclassifier3::Status IcingSearchEngine::InitializeDocumentStore(
+ NativeInitializeStats* initialize_stats) {
+ ICING_RETURN_ERROR_IF_NULL(initialize_stats);
+
const std::string document_dir =
MakeDocumentDirectoryPath(options_.base_dir());
// Make sure the sub-directory exists
@@ -348,12 +417,15 @@ libtextclassifier3::Status IcingSearchEngine::InitializeDocumentStore() {
ICING_ASSIGN_OR_RETURN(
document_store_,
DocumentStore::Create(filesystem_.get(), document_dir, clock_.get(),
- schema_store_.get()));
+ schema_store_.get(), initialize_stats));
return libtextclassifier3::Status::OK;
}
-libtextclassifier3::Status IcingSearchEngine::InitializeIndex() {
+libtextclassifier3::Status IcingSearchEngine::InitializeIndex(
+ NativeInitializeStats* initialize_stats) {
+ ICING_RETURN_ERROR_IF_NULL(initialize_stats);
+
const std::string index_dir = MakeIndexDirectoryPath(options_.base_dir());
// Make sure the sub-directory exists
if (!filesystem_->CreateDirectoryRecursively(index_dir.c_str())) {
@@ -371,11 +443,18 @@ libtextclassifier3::Status IcingSearchEngine::InitializeIndex() {
absl_ports::StrCat("Could not recreate directory: ", index_dir));
}
+ initialize_stats->set_index_restoration_cause(
+ NativeInitializeStats::IO_ERROR);
+
// Try recreating it from scratch and re-indexing everything.
ICING_ASSIGN_OR_RETURN(index_,
Index::Create(index_options, filesystem_.get(),
icing_filesystem_.get()));
- ICING_RETURN_IF_ERROR(RestoreIndex());
+
+ Timer restore_timer;
+ ICING_RETURN_IF_ERROR(RestoreIndexIfNeeded());
+ initialize_stats->set_index_restoration_latency_ms(
+ restore_timer.GetElapsedMilliseconds());
} else {
// Index was created fine.
index_ = std::move(index_or).ValueOrDie();
@@ -414,11 +493,25 @@ libtextclassifier3::Status IcingSearchEngine::CheckConsistency() {
return libtextclassifier3::Status::OK;
}
-libtextclassifier3::Status IcingSearchEngine::RegenerateDerivedFiles() {
+libtextclassifier3::Status IcingSearchEngine::RegenerateDerivedFiles(
+ NativeInitializeStats* initialize_stats, bool log_document_store_stats) {
+ // Measure the latency of the data recovery. The cause of the recovery should
+ // be logged by the caller.
+ Timer timer;
ICING_RETURN_IF_ERROR(
document_store_->UpdateSchemaStore(schema_store_.get()));
+ if (initialize_stats != nullptr && log_document_store_stats) {
+ initialize_stats->set_document_store_recovery_latency_ms(
+ timer.GetElapsedMilliseconds());
+ }
+ // Restart timer.
+ timer = Timer();
ICING_RETURN_IF_ERROR(index_->Reset());
- ICING_RETURN_IF_ERROR(RestoreIndex());
+ ICING_RETURN_IF_ERROR(RestoreIndexIfNeeded());
+ if (initialize_stats != nullptr) {
+ initialize_stats->set_index_restoration_latency_ms(
+ timer.GetElapsedMilliseconds());
+ }
const std::string header_file =
MakeHeaderFilename(options_.base_dir().c_str());
@@ -513,7 +606,7 @@ SetSchemaResultProto IcingSearchEngine::SetSchema(
return result_proto;
}
- status = RestoreIndex();
+ status = RestoreIndexIfNeeded();
if (!status.ok()) {
TransformStatus(status, result_status);
return result_proto;
@@ -580,8 +673,12 @@ PutResultProto IcingSearchEngine::Put(const DocumentProto& document) {
PutResultProto IcingSearchEngine::Put(DocumentProto&& document) {
ICING_VLOG(1) << "Writing document to document store";
+ Timer put_timer;
+
PutResultProto result_proto;
StatusProto* result_status = result_proto.mutable_status();
+ NativePutDocumentStats* put_document_stats =
+ result_proto.mutable_native_put_document_stats();
// Lock must be acquired before validation because the DocumentStore uses
// the schema file to validate, and the schema could be changed in
@@ -590,12 +687,14 @@ PutResultProto IcingSearchEngine::Put(DocumentProto&& document) {
if (!initialized_) {
result_status->set_code(StatusProto::FAILED_PRECONDITION);
result_status->set_message("IcingSearchEngine has not been initialized!");
+ put_document_stats->set_latency_ms(put_timer.GetElapsedMilliseconds());
return result_proto;
}
- auto document_id_or = document_store_->Put(document);
+ auto document_id_or = document_store_->Put(document, put_document_stats);
if (!document_id_or.ok()) {
TransformStatus(document_id_or.status(), result_status);
+ put_document_stats->set_latency_ms(put_timer.GetElapsedMilliseconds());
return result_proto;
}
DocumentId document_id = document_id_or.ValueOrDie();
@@ -605,13 +704,17 @@ PutResultProto IcingSearchEngine::Put(DocumentProto&& document) {
index_.get(), CreateIndexProcessorOptions(options_));
if (!index_processor_or.ok()) {
TransformStatus(index_processor_or.status(), result_status);
+ put_document_stats->set_latency_ms(put_timer.GetElapsedMilliseconds());
return result_proto;
}
std::unique_ptr<IndexProcessor> index_processor =
std::move(index_processor_or).ValueOrDie();
- auto status = index_processor->IndexDocument(document, document_id);
+ auto status =
+ index_processor->IndexDocument(document, document_id, put_document_stats);
+
TransformStatus(status, result_status);
+ put_document_stats->set_latency_ms(put_timer.GetElapsedMilliseconds());
return result_proto;
}
@@ -889,7 +992,7 @@ OptimizeResultProto IcingSearchEngine::Optimize() {
return result_proto;
}
- libtextclassifier3::Status index_restoration_status = RestoreIndex();
+ libtextclassifier3::Status index_restoration_status = RestoreIndexIfNeeded();
if (!index_restoration_status.ok()) {
status = absl_ports::Annotate(
absl_ports::InternalError(
@@ -1300,19 +1403,21 @@ libtextclassifier3::Status IcingSearchEngine::OptimizeDocumentStore() {
return libtextclassifier3::Status::OK;
}
-libtextclassifier3::Status IcingSearchEngine::RestoreIndex() {
+libtextclassifier3::Status IcingSearchEngine::RestoreIndexIfNeeded() {
DocumentId last_stored_document_id =
document_store_->last_added_document_id();
DocumentId last_indexed_document_id = index_->last_added_document_id();
- if (last_stored_document_id == kInvalidDocumentId) {
- // Nothing to index. Make sure the index is also empty.
- if (last_indexed_document_id != kInvalidDocumentId) {
- ICING_RETURN_IF_ERROR(index_->Reset());
- }
+ if (last_stored_document_id == last_indexed_document_id) {
+ // No need to recover.
return libtextclassifier3::Status::OK;
}
+ if (last_stored_document_id == kInvalidDocumentId) {
+ // Document store is empty but index is not. Reset the index.
+ return index_->Reset();
+ }
+
// TruncateTo ensures that the index does not hold any data that is not
// present in the ground truth. If the document store lost some documents,
// TruncateTo will ensure that the index does not contain any hits from those
@@ -1323,7 +1428,7 @@ libtextclassifier3::Status IcingSearchEngine::RestoreIndex() {
DocumentId first_document_to_reindex =
(last_indexed_document_id != kInvalidDocumentId)
? index_->last_added_document_id() + 1
- : 0;
+ : kMinDocumentId;
ICING_ASSIGN_OR_RETURN(
std::unique_ptr<IndexProcessor> index_processor,
diff --git a/icing/icing-search-engine.h b/icing/icing-search-engine.h
index 58b8df2..70a9c07 100644
--- a/icing/icing-search-engine.h
+++ b/icing/icing-search-engine.h
@@ -451,10 +451,12 @@ class IcingSearchEngine {
//
// Returns:
// OK on success
+ // FAILED_PRECONDITION if initialize_stats is null
// RESOURCE_EXHAUSTED if the index runs out of storage
// NOT_FOUND if some Document's schema type is not in the SchemaStore
// INTERNAL on any I/O errors
- libtextclassifier3::Status InitializeMembers()
+ libtextclassifier3::Status InitializeMembers(
+ NativeInitializeStats* initialize_stats)
ICING_EXCLUSIVE_LOCKS_REQUIRED(mutex_);
// Do any validation/setup required for the given IcingSearchEngineOptions
@@ -470,8 +472,10 @@ class IcingSearchEngine {
//
// Returns:
// OK on success
+ // FAILED_PRECONDITION if initialize_stats is null
// INTERNAL on I/O error
- libtextclassifier3::Status InitializeSchemaStore()
+ libtextclassifier3::Status InitializeSchemaStore(
+ NativeInitializeStats* initialize_stats)
ICING_EXCLUSIVE_LOCKS_REQUIRED(mutex_);
// Do any initialization/recovery necessary to create a DocumentStore
@@ -479,8 +483,10 @@ class IcingSearchEngine {
//
// Returns:
// OK on success
+ // FAILED_PRECONDITION if initialize_stats is null
// INTERNAL on I/O error
- libtextclassifier3::Status InitializeDocumentStore()
+ libtextclassifier3::Status InitializeDocumentStore(
+ NativeInitializeStats* initialize_stats)
ICING_EXCLUSIVE_LOCKS_REQUIRED(mutex_);
// Do any initialization/recovery necessary to create a DocumentStore
@@ -488,10 +494,12 @@ class IcingSearchEngine {
//
// Returns:
// OK on success
+ // FAILED_PRECONDITION if initialize_stats is null
// RESOURCE_EXHAUSTED if the index runs out of storage
// NOT_FOUND if some Document's schema type is not in the SchemaStore
// INTERNAL on I/O error
- libtextclassifier3::Status InitializeIndex()
+ libtextclassifier3::Status InitializeIndex(
+ NativeInitializeStats* initialize_stats)
ICING_EXCLUSIVE_LOCKS_REQUIRED(mutex_);
// Many of the internal components rely on other components' derived data.
@@ -514,7 +522,9 @@ class IcingSearchEngine {
// Returns:
// OK on success
// INTERNAL_ERROR on any IO errors
- libtextclassifier3::Status RegenerateDerivedFiles()
+ libtextclassifier3::Status RegenerateDerivedFiles(
+ NativeInitializeStats* initialize_stats = nullptr,
+ bool log_document_store_stats = false)
ICING_EXCLUSIVE_LOCKS_REQUIRED(mutex_);
// Optimizes the DocumentStore by removing any unneeded documents (i.e.
@@ -543,7 +553,7 @@ class IcingSearchEngine {
// RESOURCE_EXHAUSTED if the index fills up before finishing indexing
// NOT_FOUND if some Document's schema type is not in the SchemaStore
// INTERNAL_ERROR on any IO errors
- libtextclassifier3::Status RestoreIndex()
+ libtextclassifier3::Status RestoreIndexIfNeeded()
ICING_EXCLUSIVE_LOCKS_REQUIRED(mutex_);
// Computes the combined checksum of the IcingSearchEngine - includes all its
diff --git a/icing/icing-search-engine_fuzz_test.cc b/icing/icing-search-engine_fuzz_test.cc
index d31f836..1f59c6e 100644
--- a/icing/icing-search-engine_fuzz_test.cc
+++ b/icing/icing-search-engine_fuzz_test.cc
@@ -44,9 +44,10 @@ SchemaProto SetTypes() {
body->set_property_name("body");
body->set_data_type(PropertyConfigProto::DataType::STRING);
body->set_cardinality(PropertyConfigProto::Cardinality::REQUIRED);
- body->mutable_indexing_config()->set_term_match_type(TermMatchType::PREFIX);
- body->mutable_indexing_config()->set_tokenizer_type(
- IndexingConfig::TokenizerType::PLAIN);
+ body->mutable_string_indexing_config()->set_term_match_type(
+ TermMatchType::PREFIX);
+ body->mutable_string_indexing_config()->set_tokenizer_type(
+ StringIndexingConfig::TokenizerType::PLAIN);
return schema;
}
diff --git a/icing/icing-search-engine_test.cc b/icing/icing-search-engine_test.cc
index 06e89f2..b642a94 100644
--- a/icing/icing-search-engine_test.cc
+++ b/icing/icing-search-engine_test.cc
@@ -41,6 +41,7 @@
#include "icing/testing/common-matchers.h"
#include "icing/testing/fake-clock.h"
#include "icing/testing/jni-test-helpers.h"
+#include "icing/testing/random-string.h"
#include "icing/testing/snippet-helpers.h"
#include "icing/testing/test-data.h"
#include "icing/testing/tmp-directory.h"
@@ -161,9 +162,10 @@ SchemaProto CreateMessageSchema() {
body->set_property_name("body");
body->set_data_type(PropertyConfigProto::DataType::STRING);
body->set_cardinality(PropertyConfigProto::Cardinality::REQUIRED);
- body->mutable_indexing_config()->set_term_match_type(TermMatchType::PREFIX);
- body->mutable_indexing_config()->set_tokenizer_type(
- IndexingConfig::TokenizerType::PLAIN);
+ body->mutable_string_indexing_config()->set_term_match_type(
+ TermMatchType::PREFIX);
+ body->mutable_string_indexing_config()->set_tokenizer_type(
+ StringIndexingConfig::TokenizerType::PLAIN);
return schema;
}
@@ -177,16 +179,18 @@ SchemaProto CreateEmailSchema() {
body->set_property_name("body");
body->set_data_type(PropertyConfigProto::DataType::STRING);
body->set_cardinality(PropertyConfigProto::Cardinality::REQUIRED);
- body->mutable_indexing_config()->set_term_match_type(TermMatchType::PREFIX);
- body->mutable_indexing_config()->set_tokenizer_type(
- IndexingConfig::TokenizerType::PLAIN);
+ body->mutable_string_indexing_config()->set_term_match_type(
+ TermMatchType::PREFIX);
+ body->mutable_string_indexing_config()->set_tokenizer_type(
+ StringIndexingConfig::TokenizerType::PLAIN);
auto* subj = type->add_properties();
subj->set_property_name("subject");
subj->set_data_type(PropertyConfigProto::DataType::STRING);
subj->set_cardinality(PropertyConfigProto::Cardinality::REQUIRED);
- subj->mutable_indexing_config()->set_term_match_type(TermMatchType::PREFIX);
- subj->mutable_indexing_config()->set_tokenizer_type(
- IndexingConfig::TokenizerType::PLAIN);
+ subj->mutable_string_indexing_config()->set_term_match_type(
+ TermMatchType::PREFIX);
+ subj->mutable_string_indexing_config()->set_tokenizer_type(
+ StringIndexingConfig::TokenizerType::PLAIN);
return schema;
}
@@ -423,10 +427,7 @@ TEST_F(IcingSearchEngineTest,
body->set_schema_type("Person");
body->set_data_type(PropertyConfigProto::DataType::DOCUMENT);
body->set_cardinality(PropertyConfigProto::Cardinality::REQUIRED);
- body->mutable_indexing_config()->set_term_match_type(
- TermMatchType::EXACT_ONLY);
- body->mutable_indexing_config()->set_tokenizer_type(
- IndexingConfig::TokenizerType::PLAIN);
+ body->mutable_document_indexing_config()->set_index_nested_properties(true);
type = schema.add_types();
type->set_schema_type("Person");
@@ -436,10 +437,7 @@ TEST_F(IcingSearchEngineTest,
body->set_schema_type("Message");
body->set_data_type(PropertyConfigProto::DataType::DOCUMENT);
body->set_cardinality(PropertyConfigProto::Cardinality::REQUIRED);
- body->mutable_indexing_config()->set_term_match_type(
- TermMatchType::EXACT_ONLY);
- body->mutable_indexing_config()->set_tokenizer_type(
- IndexingConfig::TokenizerType::PLAIN);
+ body->mutable_document_indexing_config()->set_index_nested_properties(true);
IcingSearchEngine icing(GetDefaultIcingOptions(), GetTestJniCache());
EXPECT_THAT(icing.Initialize().status(), ProtoIsOk());
@@ -735,7 +733,7 @@ TEST_F(IcingSearchEngineTest, SetSchemaTriggersIndexRestorationAndReturnsOk) {
SchemaProto schema_with_no_indexed_property = CreateMessageSchema();
schema_with_no_indexed_property.mutable_types(0)
->mutable_properties(0)
- ->clear_indexing_config();
+ ->clear_string_indexing_config();
EXPECT_THAT(icing.SetSchema(schema_with_no_indexed_property).status(),
ProtoIsOk());
@@ -1775,10 +1773,10 @@ TEST_F(IcingSearchEngineTest, DeleteBySchemaType) {
property->set_property_name("subject");
property->set_data_type(PropertyConfigProto::DataType::STRING);
property->set_cardinality(PropertyConfigProto::Cardinality::OPTIONAL);
- property->mutable_indexing_config()->set_term_match_type(
+ property->mutable_string_indexing_config()->set_term_match_type(
TermMatchType::EXACT_ONLY);
- property->mutable_indexing_config()->set_tokenizer_type(
- IndexingConfig::TokenizerType::PLAIN);
+ property->mutable_string_indexing_config()->set_tokenizer_type(
+ StringIndexingConfig::TokenizerType::PLAIN);
// Add an message type
type = schema.add_types();
type->set_schema_type("message");
@@ -1786,10 +1784,10 @@ TEST_F(IcingSearchEngineTest, DeleteBySchemaType) {
property->set_property_name("body");
property->set_data_type(PropertyConfigProto::DataType::STRING);
property->set_cardinality(PropertyConfigProto::Cardinality::OPTIONAL);
- property->mutable_indexing_config()->set_term_match_type(
+ property->mutable_string_indexing_config()->set_term_match_type(
TermMatchType::EXACT_ONLY);
- property->mutable_indexing_config()->set_tokenizer_type(
- IndexingConfig::TokenizerType::PLAIN);
+ property->mutable_string_indexing_config()->set_tokenizer_type(
+ StringIndexingConfig::TokenizerType::PLAIN);
DocumentProto document1 =
DocumentBuilder()
.SetKey("namespace1", "uri1")
@@ -2205,20 +2203,20 @@ TEST_F(IcingSearchEngineTest, SetSchemaShouldWorkAfterOptimization) {
new_property2->set_property_name("property2");
new_property2->set_data_type(PropertyConfigProto::DataType::STRING);
new_property2->set_cardinality(PropertyConfigProto::Cardinality::OPTIONAL);
- new_property2->mutable_indexing_config()->set_term_match_type(
+ new_property2->mutable_string_indexing_config()->set_term_match_type(
TermMatchType::PREFIX);
- new_property2->mutable_indexing_config()->set_tokenizer_type(
- IndexingConfig::TokenizerType::PLAIN);
+ new_property2->mutable_string_indexing_config()->set_tokenizer_type(
+ StringIndexingConfig::TokenizerType::PLAIN);
SchemaProto schema3 = SchemaProto(schema2);
auto new_property3 = schema3.mutable_types(0)->add_properties();
new_property3->set_property_name("property3");
new_property3->set_data_type(PropertyConfigProto::DataType::STRING);
new_property3->set_cardinality(PropertyConfigProto::Cardinality::OPTIONAL);
- new_property3->mutable_indexing_config()->set_term_match_type(
+ new_property3->mutable_string_indexing_config()->set_term_match_type(
TermMatchType::PREFIX);
- new_property3->mutable_indexing_config()->set_tokenizer_type(
- IndexingConfig::TokenizerType::PLAIN);
+ new_property3->mutable_string_indexing_config()->set_tokenizer_type(
+ StringIndexingConfig::TokenizerType::PLAIN);
{
IcingSearchEngine icing(GetDefaultIcingOptions(), GetTestJniCache());
@@ -2463,9 +2461,10 @@ TEST_F(IcingSearchEngineTest, SearchIncludesDocumentsBeforeTtl) {
body->set_property_name("body");
body->set_data_type(PropertyConfigProto::DataType::STRING);
body->set_cardinality(PropertyConfigProto::Cardinality::REQUIRED);
- body->mutable_indexing_config()->set_term_match_type(TermMatchType::PREFIX);
- body->mutable_indexing_config()->set_tokenizer_type(
- IndexingConfig::TokenizerType::PLAIN);
+ body->mutable_string_indexing_config()->set_term_match_type(
+ TermMatchType::PREFIX);
+ body->mutable_string_indexing_config()->set_tokenizer_type(
+ StringIndexingConfig::TokenizerType::PLAIN);
DocumentProto document = DocumentBuilder()
.SetKey("namespace", "uri")
@@ -2513,9 +2512,10 @@ TEST_F(IcingSearchEngineTest, SearchDoesntIncludeDocumentsPastTtl) {
body->set_property_name("body");
body->set_data_type(PropertyConfigProto::DataType::STRING);
body->set_cardinality(PropertyConfigProto::Cardinality::REQUIRED);
- body->mutable_indexing_config()->set_term_match_type(TermMatchType::PREFIX);
- body->mutable_indexing_config()->set_tokenizer_type(
- IndexingConfig::TokenizerType::PLAIN);
+ body->mutable_string_indexing_config()->set_term_match_type(
+ TermMatchType::PREFIX);
+ body->mutable_string_indexing_config()->set_tokenizer_type(
+ StringIndexingConfig::TokenizerType::PLAIN);
DocumentProto document = DocumentBuilder()
.SetKey("namespace", "uri")
@@ -2613,10 +2613,10 @@ TEST_F(IcingSearchEngineTest, SearchWorksAfterSchemaTypesCompatiblyModified) {
property->set_property_name("body");
property->set_data_type(PropertyConfigProto::DataType::STRING);
property->set_cardinality(PropertyConfigProto::Cardinality::OPTIONAL);
- property->mutable_indexing_config()->set_term_match_type(
+ property->mutable_string_indexing_config()->set_term_match_type(
TermMatchType::PREFIX);
- property->mutable_indexing_config()->set_tokenizer_type(
- IndexingConfig::TokenizerType::PLAIN);
+ property->mutable_string_indexing_config()->set_tokenizer_type(
+ StringIndexingConfig::TokenizerType::PLAIN);
EXPECT_THAT(icing.SetSchema(schema).status(), ProtoIsOk());
@@ -2869,10 +2869,10 @@ TEST_F(IcingSearchEngineTest, RecoverFromInconsistentSchemaStore) {
property->set_property_name("body");
property->set_data_type(PropertyConfigProto::DataType::STRING);
property->set_cardinality(PropertyConfigProto::Cardinality::REQUIRED);
- property->mutable_indexing_config()->set_term_match_type(
+ property->mutable_string_indexing_config()->set_term_match_type(
TermMatchType::PREFIX);
- property->mutable_indexing_config()->set_tokenizer_type(
- IndexingConfig::TokenizerType::PLAIN);
+ property->mutable_string_indexing_config()->set_tokenizer_type(
+ StringIndexingConfig::TokenizerType::PLAIN);
property = type->add_properties();
property->set_property_name("additional");
@@ -2916,19 +2916,19 @@ TEST_F(IcingSearchEngineTest, RecoverFromInconsistentSchemaStore) {
property->set_property_name("body");
property->set_data_type(PropertyConfigProto::DataType::STRING);
property->set_cardinality(PropertyConfigProto::Cardinality::REQUIRED);
- property->mutable_indexing_config()->set_term_match_type(
+ property->mutable_string_indexing_config()->set_term_match_type(
TermMatchType::PREFIX);
- property->mutable_indexing_config()->set_tokenizer_type(
- IndexingConfig::TokenizerType::PLAIN);
+ property->mutable_string_indexing_config()->set_tokenizer_type(
+ StringIndexingConfig::TokenizerType::PLAIN);
property = type->add_properties();
property->set_property_name("additional");
property->set_data_type(PropertyConfigProto::DataType::STRING);
property->set_cardinality(PropertyConfigProto::Cardinality::OPTIONAL);
- property->mutable_indexing_config()->set_term_match_type(
+ property->mutable_string_indexing_config()->set_term_match_type(
TermMatchType::PREFIX);
- property->mutable_indexing_config()->set_tokenizer_type(
- IndexingConfig::TokenizerType::PLAIN);
+ property->mutable_string_indexing_config()->set_tokenizer_type(
+ StringIndexingConfig::TokenizerType::PLAIN);
ICING_ASSERT_OK_AND_ASSIGN(
std::unique_ptr<SchemaStore> schema_store,
@@ -3705,9 +3705,10 @@ TEST_F(IcingSearchEngineTest, SetSchemaCanDetectPreviousSchemaWasLost) {
body->set_property_name("body");
body->set_data_type(PropertyConfigProto::DataType::STRING);
body->set_cardinality(PropertyConfigProto::Cardinality::OPTIONAL);
- body->mutable_indexing_config()->set_term_match_type(TermMatchType::PREFIX);
- body->mutable_indexing_config()->set_tokenizer_type(
- IndexingConfig::TokenizerType::PLAIN);
+ body->mutable_string_indexing_config()->set_term_match_type(
+ TermMatchType::PREFIX);
+ body->mutable_string_indexing_config()->set_tokenizer_type(
+ StringIndexingConfig::TokenizerType::PLAIN);
// Make an incompatible schema, a previously OPTIONAL field is REQUIRED
SchemaProto incompatible_schema = schema;
@@ -4218,10 +4219,10 @@ TEST_F(IcingSearchEngineTest, Hyphens) {
prop->set_property_name("foo");
prop->set_data_type(PropertyConfigProto::DataType::STRING);
prop->set_cardinality(PropertyConfigProto::Cardinality::REQUIRED);
- prop->mutable_indexing_config()->set_term_match_type(
+ prop->mutable_string_indexing_config()->set_term_match_type(
TermMatchType::EXACT_ONLY);
- prop->mutable_indexing_config()->set_tokenizer_type(
- IndexingConfig::TokenizerType::PLAIN);
+ prop->mutable_string_indexing_config()->set_tokenizer_type(
+ StringIndexingConfig::TokenizerType::PLAIN);
ASSERT_THAT(icing.SetSchema(schema).status(), ProtoIsOk());
DocumentProto document_one =
@@ -4281,7 +4282,7 @@ TEST_F(IcingSearchEngineTest, RestoreIndex) {
EXPECT_THAT(icing.Put(document).status(), ProtoIsOk());
}
- // 2. Delete the index file to trigger RestoreIndex.
+ // 2. Delete the index file to trigger RestoreIndexIfNeeded.
std::string idx_subdir = GetIndexDir() + "/idx";
filesystem()->DeleteDirectoryRecursively(idx_subdir.c_str());
@@ -4474,7 +4475,7 @@ TEST_F(IcingSearchEngineTest, IndexingDocMergeFailureResets) {
EXPECT_THAT(icing.Put(document).status(), ProtoIsOk());
}
- // 2. Delete the index file to trigger RestoreIndex.
+ // 2. Delete the index file to trigger RestoreIndexIfNeeded.
std::string idx_subdir = GetIndexDir() + "/idx";
filesystem()->DeleteDirectoryRecursively(idx_subdir.c_str());
@@ -4522,6 +4523,691 @@ TEST_F(IcingSearchEngineTest, IndexingDocMergeFailureResets) {
}
}
+TEST_F(IcingSearchEngineTest, InitializeShouldLogFunctionLatency) {
+ IcingSearchEngine icing(GetDefaultIcingOptions(), GetTestJniCache());
+ InitializeResultProto initialize_result_proto = icing.Initialize();
+ EXPECT_THAT(initialize_result_proto.status(), ProtoIsOk());
+ EXPECT_THAT(initialize_result_proto.native_initialize_stats().latency_ms(),
+ Gt(0));
+}
+
+TEST_F(IcingSearchEngineTest, InitializeShouldLogNumberOfDocuments) {
+ DocumentProto document1 = DocumentBuilder()
+ .SetKey("icing", "fake_type/1")
+ .SetSchema("Message")
+ .AddStringProperty("body", "message body")
+ .Build();
+ DocumentProto document2 = DocumentBuilder()
+ .SetKey("icing", "fake_type/2")
+ .SetSchema("Message")
+ .AddStringProperty("body", "message body")
+ .Build();
+
+ {
+ // Initialize and put a document.
+ IcingSearchEngine icing(GetDefaultIcingOptions(), GetTestJniCache());
+ InitializeResultProto initialize_result_proto = icing.Initialize();
+ EXPECT_THAT(initialize_result_proto.status(), ProtoIsOk());
+ EXPECT_THAT(
+ initialize_result_proto.native_initialize_stats().num_documents(),
+ Eq(0));
+
+ ASSERT_THAT(icing.SetSchema(CreateMessageSchema()).status(), ProtoIsOk());
+ ASSERT_THAT(icing.Put(document1).status(), ProtoIsOk());
+ }
+
+ {
+ IcingSearchEngine icing(GetDefaultIcingOptions(), GetTestJniCache());
+ InitializeResultProto initialize_result_proto = icing.Initialize();
+ EXPECT_THAT(initialize_result_proto.status(), ProtoIsOk());
+ EXPECT_THAT(
+ initialize_result_proto.native_initialize_stats().num_documents(),
+ Eq(1));
+
+ // Put another document.
+ ASSERT_THAT(icing.Put(document2).status(), ProtoIsOk());
+ }
+
+ {
+ IcingSearchEngine icing(GetDefaultIcingOptions(), GetTestJniCache());
+ InitializeResultProto initialize_result_proto = icing.Initialize();
+ EXPECT_THAT(initialize_result_proto.status(), ProtoIsOk());
+ EXPECT_THAT(
+ initialize_result_proto.native_initialize_stats().num_documents(),
+ Eq(2));
+ }
+}
+
+TEST_F(IcingSearchEngineTest,
+ InitializeShouldNotLogRecoveryCauseForFirstTimeInitialize) {
+ IcingSearchEngine icing(GetDefaultIcingOptions(), GetTestJniCache());
+ InitializeResultProto initialize_result_proto = icing.Initialize();
+ EXPECT_THAT(initialize_result_proto.status(), ProtoIsOk());
+ EXPECT_THAT(initialize_result_proto.native_initialize_stats()
+ .document_store_recovery_cause(),
+ Eq(NativeInitializeStats::NONE));
+ EXPECT_THAT(initialize_result_proto.native_initialize_stats()
+ .document_store_recovery_latency_ms(),
+ Eq(0));
+ EXPECT_THAT(initialize_result_proto.native_initialize_stats()
+ .document_store_data_status(),
+ Eq(NativeInitializeStats::NO_DATA_LOSS));
+ EXPECT_THAT(initialize_result_proto.native_initialize_stats()
+ .index_restoration_cause(),
+ Eq(NativeInitializeStats::NONE));
+ EXPECT_THAT(initialize_result_proto.native_initialize_stats()
+ .index_restoration_latency_ms(),
+ Eq(0));
+ EXPECT_THAT(initialize_result_proto.native_initialize_stats()
+ .schema_store_recovery_cause(),
+ Eq(NativeInitializeStats::NONE));
+ EXPECT_THAT(initialize_result_proto.native_initialize_stats()
+ .schema_store_recovery_latency_ms(),
+ Eq(0));
+}
+
+TEST_F(IcingSearchEngineTest, InitializeShouldLogRecoveryCausePartialDataLoss) {
+ DocumentProto document = DocumentBuilder()
+ .SetKey("icing", "fake_type/0")
+ .SetSchema("Message")
+ .AddStringProperty("body", "message body")
+ .Build();
+
+ {
+ // Initialize and put a document.
+ IcingSearchEngine icing(GetDefaultIcingOptions(), GetTestJniCache());
+ ASSERT_THAT(icing.Initialize().status(), ProtoIsOk());
+ ASSERT_THAT(icing.SetSchema(CreateMessageSchema()).status(), ProtoIsOk());
+ EXPECT_THAT(icing.Put(document).status(), ProtoIsOk());
+ }
+
+ {
+ // Append a non-checksummed document. This will mess up the checksum of the
+ // proto log, forcing it to rewind and later return a DATA_LOSS error.
+ const std::string serialized_document = document.SerializeAsString();
+ const std::string document_log_file =
+ absl_ports::StrCat(GetDocumentDir(), "/document_log");
+
+ int64_t file_size = filesystem()->GetFileSize(document_log_file.c_str());
+ filesystem()->PWrite(document_log_file.c_str(), file_size,
+ serialized_document.data(),
+ serialized_document.size());
+ }
+
+ {
+ // Document store will rewind to previous checkpoint. The cause should be
+ // DATA_LOSS and the data status should be PARTIAL_LOSS.
+ IcingSearchEngine icing(GetDefaultIcingOptions(), GetTestJniCache());
+ InitializeResultProto initialize_result_proto = icing.Initialize();
+ EXPECT_THAT(initialize_result_proto.status(), ProtoIsOk());
+ EXPECT_THAT(initialize_result_proto.native_initialize_stats()
+ .document_store_recovery_cause(),
+ Eq(NativeInitializeStats::DATA_LOSS));
+ EXPECT_THAT(initialize_result_proto.native_initialize_stats()
+ .document_store_recovery_latency_ms(),
+ Gt(0));
+ EXPECT_THAT(initialize_result_proto.native_initialize_stats()
+ .document_store_data_status(),
+ Eq(NativeInitializeStats::PARTIAL_LOSS));
+ EXPECT_THAT(initialize_result_proto.native_initialize_stats()
+ .index_restoration_cause(),
+ Eq(NativeInitializeStats::NONE));
+ EXPECT_THAT(initialize_result_proto.native_initialize_stats()
+ .index_restoration_latency_ms(),
+ Eq(0));
+ EXPECT_THAT(initialize_result_proto.native_initialize_stats()
+ .schema_store_recovery_cause(),
+ Eq(NativeInitializeStats::NONE));
+ EXPECT_THAT(initialize_result_proto.native_initialize_stats()
+ .schema_store_recovery_latency_ms(),
+ Eq(0));
+ }
+}
+
+TEST_F(IcingSearchEngineTest,
+ InitializeShouldLogRecoveryCauseCompleteDataLoss) {
+ DocumentProto document1 = DocumentBuilder()
+ .SetKey("icing", "fake_type/1")
+ .SetSchema("Message")
+ .AddStringProperty("body", kIpsumText)
+ .Build();
+ DocumentProto document2 = DocumentBuilder()
+ .SetKey("icing", "fake_type/2")
+ .SetSchema("Message")
+ .AddStringProperty("body", kIpsumText)
+ .Build();
+
+ {
+ // Initialize and put a document.
+ IcingSearchEngine icing(GetDefaultIcingOptions(), GetTestJniCache());
+ ASSERT_THAT(icing.Initialize().status(), ProtoIsOk());
+ ASSERT_THAT(icing.SetSchema(CreateMessageSchema()).status(), ProtoIsOk());
+ EXPECT_THAT(icing.Put(document1).status(), ProtoIsOk());
+ EXPECT_THAT(icing.Put(document2).status(), ProtoIsOk());
+ }
+
+ {
+ // Modify the document log checksum to trigger a complete document log
+ // rewind.
+ const std::string document_log_file =
+ absl_ports::StrCat(GetDocumentDir(), "/document_log");
+
+ FileBackedProtoLog<DocumentWrapper>::Header document_log_header;
+ filesystem()->PRead(document_log_file.c_str(), &document_log_header,
+ sizeof(FileBackedProtoLog<DocumentWrapper>::Header),
+ /*offset=*/0);
+ // Set a garbage checksum.
+ document_log_header.log_checksum = 10;
+ document_log_header.header_checksum =
+ document_log_header.CalculateHeaderChecksum();
+ filesystem()->PWrite(document_log_file.c_str(), /*offset=*/0,
+ &document_log_header,
+ sizeof(FileBackedProtoLog<DocumentWrapper>::Header));
+ }
+
+ {
+ // Document store will completely rewind. The cause should be DATA_LOSS and
+ // the data status should be COMPLETE_LOSS.
+ IcingSearchEngine icing(GetDefaultIcingOptions(), GetTestJniCache());
+ InitializeResultProto initialize_result_proto = icing.Initialize();
+ EXPECT_THAT(initialize_result_proto.status(), ProtoIsOk());
+ EXPECT_THAT(initialize_result_proto.native_initialize_stats()
+ .document_store_recovery_cause(),
+ Eq(NativeInitializeStats::DATA_LOSS));
+ EXPECT_THAT(initialize_result_proto.native_initialize_stats()
+ .document_store_recovery_latency_ms(),
+ Gt(0));
+ EXPECT_THAT(initialize_result_proto.native_initialize_stats()
+ .document_store_data_status(),
+ Eq(NativeInitializeStats::COMPLETE_LOSS));
+ // The complete rewind of ground truth causes the mismatch of total
+ // checksum, so index should be restored.
+ EXPECT_THAT(initialize_result_proto.native_initialize_stats()
+ .index_restoration_cause(),
+ Eq(NativeInitializeStats::TOTAL_CHECKSUM_MISMATCH));
+ // Here we don't check index_restoration_latency_ms because the index
+ // restoration is super fast when document store is empty. We won't get a
+ // latency that is greater than 1 ms.
+ EXPECT_THAT(initialize_result_proto.native_initialize_stats()
+ .schema_store_recovery_cause(),
+ Eq(NativeInitializeStats::NONE));
+ EXPECT_THAT(initialize_result_proto.native_initialize_stats()
+ .schema_store_recovery_latency_ms(),
+ Eq(0));
+ }
+}
+
+TEST_F(IcingSearchEngineTest,
+ InitializeShouldLogRecoveryCauseInconsistentWithGroundTruth) {
+ DocumentProto document = DocumentBuilder()
+ .SetKey("icing", "fake_type/0")
+ .SetSchema("Message")
+ .AddStringProperty("body", "message body")
+ .Build();
+ {
+ // Initialize and put a document.
+ IcingSearchEngine icing(GetDefaultIcingOptions(), GetTestJniCache());
+ ASSERT_THAT(icing.Initialize().status(), ProtoIsOk());
+ ASSERT_THAT(icing.SetSchema(CreateMessageSchema()).status(), ProtoIsOk());
+ EXPECT_THAT(icing.Put(document).status(), ProtoIsOk());
+ }
+
+ {
+ // Delete the index file to trigger RestoreIndexIfNeeded.
+ std::string idx_subdir = GetIndexDir() + "/idx";
+ filesystem()->DeleteDirectoryRecursively(idx_subdir.c_str());
+ }
+
+ {
+ // Index is empty but ground truth is not. Index should be restored due to
+ // the inconsistency.
+ IcingSearchEngine icing(GetDefaultIcingOptions(), GetTestJniCache());
+ InitializeResultProto initialize_result_proto = icing.Initialize();
+ EXPECT_THAT(initialize_result_proto.status(), ProtoIsOk());
+ EXPECT_THAT(initialize_result_proto.native_initialize_stats()
+ .index_restoration_cause(),
+ Eq(NativeInitializeStats::INCONSISTENT_WITH_GROUND_TRUTH));
+ EXPECT_THAT(initialize_result_proto.native_initialize_stats()
+ .index_restoration_latency_ms(),
+ Gt(0));
+ EXPECT_THAT(initialize_result_proto.native_initialize_stats()
+ .document_store_recovery_cause(),
+ Eq(NativeInitializeStats::NONE));
+ EXPECT_THAT(initialize_result_proto.native_initialize_stats()
+ .document_store_recovery_latency_ms(),
+ Eq(0));
+ EXPECT_THAT(initialize_result_proto.native_initialize_stats()
+ .document_store_data_status(),
+ Eq(NativeInitializeStats::NO_DATA_LOSS));
+ EXPECT_THAT(initialize_result_proto.native_initialize_stats()
+ .schema_store_recovery_cause(),
+ Eq(NativeInitializeStats::NONE));
+ EXPECT_THAT(initialize_result_proto.native_initialize_stats()
+ .schema_store_recovery_latency_ms(),
+ Eq(0));
+ }
+}
+
+TEST_F(IcingSearchEngineTest,
+ InitializeShouldLogRecoveryCauseTotalChecksumMismatch) {
+ {
+ // Initialize and index some documents.
+ IcingSearchEngine icing(GetDefaultIcingOptions(), GetTestJniCache());
+ ASSERT_THAT(icing.Initialize().status(), ProtoIsOk());
+ ASSERT_THAT(icing.SetSchema(CreateMessageSchema()).status(), ProtoIsOk());
+
+ // We need to index enough documents to make
+ // DocumentStore::UpdateSchemaStore() run longer than 1 ms.
+ for (int i = 0; i < 50; ++i) {
+ DocumentProto document =
+ DocumentBuilder()
+ .SetKey("icing", "fake_type/" + std::to_string(i))
+ .SetSchema("Message")
+ .AddStringProperty("body", "message body")
+ .Build();
+ ASSERT_THAT(icing.Put(document).status(), ProtoIsOk());
+ }
+ }
+
+ {
+ // Change the header's checksum value to a random value.
+ uint32_t invalid_checksum = 1;
+ filesystem()->PWrite(GetHeaderFilename().c_str(),
+ offsetof(IcingSearchEngine::Header, checksum),
+ &invalid_checksum, sizeof(invalid_checksum));
+ }
+
+ {
+ // Both document store and index should be recovered from checksum mismatch.
+ IcingSearchEngine icing(GetDefaultIcingOptions(), GetTestJniCache());
+ InitializeResultProto initialize_result_proto = icing.Initialize();
+ EXPECT_THAT(initialize_result_proto.status(), ProtoIsOk());
+ EXPECT_THAT(initialize_result_proto.native_initialize_stats()
+ .index_restoration_cause(),
+ Eq(NativeInitializeStats::TOTAL_CHECKSUM_MISMATCH));
+ EXPECT_THAT(initialize_result_proto.native_initialize_stats()
+ .index_restoration_latency_ms(),
+ Gt(0));
+ EXPECT_THAT(initialize_result_proto.native_initialize_stats()
+ .document_store_recovery_cause(),
+ Eq(NativeInitializeStats::TOTAL_CHECKSUM_MISMATCH));
+ EXPECT_THAT(initialize_result_proto.native_initialize_stats()
+ .document_store_recovery_latency_ms(),
+ Gt(0));
+ EXPECT_THAT(initialize_result_proto.native_initialize_stats()
+ .document_store_data_status(),
+ Eq(NativeInitializeStats::NO_DATA_LOSS));
+ EXPECT_THAT(initialize_result_proto.native_initialize_stats()
+ .schema_store_recovery_cause(),
+ Eq(NativeInitializeStats::NONE));
+ EXPECT_THAT(initialize_result_proto.native_initialize_stats()
+ .schema_store_recovery_latency_ms(),
+ Eq(0));
+ }
+}
+
+TEST_F(IcingSearchEngineTest, InitializeShouldLogRecoveryCauseIndexIOError) {
+ {
+ // Initialize and index some documents.
+ IcingSearchEngine icing(GetDefaultIcingOptions(), GetTestJniCache());
+ ASSERT_THAT(icing.Initialize().status(), ProtoIsOk());
+ ASSERT_THAT(icing.SetSchema(CreateMessageSchema()).status(), ProtoIsOk());
+
+ // We need to index enough documents to make RestoreIndexIfNeeded() run
+ // longer than 1 ms.
+ for (int i = 0; i < 50; ++i) {
+ DocumentProto document =
+ DocumentBuilder()
+ .SetKey("icing", "fake_type/" + std::to_string(i))
+ .SetSchema("Message")
+ .AddStringProperty("body", "message body")
+ .Build();
+ ASSERT_THAT(icing.Put(document).status(), ProtoIsOk());
+ }
+ }
+
+ // lambda to fail OpenForWrite on lite index hit buffer once.
+ bool has_failed_already = false;
+ auto open_write_lambda = [this, &has_failed_already](const char* filename) {
+ std::string lite_index_buffer_file_path =
+ absl_ports::StrCat(GetIndexDir(), "/idx/lite.hb");
+ std::string filename_string(filename);
+ if (!has_failed_already && filename_string == lite_index_buffer_file_path) {
+ has_failed_already = true;
+ return -1;
+ }
+ return this->filesystem()->OpenForWrite(filename);
+ };
+
+ auto mock_icing_filesystem = std::make_unique<IcingMockFilesystem>();
+ // This fails Index::Create() once.
+ ON_CALL(*mock_icing_filesystem, OpenForWrite)
+ .WillByDefault(open_write_lambda);
+
+ TestIcingSearchEngine icing(GetDefaultIcingOptions(),
+ std::make_unique<Filesystem>(),
+ std::move(mock_icing_filesystem),
+ std::make_unique<FakeClock>(), GetTestJniCache());
+
+ InitializeResultProto initialize_result_proto = icing.Initialize();
+ EXPECT_THAT(initialize_result_proto.status(), ProtoIsOk());
+ EXPECT_THAT(initialize_result_proto.native_initialize_stats()
+ .index_restoration_cause(),
+ Eq(NativeInitializeStats::IO_ERROR));
+ EXPECT_THAT(initialize_result_proto.native_initialize_stats()
+ .index_restoration_latency_ms(),
+ Gt(0));
+ EXPECT_THAT(initialize_result_proto.native_initialize_stats()
+ .document_store_recovery_cause(),
+ Eq(NativeInitializeStats::NONE));
+ EXPECT_THAT(initialize_result_proto.native_initialize_stats()
+ .document_store_recovery_latency_ms(),
+ Eq(0));
+ EXPECT_THAT(initialize_result_proto.native_initialize_stats()
+ .document_store_data_status(),
+ Eq(NativeInitializeStats::NO_DATA_LOSS));
+ EXPECT_THAT(initialize_result_proto.native_initialize_stats()
+ .schema_store_recovery_cause(),
+ Eq(NativeInitializeStats::NONE));
+ EXPECT_THAT(initialize_result_proto.native_initialize_stats()
+ .schema_store_recovery_latency_ms(),
+ Eq(0));
+}
+
+TEST_F(IcingSearchEngineTest, InitializeShouldLogRecoveryCauseDocStoreIOError) {
+ {
+ // Initialize and index some documents.
+ IcingSearchEngine icing(GetDefaultIcingOptions(), GetTestJniCache());
+ ASSERT_THAT(icing.Initialize().status(), ProtoIsOk());
+ ASSERT_THAT(icing.SetSchema(CreateMessageSchema()).status(), ProtoIsOk());
+
+ // We need to index enough documents to make RestoreIndexIfNeeded() run
+ // longer than 1 ms.
+ for (int i = 0; i < 50; ++i) {
+ DocumentProto document =
+ DocumentBuilder()
+ .SetKey("icing", "fake_type/" + std::to_string(i))
+ .SetSchema("Message")
+ .AddStringProperty("body", "message body")
+ .Build();
+ ASSERT_THAT(icing.Put(document).status(), ProtoIsOk());
+ }
+ }
+
+ // lambda to fail Read on document store header once.
+ bool has_failed_already = false;
+ auto read_lambda = [this, &has_failed_already](const char* filename,
+ void* buf, size_t buf_size) {
+ std::string document_store_header_file_path =
+ absl_ports::StrCat(GetDocumentDir(), "/document_store_header");
+ std::string filename_string(filename);
+ if (!has_failed_already &&
+ filename_string == document_store_header_file_path) {
+ has_failed_already = true;
+ return false;
+ }
+ return this->filesystem()->Read(filename, buf, buf_size);
+ };
+
+ auto mock_filesystem = std::make_unique<MockFilesystem>();
+ // This fails DocumentStore::InitializeDerivedFiles() once.
+ ON_CALL(*mock_filesystem, Read(A<const char*>(), _, _))
+ .WillByDefault(read_lambda);
+
+ TestIcingSearchEngine icing(GetDefaultIcingOptions(),
+ std::move(mock_filesystem),
+ std::make_unique<IcingFilesystem>(),
+ std::make_unique<FakeClock>(), GetTestJniCache());
+
+ InitializeResultProto initialize_result_proto = icing.Initialize();
+ EXPECT_THAT(initialize_result_proto.status(), ProtoIsOk());
+ EXPECT_THAT(initialize_result_proto.native_initialize_stats()
+ .document_store_recovery_cause(),
+ Eq(NativeInitializeStats::IO_ERROR));
+ EXPECT_THAT(initialize_result_proto.native_initialize_stats()
+ .document_store_recovery_latency_ms(),
+ Gt(0));
+ EXPECT_THAT(initialize_result_proto.native_initialize_stats()
+ .document_store_data_status(),
+ Eq(NativeInitializeStats::NO_DATA_LOSS));
+ EXPECT_THAT(initialize_result_proto.native_initialize_stats()
+ .index_restoration_cause(),
+ Eq(NativeInitializeStats::NONE));
+ EXPECT_THAT(initialize_result_proto.native_initialize_stats()
+ .index_restoration_latency_ms(),
+ Eq(0));
+ EXPECT_THAT(initialize_result_proto.native_initialize_stats()
+ .schema_store_recovery_cause(),
+ Eq(NativeInitializeStats::NONE));
+ EXPECT_THAT(initialize_result_proto.native_initialize_stats()
+ .schema_store_recovery_latency_ms(),
+ Eq(0));
+}
+
+TEST_F(IcingSearchEngineTest,
+ InitializeShouldLogRecoveryCauseSchemaStoreIOError) {
+ {
+ IcingSearchEngine icing(GetDefaultIcingOptions(), GetTestJniCache());
+ ASSERT_THAT(icing.Initialize().status(), ProtoIsOk());
+ ASSERT_THAT(icing.SetSchema(CreateMessageSchema()).status(), ProtoIsOk());
+ }
+
+ {
+ // Delete the schema store header file to trigger an I/O error.
+ std::string schema_store_header_file_path =
+ GetSchemaDir() + "/schema_store_header";
+ filesystem()->DeleteFile(schema_store_header_file_path.c_str());
+ }
+
+ {
+ IcingSearchEngine icing(GetDefaultIcingOptions(), GetTestJniCache());
+ InitializeResultProto initialize_result_proto = icing.Initialize();
+ EXPECT_THAT(initialize_result_proto.status(), ProtoIsOk());
+ EXPECT_THAT(initialize_result_proto.native_initialize_stats()
+ .schema_store_recovery_cause(),
+ Eq(NativeInitializeStats::IO_ERROR));
+ EXPECT_THAT(initialize_result_proto.native_initialize_stats()
+ .schema_store_recovery_latency_ms(),
+ Gt(0));
+ EXPECT_THAT(initialize_result_proto.native_initialize_stats()
+ .document_store_recovery_cause(),
+ Eq(NativeInitializeStats::NONE));
+ EXPECT_THAT(initialize_result_proto.native_initialize_stats()
+ .document_store_recovery_latency_ms(),
+ Eq(0));
+ EXPECT_THAT(initialize_result_proto.native_initialize_stats()
+ .document_store_data_status(),
+ Eq(NativeInitializeStats::NO_DATA_LOSS));
+ EXPECT_THAT(initialize_result_proto.native_initialize_stats()
+ .index_restoration_cause(),
+ Eq(NativeInitializeStats::NONE));
+ EXPECT_THAT(initialize_result_proto.native_initialize_stats()
+ .index_restoration_latency_ms(),
+ Eq(0));
+ }
+}
+
+TEST_F(IcingSearchEngineTest, InitializeShouldLogNumberOfSchemaTypes) {
+ {
+ // Initialize an empty storage.
+ IcingSearchEngine icing(GetDefaultIcingOptions(), GetTestJniCache());
+ InitializeResultProto initialize_result_proto = icing.Initialize();
+ EXPECT_THAT(initialize_result_proto.status(), ProtoIsOk());
+ // There should be 0 schema types.
+ EXPECT_THAT(
+ initialize_result_proto.native_initialize_stats().num_schema_types(),
+ Eq(0));
+
+ // Set a schema with one type config.
+ ASSERT_THAT(icing.SetSchema(CreateMessageSchema()).status(), ProtoIsOk());
+ }
+
+ {
+ IcingSearchEngine icing(GetDefaultIcingOptions(), GetTestJniCache());
+ InitializeResultProto initialize_result_proto = icing.Initialize();
+ EXPECT_THAT(initialize_result_proto.status(), ProtoIsOk());
+ // There should be 1 schema type.
+ EXPECT_THAT(
+ initialize_result_proto.native_initialize_stats().num_schema_types(),
+ Eq(1));
+
+ // Create and set a schema with two type configs: Email and Message.
+ SchemaProto schema = CreateEmailSchema();
+
+ auto type = schema.add_types();
+ type->set_schema_type("Message");
+ auto body = type->add_properties();
+ body->set_property_name("body");
+ body->set_data_type(PropertyConfigProto::DataType::STRING);
+ body->set_cardinality(PropertyConfigProto::Cardinality::REQUIRED);
+ body->mutable_string_indexing_config()->set_term_match_type(
+ TermMatchType::PREFIX);
+ body->mutable_string_indexing_config()->set_tokenizer_type(
+ StringIndexingConfig::TokenizerType::PLAIN);
+
+ ASSERT_THAT(icing.SetSchema(schema).status(), ProtoIsOk());
+ }
+
+ {
+ IcingSearchEngine icing(GetDefaultIcingOptions(), GetTestJniCache());
+ InitializeResultProto initialize_result_proto = icing.Initialize();
+ EXPECT_THAT(initialize_result_proto.status(), ProtoIsOk());
+ EXPECT_THAT(
+ initialize_result_proto.native_initialize_stats().num_schema_types(),
+ Eq(2));
+ }
+}
+
+TEST_F(IcingSearchEngineTest, PutDocumentShouldLogFunctionLatency) {
+ DocumentProto document = DocumentBuilder()
+ .SetKey("icing", "fake_type/0")
+ .SetSchema("Message")
+ .AddStringProperty("body", "message body")
+ .Build();
+
+ IcingSearchEngine icing(GetDefaultIcingOptions(), GetTestJniCache());
+ ASSERT_THAT(icing.Initialize().status(), ProtoIsOk());
+ ASSERT_THAT(icing.SetSchema(CreateMessageSchema()).status(), ProtoIsOk());
+
+ PutResultProto put_result_proto = icing.Put(document);
+ EXPECT_THAT(put_result_proto.status(), ProtoIsOk());
+ EXPECT_THAT(put_result_proto.native_put_document_stats().latency_ms(), Gt(0));
+}
+
+TEST_F(IcingSearchEngineTest, PutDocumentShouldLogDocumentStoreStats) {
+ // Create a large enough document so that document_store_latency_ms can be
+ // longer than 1 ms.
+ std::default_random_engine random;
+ std::string random_string_10000 =
+ RandomString(kAlNumAlphabet, /*len=*/10000, &random);
+ DocumentProto document = DocumentBuilder()
+ .SetKey("icing", "fake_type/0")
+ .SetSchema("Message")
+ .AddStringProperty("body", random_string_10000)
+ .Build();
+
+ IcingSearchEngine icing(GetDefaultIcingOptions(), GetTestJniCache());
+ ASSERT_THAT(icing.Initialize().status(), ProtoIsOk());
+ ASSERT_THAT(icing.SetSchema(CreateMessageSchema()).status(), ProtoIsOk());
+
+ PutResultProto put_result_proto = icing.Put(document);
+ EXPECT_THAT(put_result_proto.status(), ProtoIsOk());
+ EXPECT_THAT(
+ put_result_proto.native_put_document_stats().document_store_latency_ms(),
+ Gt(0));
+ EXPECT_THAT(put_result_proto.native_put_document_stats().document_size(),
+ Eq(document.ByteSizeLong()));
+}
+
+TEST_F(IcingSearchEngineTest, PutDocumentShouldLogIndexingStats) {
+ // Create a large enough document so that index_latency_ms can be longer than
+ // 1 ms.
+ DocumentProto document = DocumentBuilder()
+ .SetKey("icing", "fake_type/0")
+ .SetSchema("Message")
+ .AddStringProperty("body", kIpsumText)
+ .Build();
+
+ IcingSearchEngine icing(GetDefaultIcingOptions(), GetTestJniCache());
+ ASSERT_THAT(icing.Initialize().status(), ProtoIsOk());
+ ASSERT_THAT(icing.SetSchema(CreateMessageSchema()).status(), ProtoIsOk());
+
+ PutResultProto put_result_proto = icing.Put(document);
+ EXPECT_THAT(put_result_proto.status(), ProtoIsOk());
+ EXPECT_THAT(put_result_proto.native_put_document_stats().index_latency_ms(),
+ Gt(0));
+ // No merge should happen.
+ EXPECT_THAT(
+ put_result_proto.native_put_document_stats().index_merge_latency_ms(),
+ Eq(0));
+ // Number of tokens should not exceed.
+ EXPECT_FALSE(put_result_proto.native_put_document_stats()
+ .tokenization_stats()
+ .exceeded_max_token_num());
+ // kIpsumText has 137 tokens.
+ EXPECT_THAT(put_result_proto.native_put_document_stats()
+ .tokenization_stats()
+ .num_tokens_indexed(),
+ Eq(137));
+}
+
+TEST_F(IcingSearchEngineTest, PutDocumentShouldLogWhetherNumTokensExceeds) {
+ // Create a document with 2 tokens.
+ DocumentProto document = DocumentBuilder()
+ .SetKey("icing", "fake_type/0")
+ .SetSchema("Message")
+ .AddStringProperty("body", "message body")
+ .Build();
+
+ // Create an icing instance with max_tokens_per_doc = 1.
+ IcingSearchEngineOptions icing_options = GetDefaultIcingOptions();
+ icing_options.set_max_tokens_per_doc(1);
+ IcingSearchEngine icing(icing_options, GetTestJniCache());
+ ASSERT_THAT(icing.Initialize().status(), ProtoIsOk());
+ ASSERT_THAT(icing.SetSchema(CreateMessageSchema()).status(), ProtoIsOk());
+
+ PutResultProto put_result_proto = icing.Put(document);
+ EXPECT_THAT(put_result_proto.status(), ProtoIsOk());
+ // Number of tokens(2) exceeds the max allowed value(1).
+ EXPECT_TRUE(put_result_proto.native_put_document_stats()
+ .tokenization_stats()
+ .exceeded_max_token_num());
+ EXPECT_THAT(put_result_proto.native_put_document_stats()
+ .tokenization_stats()
+ .num_tokens_indexed(),
+ Eq(1));
+}
+
+TEST_F(IcingSearchEngineTest, PutDocumentShouldLogIndexMergeLatency) {
+ // Create 2 large enough documents so that index_merge_latency_ms can be
+ // longer than 1 ms.
+ DocumentProto document1 = DocumentBuilder()
+ .SetKey("icing", "fake_type/1")
+ .SetSchema("Message")
+ .AddStringProperty("body", kIpsumText)
+ .Build();
+ DocumentProto document2 = DocumentBuilder()
+ .SetKey("icing", "fake_type/2")
+ .SetSchema("Message")
+ .AddStringProperty("body", kIpsumText)
+ .Build();
+
+ // Create an icing instance with index_merge_size = document1's size.
+ IcingSearchEngineOptions icing_options = GetDefaultIcingOptions();
+ icing_options.set_index_merge_size(document1.ByteSizeLong());
+ IcingSearchEngine icing(icing_options, GetTestJniCache());
+ ASSERT_THAT(icing.Initialize().status(), ProtoIsOk());
+ ASSERT_THAT(icing.SetSchema(CreateMessageSchema()).status(), ProtoIsOk());
+ EXPECT_THAT(icing.Put(document1).status(), ProtoIsOk());
+
+ // Putting document2 should trigger an index merge.
+ PutResultProto put_result_proto = icing.Put(document2);
+ EXPECT_THAT(put_result_proto.status(), ProtoIsOk());
+ EXPECT_THAT(
+ put_result_proto.native_put_document_stats().index_merge_latency_ms(),
+ Gt(0));
+}
+
} // namespace
} // namespace lib
} // namespace icing
diff --git a/icing/index/index-processor.cc b/icing/index/index-processor.cc
index 47111ad..9e57993 100644
--- a/icing/index/index-processor.cc
+++ b/icing/index/index-processor.cc
@@ -37,6 +37,7 @@
#include "icing/tokenization/tokenizer.h"
#include "icing/transform/normalizer.h"
#include "icing/util/status-macros.h"
+#include "icing/util/timer.h"
namespace icing {
namespace lib {
@@ -56,7 +57,10 @@ IndexProcessor::Create(const SchemaStore* schema_store,
}
libtextclassifier3::Status IndexProcessor::IndexDocument(
- const DocumentProto& document, DocumentId document_id) {
+ const DocumentProto& document, DocumentId document_id,
+ NativePutDocumentStats* put_document_stats) {
+ Timer index_timer;
+
if (index_->last_added_document_id() != kInvalidDocumentId &&
document_id <= index_->last_added_document_id()) {
return absl_ports::InvalidArgumentError(IcingStringUtil::StringPrintf(
@@ -80,6 +84,12 @@ libtextclassifier3::Status IndexProcessor::IndexDocument(
tokenizer->Tokenize(subcontent));
while (itr->Advance()) {
if (++num_tokens > options_.max_tokens_per_document) {
+ if (put_document_stats != nullptr) {
+ put_document_stats->mutable_tokenization_stats()
+ ->set_exceeded_max_token_num(true);
+ put_document_stats->mutable_tokenization_stats()
+ ->set_num_tokens_indexed(options_.max_tokens_per_document);
+ }
switch (options_.token_limit_behavior) {
case Options::TokenLimitBehavior::kReturnError:
return absl_ports::ResourceExhaustedError(
@@ -106,10 +116,20 @@ libtextclassifier3::Status IndexProcessor::IndexDocument(
}
}
+ if (put_document_stats != nullptr) {
+ put_document_stats->set_index_latency_ms(
+ index_timer.GetElapsedMilliseconds());
+ put_document_stats->mutable_tokenization_stats()->set_num_tokens_indexed(
+ num_tokens);
+ }
+
// Merge if necessary.
if (overall_status.ok() && index_->WantsMerge()) {
ICING_VLOG(1) << "Merging the index at docid " << document_id << ".";
+
+ Timer merge_timer;
libtextclassifier3::Status merge_status = index_->Merge();
+
if (!merge_status.ok()) {
ICING_LOG(ERROR) << "Index merging failed. Clearing index.";
if (!index_->Reset().ok()) {
@@ -123,6 +143,11 @@ libtextclassifier3::Status IndexProcessor::IndexDocument(
merge_status.error_code(), merge_status.error_message().c_str()));
}
}
+
+ if (put_document_stats != nullptr) {
+ put_document_stats->set_index_merge_latency_ms(
+ merge_timer.GetElapsedMilliseconds());
+ }
}
return overall_status;
diff --git a/icing/index/index-processor.h b/icing/index/index-processor.h
index 083efea..91719d0 100644
--- a/icing/index/index-processor.h
+++ b/icing/index/index-processor.h
@@ -69,6 +69,9 @@ class IndexProcessor {
// Indexing a document *may* trigger an index merge. If a merge fails, then
// all content in the index will be lost.
//
+ // If put_document_stats is present, the fields related to indexing will be
+ // populated.
+ //
// Returns:
// INVALID_ARGUMENT if document_id is less than the document_id of a
// previously indexed document or tokenization fails.
@@ -77,8 +80,9 @@ class IndexProcessor {
// cleared as a result.
// NOT_FOUND if there is no definition for the document's schema type.
// INTERNAL_ERROR if any other errors occur
- libtextclassifier3::Status IndexDocument(const DocumentProto& document,
- DocumentId document_id);
+ libtextclassifier3::Status IndexDocument(
+ const DocumentProto& document, DocumentId document_id,
+ NativePutDocumentStats* put_document_stats = nullptr);
private:
IndexProcessor(const SchemaStore* schema_store,
diff --git a/icing/index/index-processor_benchmark.cc b/icing/index/index-processor_benchmark.cc
index a9b298e..584cb9b 100644
--- a/icing/index/index-processor_benchmark.cc
+++ b/icing/index/index-processor_benchmark.cc
@@ -76,10 +76,10 @@ void CreateFakeTypeConfig(SchemaTypeConfigProto* type_config) {
IcingStringUtil::StringPrintf("p%d", i)); // p0 - p9
property->set_data_type(PropertyConfigProto::DataType::STRING);
property->set_cardinality(PropertyConfigProto::Cardinality::REQUIRED);
- property->mutable_indexing_config()->set_term_match_type(
+ property->mutable_string_indexing_config()->set_term_match_type(
TermMatchType::EXACT_ONLY);
- property->mutable_indexing_config()->set_tokenizer_type(
- IndexingConfig::TokenizerType::PLAIN);
+ property->mutable_string_indexing_config()->set_tokenizer_type(
+ StringIndexingConfig::TokenizerType::PLAIN);
}
}
diff --git a/icing/index/index-processor_test.cc b/icing/index/index-processor_test.cc
index 84c822b..e193842 100644
--- a/icing/index/index-processor_test.cc
+++ b/icing/index/index-processor_test.cc
@@ -166,9 +166,10 @@ class IndexProcessorTest : public Test {
prop->set_property_name(std::string(name));
prop->set_data_type(type);
prop->set_cardinality(cardinality);
- prop->mutable_indexing_config()->set_term_match_type(term_match_type);
- prop->mutable_indexing_config()->set_tokenizer_type(
- IndexingConfig::TokenizerType::PLAIN);
+ prop->mutable_string_indexing_config()->set_term_match_type(
+ term_match_type);
+ prop->mutable_string_indexing_config()->set_tokenizer_type(
+ StringIndexingConfig::TokenizerType::PLAIN);
}
static void AddNonIndexedProperty(std::string_view name, DataType::Code type,
@@ -210,6 +211,7 @@ class IndexProcessorTest : public Test {
prop->set_data_type(DataType::DOCUMENT);
prop->set_cardinality(Cardinality::OPTIONAL);
prop->set_schema_type(std::string(kNestedType));
+ prop->mutable_document_indexing_config()->set_index_nested_properties(true);
// Add nested type
type_config = schema.add_types();
diff --git a/icing/index/iterator/doc-hit-info-iterator-section-restrict_test.cc b/icing/index/iterator/doc-hit-info-iterator-section-restrict_test.cc
index df79c6d..b29217c 100644
--- a/icing/index/iterator/doc-hit-info-iterator-section-restrict_test.cc
+++ b/icing/index/iterator/doc-hit-info-iterator-section-restrict_test.cc
@@ -64,10 +64,10 @@ class DocHitInfoIteratorSectionRestrictTest : public ::testing::Test {
property->set_property_name(indexed_property_);
property->set_data_type(PropertyConfigProto::DataType::STRING);
property->set_cardinality(PropertyConfigProto::Cardinality::OPTIONAL);
- property->mutable_indexing_config()->set_term_match_type(
+ property->mutable_string_indexing_config()->set_term_match_type(
TermMatchType::EXACT_ONLY);
- property->mutable_indexing_config()->set_tokenizer_type(
- IndexingConfig::TokenizerType::PLAIN);
+ property->mutable_string_indexing_config()->set_tokenizer_type(
+ StringIndexingConfig::TokenizerType::PLAIN);
// First and only indexed property, so it gets the first id of 0
indexed_section_id_ = 0;
diff --git a/icing/index/main/doc-hit-info-iterator-term-main.cc b/icing/index/main/doc-hit-info-iterator-term-main.cc
index b3696b7..a60764d 100644
--- a/icing/index/main/doc-hit-info-iterator-term-main.cc
+++ b/icing/index/main/doc-hit-info-iterator-term-main.cc
@@ -54,8 +54,12 @@ libtextclassifier3::Status DocHitInfoIteratorTermMain::Advance() {
// next posting list in the chain.
libtextclassifier3::Status status = RetrieveMoreHits();
if (!status.ok()) {
- ICING_LOG(ERROR) << "Failed to retrieve more hits "
- << status.error_message();
+ if (!absl_ports::IsNotFound(status)) {
+ // NOT_FOUND is expected to happen (not every term will be in the main
+ // index!). Other errors are worth logging.
+ ICING_LOG(ERROR) << "Failed to retrieve more hits "
+ << status.error_message();
+ }
return absl_ports::ResourceExhaustedError(
"No more DocHitInfos in iterator");
}
diff --git a/icing/query/query-processor_test.cc b/icing/query/query-processor_test.cc
index 2edc624..16bd120 100644
--- a/icing/query/query-processor_test.cc
+++ b/icing/query/query-processor_test.cc
@@ -70,10 +70,10 @@ void AddIndexedProperty(SchemaTypeConfigProto* type_config, std::string name) {
property_config->set_property_name(name);
property_config->set_data_type(PropertyConfigProto::DataType::STRING);
property_config->set_cardinality(PropertyConfigProto::Cardinality::OPTIONAL);
- property_config->mutable_indexing_config()->set_term_match_type(
+ property_config->mutable_string_indexing_config()->set_term_match_type(
TermMatchType::EXACT_ONLY);
- property_config->mutable_indexing_config()->set_tokenizer_type(
- IndexingConfig::TokenizerType::PLAIN);
+ property_config->mutable_string_indexing_config()->set_tokenizer_type(
+ StringIndexingConfig::TokenizerType::PLAIN);
}
void AddUnindexedProperty(SchemaTypeConfigProto* type_config,
diff --git a/icing/result/result-retriever_test.cc b/icing/result/result-retriever_test.cc
index 0d2c2c5..1d1f824 100644
--- a/icing/result/result-retriever_test.cc
+++ b/icing/result/result-retriever_test.cc
@@ -77,18 +77,18 @@ class ResultRetrieverTest : public testing::Test {
prop_config->set_property_name("subject");
prop_config->set_data_type(PropertyConfigProto::DataType::STRING);
prop_config->set_cardinality(PropertyConfigProto::Cardinality::OPTIONAL);
- prop_config->mutable_indexing_config()->set_term_match_type(
+ prop_config->mutable_string_indexing_config()->set_term_match_type(
TermMatchType::PREFIX);
- prop_config->mutable_indexing_config()->set_tokenizer_type(
- IndexingConfig::TokenizerType::PLAIN);
+ prop_config->mutable_string_indexing_config()->set_tokenizer_type(
+ StringIndexingConfig::TokenizerType::PLAIN);
prop_config = type_config->add_properties();
prop_config->set_property_name("body");
prop_config->set_data_type(PropertyConfigProto::DataType::STRING);
prop_config->set_cardinality(PropertyConfigProto::Cardinality::OPTIONAL);
- prop_config->mutable_indexing_config()->set_term_match_type(
+ prop_config->mutable_string_indexing_config()->set_term_match_type(
TermMatchType::EXACT_ONLY);
- prop_config->mutable_indexing_config()->set_tokenizer_type(
- IndexingConfig::TokenizerType::PLAIN);
+ prop_config->mutable_string_indexing_config()->set_tokenizer_type(
+ StringIndexingConfig::TokenizerType::PLAIN);
ASSERT_THAT(schema_store_->SetSchema(schema), IsOk());
}
diff --git a/icing/result/snippet-retriever_test.cc b/icing/result/snippet-retriever_test.cc
index 676ea92..e552cf2 100644
--- a/icing/result/snippet-retriever_test.cc
+++ b/icing/result/snippet-retriever_test.cc
@@ -76,18 +76,18 @@ class SnippetRetrieverTest : public testing::Test {
prop_config->set_property_name("subject");
prop_config->set_data_type(PropertyConfigProto::DataType::STRING);
prop_config->set_cardinality(PropertyConfigProto::Cardinality::OPTIONAL);
- prop_config->mutable_indexing_config()->set_term_match_type(
+ prop_config->mutable_string_indexing_config()->set_term_match_type(
TermMatchType::PREFIX);
- prop_config->mutable_indexing_config()->set_tokenizer_type(
- IndexingConfig::TokenizerType::PLAIN);
+ prop_config->mutable_string_indexing_config()->set_tokenizer_type(
+ StringIndexingConfig::TokenizerType::PLAIN);
prop_config = type_config->add_properties();
prop_config->set_property_name("body");
prop_config->set_data_type(PropertyConfigProto::DataType::STRING);
prop_config->set_cardinality(PropertyConfigProto::Cardinality::OPTIONAL);
- prop_config->mutable_indexing_config()->set_term_match_type(
+ prop_config->mutable_string_indexing_config()->set_term_match_type(
TermMatchType::EXACT_ONLY);
- prop_config->mutable_indexing_config()->set_tokenizer_type(
- IndexingConfig::TokenizerType::PLAIN);
+ prop_config->mutable_string_indexing_config()->set_tokenizer_type(
+ StringIndexingConfig::TokenizerType::PLAIN);
ICING_ASSERT_OK(schema_store_->SetSchema(schema));
ICING_ASSERT_OK_AND_ASSIGN(normalizer_, normalizer_factory::Create(
diff --git a/icing/schema/schema-store.cc b/icing/schema/schema-store.cc
index 34ccf22..9173031 100644
--- a/icing/schema/schema-store.cc
+++ b/icing/schema/schema-store.cc
@@ -39,6 +39,7 @@
#include "icing/util/crc32.h"
#include "icing/util/logging.h"
#include "icing/util/status-macros.h"
+#include "icing/util/timer.h"
namespace icing {
namespace lib {
@@ -103,12 +104,13 @@ std::unordered_set<SchemaTypeId> SchemaTypeIdsChanged(
} // namespace
libtextclassifier3::StatusOr<std::unique_ptr<SchemaStore>> SchemaStore::Create(
- const Filesystem* filesystem, const std::string& base_dir) {
+ const Filesystem* filesystem, const std::string& base_dir,
+ NativeInitializeStats* initialize_stats) {
ICING_RETURN_ERROR_IF_NULL(filesystem);
std::unique_ptr<SchemaStore> schema_store =
std::unique_ptr<SchemaStore>(new SchemaStore(filesystem, base_dir));
- ICING_RETURN_IF_ERROR(schema_store->Initialize());
+ ICING_RETURN_IF_ERROR(schema_store->Initialize(initialize_stats));
return schema_store;
}
@@ -125,7 +127,8 @@ SchemaStore::~SchemaStore() {
}
}
-libtextclassifier3::Status SchemaStore::Initialize() {
+libtextclassifier3::Status SchemaStore::Initialize(
+ NativeInitializeStats* initialize_stats) {
auto schema_proto_or = GetSchema();
if (absl_ports::IsNotFound(schema_proto_or.status())) {
// Don't have an existing schema proto, that's fine
@@ -139,10 +142,22 @@ libtextclassifier3::Status SchemaStore::Initialize() {
ICING_VLOG(3)
<< "Couldn't find derived files or failed to initialize them, "
"regenerating derived files for SchemaStore.";
+ Timer regenerate_timer;
+ if (initialize_stats != nullptr) {
+ initialize_stats->set_schema_store_recovery_cause(
+ NativeInitializeStats::IO_ERROR);
+ }
ICING_RETURN_IF_ERROR(RegenerateDerivedFiles());
+ if (initialize_stats != nullptr) {
+ initialize_stats->set_schema_store_recovery_latency_ms(
+ regenerate_timer.GetElapsedMilliseconds());
+ }
}
initialized_ = true;
+ if (initialize_stats != nullptr) {
+ initialize_stats->set_num_schema_types(type_config_map_.size());
+ }
return libtextclassifier3::Status::OK;
}
diff --git a/icing/schema/schema-store.h b/icing/schema/schema-store.h
index f5c6588..76f36b4 100644
--- a/icing/schema/schema-store.h
+++ b/icing/schema/schema-store.h
@@ -27,6 +27,7 @@
#include "icing/file/file-backed-proto.h"
#include "icing/file/filesystem.h"
#include "icing/proto/document.pb.h"
+#include "icing/proto/logging.pb.h"
#include "icing/proto/schema.pb.h"
#include "icing/schema/schema-util.h"
#include "icing/schema/section-manager.h"
@@ -104,12 +105,16 @@ class SchemaStore {
// outlive the created SchemaStore instance. The base_dir must already exist.
// There does not need to be an existing schema already.
//
+ // If initialize_stats is present, the fields related to SchemaStore will be
+ // populated.
+ //
// Returns:
// A SchemaStore on success
// FAILED_PRECONDITION on any null pointer input
// INTERNAL_ERROR on any IO errors
static libtextclassifier3::StatusOr<std::unique_ptr<SchemaStore>> Create(
- const Filesystem* filesystem, const std::string& base_dir);
+ const Filesystem* filesystem, const std::string& base_dir,
+ NativeInitializeStats* initialize_stats = nullptr);
// Not copyable
SchemaStore(const SchemaStore&) = delete;
@@ -229,7 +234,8 @@ class SchemaStore {
// Returns:
// OK on success
// INTERNAL_ERROR on IO error
- libtextclassifier3::Status Initialize();
+ libtextclassifier3::Status Initialize(
+ NativeInitializeStats* initialize_stats);
// Creates sub-components and verifies the integrity of each sub-component.
//
diff --git a/icing/schema/schema-store_test.cc b/icing/schema/schema-store_test.cc
index 957fd89..4a458b2 100644
--- a/icing/schema/schema-store_test.cc
+++ b/icing/schema/schema-store_test.cc
@@ -56,10 +56,10 @@ class SchemaStoreTest : public ::testing::Test {
property->set_property_name("subject");
property->set_data_type(PropertyConfigProto::DataType::STRING);
property->set_cardinality(PropertyConfigProto::Cardinality::OPTIONAL);
- property->mutable_indexing_config()->set_term_match_type(
+ property->mutable_string_indexing_config()->set_term_match_type(
TermMatchType::EXACT_ONLY);
- property->mutable_indexing_config()->set_tokenizer_type(
- IndexingConfig::TokenizerType::PLAIN);
+ property->mutable_string_indexing_config()->set_tokenizer_type(
+ StringIndexingConfig::TokenizerType::PLAIN);
}
void TearDown() override {
@@ -444,10 +444,10 @@ TEST_F(SchemaStoreTest, SetSchemaThatRequiresReindexingOk) {
// Make a previously unindexed property indexed
property = schema.mutable_types(0)->mutable_properties(0);
- property->mutable_indexing_config()->set_term_match_type(
+ property->mutable_string_indexing_config()->set_term_match_type(
TermMatchType::EXACT_ONLY);
- property->mutable_indexing_config()->set_tokenizer_type(
- IndexingConfig::TokenizerType::PLAIN);
+ property->mutable_string_indexing_config()->set_tokenizer_type(
+ StringIndexingConfig::TokenizerType::PLAIN);
// With a new indexed property, we'll need to reindex
result.index_incompatible = true;
diff --git a/icing/schema/schema-util.cc b/icing/schema/schema-util.cc
index 12f7c4c..a755e88 100644
--- a/icing/schema/schema-util.cc
+++ b/icing/schema/schema-util.cc
@@ -37,10 +37,6 @@ namespace lib {
namespace {
-// Data types that can be indexed. This follows rule 11 of SchemaUtil::Validate
-static std::unordered_set<PropertyConfigProto::DataType::Code>
- kIndexableDataTypes = {PropertyConfigProto::DataType::STRING};
-
bool IsCardinalityCompatible(const PropertyConfigProto& old_property,
const PropertyConfigProto& new_property) {
if (old_property.cardinality() < new_property.cardinality()) {
@@ -91,8 +87,8 @@ bool IsPropertyCompatible(const PropertyConfigProto& old_property,
IsCardinalityCompatible(old_property, new_property);
}
-bool IsTermMatchTypeCompatible(const IndexingConfig& old_indexed,
- const IndexingConfig& new_indexed) {
+bool IsTermMatchTypeCompatible(const StringIndexingConfig& old_indexed,
+ const StringIndexingConfig& new_indexed) {
return old_indexed.term_match_type() == new_indexed.term_match_type() &&
old_indexed.tokenizer_type() == new_indexed.tokenizer_type();
}
@@ -162,9 +158,11 @@ libtextclassifier3::Status SchemaUtil::Validate(const SchemaProto& schema) {
ICING_RETURN_IF_ERROR(ValidateCardinality(property_config.cardinality(),
schema_type, property_name));
- ICING_RETURN_IF_ERROR(
- ValidateIndexingConfig(property_config.indexing_config(), data_type,
- schema_type, property_name));
+ if (data_type == PropertyConfigProto::DataType::STRING) {
+ ICING_RETURN_IF_ERROR(ValidateStringIndexingConfig(
+ property_config.string_indexing_config(), data_type, schema_type,
+ property_name));
+ }
}
}
@@ -239,33 +237,26 @@ libtextclassifier3::Status SchemaUtil::ValidateCardinality(
return libtextclassifier3::Status::OK;
}
-libtextclassifier3::Status SchemaUtil::ValidateIndexingConfig(
- const IndexingConfig& config, PropertyConfigProto::DataType::Code data_type,
- std::string_view schema_type, std::string_view property_name) {
+libtextclassifier3::Status SchemaUtil::ValidateStringIndexingConfig(
+ const StringIndexingConfig& config,
+ PropertyConfigProto::DataType::Code data_type, std::string_view schema_type,
+ std::string_view property_name) {
if (config.term_match_type() == TermMatchType::UNKNOWN &&
- config.tokenizer_type() != IndexingConfig::TokenizerType::NONE) {
+ config.tokenizer_type() != StringIndexingConfig::TokenizerType::NONE) {
// They set a tokenizer type, but no term match type.
return absl_ports::InvalidArgumentError(absl_ports::StrCat(
- "Indexed property '", schema_type, ".", property_name,
+ "Indexed string property '", schema_type, ".", property_name,
"' cannot have a term match type UNKNOWN"));
}
if (config.term_match_type() != TermMatchType::UNKNOWN &&
- config.tokenizer_type() == IndexingConfig::TokenizerType::NONE) {
+ config.tokenizer_type() == StringIndexingConfig::TokenizerType::NONE) {
// They set a term match type, but no tokenizer type
return absl_ports::InvalidArgumentError(
- absl_ports::StrCat("Indexed property '", property_name,
+ absl_ports::StrCat("Indexed string property '", property_name,
"' cannot have a tokenizer type of NONE"));
}
- if (config.term_match_type() != TermMatchType::UNKNOWN &&
- kIndexableDataTypes.find(data_type) == kIndexableDataTypes.end()) {
- // They want this section indexed, but it's not an indexable data type.
- return absl_ports::InvalidArgumentError(absl_ports::StrCat(
- "Cannot index non-string data type for schema property '", schema_type,
- ".", property_name, "'"));
- }
-
return libtextclassifier3::Status::OK;
}
@@ -293,7 +284,7 @@ SchemaUtil::ParsedPropertyConfigs SchemaUtil::ParsePropertyConfigs(
// A non-default term_match_type indicates that this property is meant to be
// indexed.
- if (property_config.indexing_config().term_match_type() !=
+ if (property_config.string_indexing_config().term_match_type() !=
TermMatchType::UNKNOWN) {
parsed_property_configs.num_indexed_properties++;
}
@@ -368,14 +359,15 @@ const SchemaUtil::SchemaDelta SchemaUtil::ComputeCompatibilityDelta(
// A non-default term_match_type indicates that this property is meant to
// be indexed.
- if (old_property_config.indexing_config().term_match_type() !=
+ if (old_property_config.string_indexing_config().term_match_type() !=
TermMatchType::UNKNOWN) {
++old_indexed_properties;
}
// Any change in the indexed property requires a reindexing
- if (!IsTermMatchTypeCompatible(old_property_config.indexing_config(),
- new_property_config->indexing_config())) {
+ if (!IsTermMatchTypeCompatible(
+ old_property_config.string_indexing_config(),
+ new_property_config->string_indexing_config())) {
schema_delta.index_incompatible = true;
}
}
diff --git a/icing/schema/schema-util.h b/icing/schema/schema-util.h
index dfa3aa2..ccb2eea 100644
--- a/icing/schema/schema-util.h
+++ b/icing/schema/schema-util.h
@@ -80,7 +80,7 @@ class SchemaUtil {
// 9. PropertyConfigProtos.schema_type's must correspond to a
// SchemaTypeConfigProto.schema_type
// 10. Property names can only be alphanumeric.
- // 11. Only STRING data types are indexed
+ // 11. Any STRING data types have a valid string_indexing_config
//
// Returns:
// ALREADY_EXISTS for case 1 and 2
@@ -137,6 +137,20 @@ class SchemaUtil {
static const SchemaDelta ComputeCompatibilityDelta(
const SchemaProto& old_schema, const SchemaProto& new_schema);
+ // Validates the 'property_name' field.
+ // 1. Can't be an empty string
+ // 2. Can only contain alphanumeric characters
+ //
+ // NOTE: schema_type is only used for logging. It is not necessary to populate
+ // it.
+ //
+ // RETURNS:
+ // - OK if property_name is valid
+ // - INVALID_ARGUMENT if property name is empty or contains a
+ // non-alphanumeric character.
+ static libtextclassifier3::Status ValidatePropertyName(
+ std::string_view property_name, std::string_view schema_type = "");
+
private:
// Validates the 'schema_type' field
//
@@ -146,16 +160,6 @@ class SchemaUtil {
static libtextclassifier3::Status ValidateSchemaType(
std::string_view schema_type);
- // Validates the 'property_name' field.
- // 1. Can't be an empty string
- // 2. Can only contain alphanumeric characters
- //
- // Returns:
- // INVALID_ARGUMENT if any of the rules are not followed
- // OK on success
- static libtextclassifier3::Status ValidatePropertyName(
- std::string_view property_name, std::string_view schema_type);
-
// Validates the 'data_type' field.
//
// Returns:
@@ -174,15 +178,15 @@ class SchemaUtil {
PropertyConfigProto::Cardinality::Code cardinality,
std::string_view schema_type, std::string_view property_name);
- // Checks that the 'indexing_config' satisfies the following rules:
+ // Checks that the 'string_indexing_config' satisfies the following rules:
// 1. Only STRING data types can be indexed
// 2. An indexed property must have a valid tokenizer type
//
// Returns:
// INVALID_ARGUMENT if any of the rules are not followed
// OK on success
- static libtextclassifier3::Status ValidateIndexingConfig(
- const IndexingConfig& config,
+ static libtextclassifier3::Status ValidateStringIndexingConfig(
+ const StringIndexingConfig& config,
PropertyConfigProto::DataType::Code data_type,
std::string_view schema_type, std::string_view property_name);
};
diff --git a/icing/schema/schema-util_test.cc b/icing/schema/schema-util_test.cc
index 6012989..ed3bde7 100644
--- a/icing/schema/schema-util_test.cc
+++ b/icing/schema/schema-util_test.cc
@@ -486,17 +486,17 @@ TEST_F(SchemaUtilTest, ChangingIndexedPropertiesMakesIndexIncompatible) {
schema_delta.index_incompatible = true;
// New schema gained a new indexed property.
- old_property->mutable_indexing_config()->set_term_match_type(
+ old_property->mutable_string_indexing_config()->set_term_match_type(
TermMatchType::UNKNOWN);
- new_property->mutable_indexing_config()->set_term_match_type(
+ new_property->mutable_string_indexing_config()->set_term_match_type(
TermMatchType::EXACT_ONLY);
EXPECT_THAT(SchemaUtil::ComputeCompatibilityDelta(old_schema, new_schema),
Eq(schema_delta));
// New schema lost an indexed property.
- old_property->mutable_indexing_config()->set_term_match_type(
+ old_property->mutable_string_indexing_config()->set_term_match_type(
TermMatchType::EXACT_ONLY);
- new_property->mutable_indexing_config()->set_term_match_type(
+ new_property->mutable_string_indexing_config()->set_term_match_type(
TermMatchType::UNKNOWN);
EXPECT_THAT(SchemaUtil::ComputeCompatibilityDelta(old_schema, new_schema),
Eq(schema_delta));
@@ -527,7 +527,7 @@ TEST_F(SchemaUtilTest, AddingNewIndexedPropertyMakesIndexIncompatible) {
new_property->set_property_name("NewIndexedProperty");
new_property->set_data_type(PropertyConfigProto::DataType::STRING);
new_property->set_cardinality(PropertyConfigProto::Cardinality::OPTIONAL);
- new_property->mutable_indexing_config()->set_term_match_type(
+ new_property->mutable_string_indexing_config()->set_term_match_type(
TermMatchType::EXACT_ONLY);
SchemaUtil::SchemaDelta schema_delta;
@@ -583,15 +583,15 @@ TEST_F(SchemaUtilTest, ValidateStringIndexingConfigShouldHaveTermMatchType) {
prop->set_property_name("Foo");
prop->set_data_type(PropertyConfigProto::DataType::STRING);
prop->set_cardinality(PropertyConfigProto::Cardinality::REQUIRED);
- prop->mutable_indexing_config()->set_tokenizer_type(
- IndexingConfig::TokenizerType::PLAIN);
+ prop->mutable_string_indexing_config()->set_tokenizer_type(
+ StringIndexingConfig::TokenizerType::PLAIN);
// Error if we don't set a term match type
EXPECT_THAT(SchemaUtil::Validate(schema),
StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
// Passes once we set a term match type
- prop->mutable_indexing_config()->set_term_match_type(
+ prop->mutable_string_indexing_config()->set_term_match_type(
TermMatchType::EXACT_ONLY);
EXPECT_THAT(SchemaUtil::Validate(schema), IsOk());
}
@@ -605,7 +605,7 @@ TEST_F(SchemaUtilTest, ValidateStringIndexingConfigShouldHaveTokenizer) {
prop->set_property_name("Foo");
prop->set_data_type(PropertyConfigProto::DataType::STRING);
prop->set_cardinality(PropertyConfigProto::Cardinality::REQUIRED);
- prop->mutable_indexing_config()->set_term_match_type(
+ prop->mutable_string_indexing_config()->set_term_match_type(
TermMatchType::EXACT_ONLY);
// Error if we don't set a tokenizer type
@@ -613,190 +613,11 @@ TEST_F(SchemaUtilTest, ValidateStringIndexingConfigShouldHaveTokenizer) {
StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
// Passes once we set a tokenizer type
- prop->mutable_indexing_config()->set_tokenizer_type(
- IndexingConfig::TokenizerType::PLAIN);
+ prop->mutable_string_indexing_config()->set_tokenizer_type(
+ StringIndexingConfig::TokenizerType::PLAIN);
EXPECT_THAT(SchemaUtil::Validate(schema), IsOk());
}
-TEST_F(SchemaUtilTest, ValidateIntPropertyShouldntHaveIndexingConfig) {
- SchemaProto schema;
- auto* type = schema.add_types();
- type->set_schema_type("MyType");
-
- auto* prop = type->add_properties();
- prop->set_property_name("IntProperty");
- prop->set_data_type(PropertyConfigProto::DataType::INT64);
- prop->set_cardinality(PropertyConfigProto::Cardinality::REQUIRED);
-
- // Passes if it doesn't have indexing config
- EXPECT_THAT(SchemaUtil::Validate(schema), IsOk());
-
- // Fails if we try to set an indexing_config.term_match_type
- prop->mutable_indexing_config()->set_term_match_type(
- TermMatchType::EXACT_ONLY);
- EXPECT_THAT(SchemaUtil::Validate(schema),
- StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
-
- // Fails if we try to set an indexing_config.tokenizer_type
- prop->mutable_indexing_config()->clear_term_match_type();
- prop->mutable_indexing_config()->set_tokenizer_type(
- IndexingConfig::TokenizerType::PLAIN);
- EXPECT_THAT(SchemaUtil::Validate(schema),
- StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
-
- // Fails if we try to set an indexing config
- prop->mutable_indexing_config()->set_term_match_type(
- TermMatchType::EXACT_ONLY);
- prop->mutable_indexing_config()->set_tokenizer_type(
- IndexingConfig::TokenizerType::PLAIN);
- EXPECT_THAT(SchemaUtil::Validate(schema),
- StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
-}
-
-TEST_F(SchemaUtilTest, ValidateDoublePropertyShouldntHaveIndexingConfig) {
- SchemaProto schema;
- auto* type = schema.add_types();
- type->set_schema_type("MyType");
-
- auto* prop = type->add_properties();
- prop->set_property_name("DoubleProperty");
- prop->set_data_type(PropertyConfigProto::DataType::DOUBLE);
- prop->set_cardinality(PropertyConfigProto::Cardinality::REQUIRED);
-
- // Passes if it doesn't have indexing config
- EXPECT_THAT(SchemaUtil::Validate(schema), IsOk());
-
- // Fails if we try to set an indexing_config.term_match_type
- prop->mutable_indexing_config()->set_term_match_type(
- TermMatchType::EXACT_ONLY);
- EXPECT_THAT(SchemaUtil::Validate(schema),
- StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
-
- // Fails if we try to set an indexing_config.tokenizer_type
- prop->mutable_indexing_config()->clear_term_match_type();
- prop->mutable_indexing_config()->set_tokenizer_type(
- IndexingConfig::TokenizerType::PLAIN);
- EXPECT_THAT(SchemaUtil::Validate(schema),
- StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
-
- // Fails if we try to set an indexing config
- prop->mutable_indexing_config()->set_term_match_type(
- TermMatchType::EXACT_ONLY);
- prop->mutable_indexing_config()->set_tokenizer_type(
- IndexingConfig::TokenizerType::PLAIN);
- EXPECT_THAT(SchemaUtil::Validate(schema),
- StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
-}
-
-TEST_F(SchemaUtilTest, ValidateBooleanPropertyShouldntHaveIndexingConfig) {
- SchemaProto schema;
- auto* type = schema.add_types();
- type->set_schema_type("MyType");
-
- auto* prop = type->add_properties();
- prop->set_property_name("BooleanProperty");
- prop->set_data_type(PropertyConfigProto::DataType::BOOLEAN);
- prop->set_cardinality(PropertyConfigProto::Cardinality::REQUIRED);
-
- // Passes if it doesn't have indexing config
- EXPECT_THAT(SchemaUtil::Validate(schema), IsOk());
-
- // Fails if we try to set an indexing_config.term_match_type
- prop->mutable_indexing_config()->set_term_match_type(
- TermMatchType::EXACT_ONLY);
- EXPECT_THAT(SchemaUtil::Validate(schema),
- StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
-
- // Fails if we try to set an indexing_config.tokenizer_type
- prop->mutable_indexing_config()->clear_term_match_type();
- prop->mutable_indexing_config()->set_tokenizer_type(
- IndexingConfig::TokenizerType::PLAIN);
- EXPECT_THAT(SchemaUtil::Validate(schema),
- StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
-
- // Fails if we try to set an indexing config
- prop->mutable_indexing_config()->set_term_match_type(
- TermMatchType::EXACT_ONLY);
- prop->mutable_indexing_config()->set_tokenizer_type(
- IndexingConfig::TokenizerType::PLAIN);
- EXPECT_THAT(SchemaUtil::Validate(schema),
- StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
-}
-
-TEST_F(SchemaUtilTest, ValidateBytesPropertyShouldntHaveIndexingConfig) {
- SchemaProto schema;
- auto* type = schema.add_types();
- type->set_schema_type("MyType");
-
- auto* prop = type->add_properties();
- prop->set_property_name("BytesProperty");
- prop->set_data_type(PropertyConfigProto::DataType::BYTES);
- prop->set_cardinality(PropertyConfigProto::Cardinality::REQUIRED);
-
- // Passes if it doesn't have indexing config
- EXPECT_THAT(SchemaUtil::Validate(schema), IsOk());
-
- // Fails if we try to set an indexing_config.term_match_type
- prop->mutable_indexing_config()->set_term_match_type(
- TermMatchType::EXACT_ONLY);
- EXPECT_THAT(SchemaUtil::Validate(schema),
- StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
-
- // Fails if we try to set an indexing_config.tokenizer_type
- prop->mutable_indexing_config()->clear_term_match_type();
- prop->mutable_indexing_config()->set_tokenizer_type(
- IndexingConfig::TokenizerType::PLAIN);
- EXPECT_THAT(SchemaUtil::Validate(schema),
- StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
-
- // Fails if we try to set an indexing config
- prop->mutable_indexing_config()->set_term_match_type(
- TermMatchType::EXACT_ONLY);
- prop->mutable_indexing_config()->set_tokenizer_type(
- IndexingConfig::TokenizerType::PLAIN);
- EXPECT_THAT(SchemaUtil::Validate(schema),
- StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
-}
-
-TEST_F(SchemaUtilTest, ValidateDocumentPropertyShouldntHaveIndexingConfig) {
- SchemaProto schema;
- auto* type = schema.add_types();
- type->set_schema_type("OtherType");
-
- type = schema.add_types();
- type->set_schema_type("MyType");
-
- auto* prop = type->add_properties();
- prop->set_property_name("SubType");
- prop->set_schema_type("OtherType");
- prop->set_data_type(PropertyConfigProto::DataType::DOCUMENT);
- prop->set_cardinality(PropertyConfigProto::Cardinality::REQUIRED);
-
- // Passes if it doesn't have indexing config
- EXPECT_THAT(SchemaUtil::Validate(schema), IsOk());
-
- // Fails if we try to set an indexing_config.term_match_type
- prop->mutable_indexing_config()->set_term_match_type(
- TermMatchType::EXACT_ONLY);
- EXPECT_THAT(SchemaUtil::Validate(schema),
- StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
-
- // Fails if we try to set an indexing_config.tokenizer_type
- prop->mutable_indexing_config()->clear_term_match_type();
- prop->mutable_indexing_config()->set_tokenizer_type(
- IndexingConfig::TokenizerType::PLAIN);
- EXPECT_THAT(SchemaUtil::Validate(schema),
- StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
-
- // Fails if we try to set an indexing config
- prop->mutable_indexing_config()->set_term_match_type(
- TermMatchType::EXACT_ONLY);
- prop->mutable_indexing_config()->set_tokenizer_type(
- IndexingConfig::TokenizerType::PLAIN);
- EXPECT_THAT(SchemaUtil::Validate(schema),
- StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
-}
-
} // namespace
} // namespace lib
diff --git a/icing/schema/section-manager.cc b/icing/schema/section-manager.cc
index 0eed2fe..73aa947 100644
--- a/icing/schema/section-manager.cc
+++ b/icing/schema/section-manager.cc
@@ -119,24 +119,33 @@ libtextclassifier3::Status AssignSections(
for (const auto& property_config : sorted_properties) {
if (property_config.data_type() ==
PropertyConfigProto::DataType::DOCUMENT) {
- // Tries to find sections recursively
auto nested_type_config_iter =
type_config_map.find(property_config.schema_type());
if (nested_type_config_iter == type_config_map.end()) {
+ // This should never happen because our schema should already be
+ // validated by this point.
return absl_ports::NotFoundError(absl_ports::StrCat(
- "type config not found: ", property_config.schema_type()));
+ "Type config not found: ", property_config.schema_type()));
+ }
+
+ if (property_config.document_indexing_config()
+ .index_nested_properties()) {
+ // Assign any indexed sections recursively
+ const SchemaTypeConfigProto& nested_type_config =
+ nested_type_config_iter->second;
+ ICING_RETURN_IF_ERROR(
+ AssignSections(nested_type_config,
+ ConcatenatePath(current_section_path,
+ property_config.property_name()),
+ type_config_map, visited_states, metadata_list));
}
- const SchemaTypeConfigProto& nested_type_config =
- nested_type_config_iter->second;
- ICING_RETURN_IF_ERROR(
- AssignSections(nested_type_config,
- ConcatenatePath(current_section_path,
- property_config.property_name()),
- type_config_map, visited_states, metadata_list));
}
- if (property_config.indexing_config().term_match_type() ==
- TermMatchType::UNKNOWN) {
+ // Only index strings currently.
+ if (property_config.has_data_type() !=
+ PropertyConfigProto::DataType::STRING ||
+ property_config.string_indexing_config().term_match_type() ==
+ TermMatchType::UNKNOWN) {
// No need to create section for current property
continue;
}
@@ -155,8 +164,9 @@ libtextclassifier3::Status AssignSections(
}
// Creates section metadata from property config
metadata_list->emplace_back(
- new_section_id, property_config.indexing_config().term_match_type(),
- property_config.indexing_config().tokenizer_type(),
+ new_section_id,
+ property_config.string_indexing_config().term_match_type(),
+ property_config.string_indexing_config().tokenizer_type(),
ConcatenatePath(current_section_path, property_config.property_name()));
}
return libtextclassifier3::Status::OK;
@@ -199,16 +209,6 @@ std::vector<std::string> GetPropertyContent(const PropertyProto& property) {
if (!property.string_values().empty()) {
std::copy(property.string_values().begin(), property.string_values().end(),
std::back_inserter(values));
- } else if (!property.int64_values().empty()) {
- std::transform(
- property.int64_values().begin(), property.int64_values().end(),
- std::back_inserter(values),
- [](int64_t i) { return IcingStringUtil::StringPrintf("%" PRId64, i); });
- } else {
- std::transform(
- property.double_values().begin(), property.double_values().end(),
- std::back_inserter(values),
- [](double d) { return IcingStringUtil::StringPrintf("%f", d); });
}
return values;
}
@@ -264,9 +264,8 @@ SectionManager::GetSectionContent(const DocumentProto& document,
// Property name not found, it could be one of the following 2 cases:
// 1. The property is optional and it's not in the document
// 2. The property name is invalid
- return absl_ports::NotFoundError(
- absl_ports::StrCat("Section path ", section_path,
- " not found in type config ", document.schema()));
+ return absl_ports::NotFoundError(absl_ports::StrCat(
+ "Section path '", section_path, "' not found in document."));
}
if (separator_position == std::string::npos) {
@@ -275,9 +274,8 @@ SectionManager::GetSectionContent(const DocumentProto& document,
if (content.empty()) {
// The content of property is explicitly set to empty, we'll treat it as
// NOT_FOUND because the index doesn't care about empty strings.
- return absl_ports::NotFoundError(
- absl_ports::StrCat("Section path ", section_path,
- " not found in type config ", document.schema()));
+ return absl_ports::NotFoundError(absl_ports::StrCat(
+ "Section path '", section_path, "' content was empty"));
}
return content;
}
diff --git a/icing/schema/section-manager_test.cc b/icing/schema/section-manager_test.cc
index ad9d07d..1a4d324 100644
--- a/icing/schema/section-manager_test.cc
+++ b/icing/schema/section-manager_test.cc
@@ -20,6 +20,7 @@
#include "gtest/gtest.h"
#include "icing/document-builder.h"
#include "icing/file/filesystem.h"
+#include "icing/proto/schema.proto.h"
#include "icing/proto/schema.pb.h"
#include "icing/proto/term.pb.h"
#include "icing/schema/schema-util.h"
@@ -29,9 +30,12 @@
namespace icing {
namespace lib {
+
using ::testing::ElementsAre;
using ::testing::Eq;
using ::testing::HasSubstr;
+using ::testing::IsEmpty;
+using ::testing::SizeIs;
// type and property names of EmailMessage
constexpr char kTypeEmail[] = "EmailMessage";
@@ -93,16 +97,16 @@ class SectionManagerTest : public ::testing::Test {
subject->set_property_name(kPropertySubject);
subject->set_data_type(PropertyConfigProto::DataType::STRING);
subject->set_cardinality(PropertyConfigProto::Cardinality::REQUIRED);
- subject->mutable_indexing_config()->set_term_match_type(
+ subject->mutable_string_indexing_config()->set_term_match_type(
TermMatchType::EXACT_ONLY);
- subject->mutable_indexing_config()->set_tokenizer_type(
- IndexingConfig::TokenizerType::PLAIN);
+ subject->mutable_string_indexing_config()->set_tokenizer_type(
+ StringIndexingConfig::TokenizerType::PLAIN);
auto text = type.add_properties();
text->set_property_name(kPropertyText);
text->set_data_type(PropertyConfigProto::DataType::STRING);
text->set_cardinality(PropertyConfigProto::Cardinality::OPTIONAL);
- text->mutable_indexing_config()->set_term_match_type(
+ text->mutable_string_indexing_config()->set_term_match_type(
TermMatchType::UNKNOWN);
auto attachment = type.add_properties();
@@ -114,10 +118,10 @@ class SectionManagerTest : public ::testing::Test {
recipients->set_property_name(kPropertyRecipients);
recipients->set_data_type(PropertyConfigProto::DataType::STRING);
recipients->set_cardinality(PropertyConfigProto::Cardinality::REPEATED);
- recipients->mutable_indexing_config()->set_term_match_type(
+ recipients->mutable_string_indexing_config()->set_term_match_type(
TermMatchType::EXACT_ONLY);
- recipients->mutable_indexing_config()->set_tokenizer_type(
- IndexingConfig::TokenizerType::PLAIN);
+ recipients->mutable_string_indexing_config()->set_tokenizer_type(
+ StringIndexingConfig::TokenizerType::PLAIN);
return type;
}
@@ -130,7 +134,7 @@ class SectionManagerTest : public ::testing::Test {
name->set_property_name(kPropertyName);
name->set_data_type(PropertyConfigProto::DataType::STRING);
name->set_cardinality(PropertyConfigProto::Cardinality::OPTIONAL);
- name->mutable_indexing_config()->set_term_match_type(
+ name->mutable_string_indexing_config()->set_term_match_type(
TermMatchType::EXACT_ONLY);
auto emails = type.add_properties();
@@ -138,6 +142,8 @@ class SectionManagerTest : public ::testing::Test {
emails->set_data_type(PropertyConfigProto::DataType::DOCUMENT);
emails->set_cardinality(PropertyConfigProto::Cardinality::REPEATED);
emails->set_schema_type(kTypeEmail);
+ emails->mutable_document_indexing_config()->set_index_nested_properties(
+ true);
return type;
}
@@ -166,6 +172,8 @@ TEST_F(SectionManagerTest, CreationWithSchemaInfiniteLoopShouldFail) {
property1->set_data_type(PropertyConfigProto::DataType::DOCUMENT);
property1->set_schema_type("type2"); // Here we reference type2
property1->set_cardinality(PropertyConfigProto::Cardinality::REQUIRED);
+ property1->mutable_document_indexing_config()->set_index_nested_properties(
+ true);
SchemaTypeConfigProto type_config2;
type_config2.set_schema_type("type2");
@@ -175,6 +183,8 @@ TEST_F(SectionManagerTest, CreationWithSchemaInfiniteLoopShouldFail) {
// Here we reference type1, which references type2 causing the infinite loop
property2->set_schema_type("type1");
property2->set_cardinality(PropertyConfigProto::Cardinality::REQUIRED);
+ property2->mutable_document_indexing_config()->set_index_nested_properties(
+ true);
SchemaUtil::TypeConfigMap type_config_map;
type_config_map.emplace("type1", type_config1);
@@ -194,11 +204,13 @@ TEST_F(SectionManagerTest, CreationWithSchemaSelfReferenceShouldFail) {
property1->set_property_name("property1");
property1->set_data_type(PropertyConfigProto::DataType::STRING);
property1->set_cardinality(PropertyConfigProto::Cardinality::REQUIRED);
- property1->mutable_indexing_config()->set_term_match_type(
+ property1->mutable_string_indexing_config()->set_term_match_type(
TermMatchType::EXACT_ONLY);
auto property2 = type_config.add_properties();
property2->set_property_name("property2");
property2->set_data_type(PropertyConfigProto::DataType::DOCUMENT);
+ property2->mutable_document_indexing_config()->set_index_nested_properties(
+ true);
// Here we're referencing our own type, causing an infinite loop
property2->set_schema_type("type");
property2->set_cardinality(PropertyConfigProto::Cardinality::REQUIRED);
@@ -222,7 +234,7 @@ TEST_F(SectionManagerTest, CreationWithTooManyPropertiesShouldFail) {
property->set_property_name("property" + std::to_string(i));
property->set_data_type(PropertyConfigProto::DataType::STRING);
property->set_cardinality(PropertyConfigProto::Cardinality::REQUIRED);
- property->mutable_indexing_config()->set_term_match_type(
+ property->mutable_string_indexing_config()->set_term_match_type(
TermMatchType::EXACT_ONLY);
}
@@ -235,24 +247,6 @@ TEST_F(SectionManagerTest, CreationWithTooManyPropertiesShouldFail) {
HasSubstr("Too many properties")));
}
-TEST_F(SectionManagerTest, CreationWithUnknownSchemaTypeNameShouldFail) {
- SchemaTypeConfigProto type_config;
- type_config.set_schema_type("type");
- auto property = type_config.add_properties();
- property->set_property_name("property");
- property->set_data_type(PropertyConfigProto::DataType::DOCUMENT);
- property->set_schema_type("unknown_name");
- property->set_cardinality(PropertyConfigProto::Cardinality::REQUIRED);
-
- SchemaUtil::TypeConfigMap type_config_map;
- type_config_map.emplace("type", type_config);
-
- EXPECT_THAT(
- SectionManager::Create(type_config_map, schema_type_mapper_.get()),
- StatusIs(libtextclassifier3::StatusCode::NOT_FOUND,
- HasSubstr("type config not found")));
-}
-
TEST_F(SectionManagerTest, GetSectionContent) {
ICING_ASSERT_OK_AND_ASSIGN(
auto section_manager,
@@ -393,5 +387,261 @@ TEST_F(SectionManagerTest, ExtractSections) {
EXPECT_THAT(sections[1].content, ElementsAre("the subject", "the subject"));
}
+TEST_F(SectionManagerTest,
+ NonStringFieldsWithStringIndexingConfigDontCreateSections) {
+ // Create a schema for an empty document.
+ SchemaTypeConfigProto empty_type;
+ empty_type.set_schema_type("EmptySchema");
+
+ // Create a schema with all the non-string fields
+ SchemaTypeConfigProto type_with_non_string_properties;
+ type_with_non_string_properties.set_schema_type("Schema");
+
+ // Create an int property with a string_indexing_config
+ auto int_property = type_with_non_string_properties.add_properties();
+ int_property->set_property_name("int");
+ int_property->set_data_type(PropertyConfigProto::DataType::INT64);
+ int_property->set_cardinality(PropertyConfigProto::Cardinality::REQUIRED);
+ int_property->mutable_string_indexing_config()->set_term_match_type(
+ TermMatchType::EXACT_ONLY);
+ int_property->mutable_string_indexing_config()->set_tokenizer_type(
+ StringIndexingConfig::TokenizerType::PLAIN);
+
+ // Create a double property with a string_indexing_config
+ auto double_property = type_with_non_string_properties.add_properties();
+ double_property->set_property_name("double");
+ double_property->set_data_type(PropertyConfigProto::DataType::DOUBLE);
+ double_property->set_cardinality(PropertyConfigProto::Cardinality::REQUIRED);
+ double_property->mutable_string_indexing_config()->set_term_match_type(
+ TermMatchType::EXACT_ONLY);
+ double_property->mutable_string_indexing_config()->set_tokenizer_type(
+ StringIndexingConfig::TokenizerType::PLAIN);
+
+ // Create a boolean property with a string_indexing_config
+ auto boolean_property = type_with_non_string_properties.add_properties();
+ boolean_property->set_property_name("boolean");
+ boolean_property->set_data_type(PropertyConfigProto::DataType::BOOLEAN);
+ boolean_property->set_cardinality(PropertyConfigProto::Cardinality::REQUIRED);
+ boolean_property->mutable_string_indexing_config()->set_term_match_type(
+ TermMatchType::EXACT_ONLY);
+ boolean_property->mutable_string_indexing_config()->set_tokenizer_type(
+ StringIndexingConfig::TokenizerType::PLAIN);
+
+ // Create a bytes property with a string_indexing_config
+ auto bytes_property = type_with_non_string_properties.add_properties();
+ bytes_property->set_property_name("bytes");
+ bytes_property->set_data_type(PropertyConfigProto::DataType::BYTES);
+ bytes_property->set_cardinality(PropertyConfigProto::Cardinality::REQUIRED);
+ bytes_property->mutable_string_indexing_config()->set_term_match_type(
+ TermMatchType::EXACT_ONLY);
+ bytes_property->mutable_string_indexing_config()->set_tokenizer_type(
+ StringIndexingConfig::TokenizerType::PLAIN);
+
+ // Create a document property with a string_indexing_config
+ auto document_property = type_with_non_string_properties.add_properties();
+ document_property->set_property_name("document");
+ document_property->set_data_type(PropertyConfigProto::DataType::DOCUMENT);
+ document_property->set_schema_type(empty_type.schema_type());
+ document_property->set_cardinality(
+ PropertyConfigProto::Cardinality::REQUIRED);
+ document_property->mutable_string_indexing_config()->set_term_match_type(
+ TermMatchType::EXACT_ONLY);
+ document_property->mutable_string_indexing_config()->set_tokenizer_type(
+ StringIndexingConfig::TokenizerType::PLAIN);
+
+ // Setup classes to create the section manager
+ SchemaUtil::TypeConfigMap type_config_map;
+ type_config_map.emplace(type_with_non_string_properties.schema_type(),
+ type_with_non_string_properties);
+ type_config_map.emplace(empty_type.schema_type(), empty_type);
+
+ // KeyMapper uses 3 internal arrays for bookkeeping. Give each one 128KiB so
+ // the total KeyMapper should get 384KiB
+ int key_mapper_size = 3 * 128 * 1024;
+ std::string dir = GetTestTempDir() + "/non_string_fields";
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<KeyMapper<SchemaTypeId>> schema_type_mapper,
+ KeyMapper<SchemaTypeId>::Create(filesystem_, dir, key_mapper_size));
+ ICING_ASSERT_OK(schema_type_mapper->Put(
+ type_with_non_string_properties.schema_type(), /*schema_type_id=*/0));
+ ICING_ASSERT_OK(schema_type_mapper->Put(empty_type.schema_type(),
+ /*schema_type_id=*/1));
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ auto section_manager,
+ SectionManager::Create(type_config_map, schema_type_mapper.get()));
+
+ // Create an empty document to be nested
+ DocumentProto empty_document = DocumentBuilder()
+ .SetKey("icing", "uri1")
+ .SetSchema(empty_type.schema_type())
+ .Build();
+
+ // Create a document that follows "Schema"
+ DocumentProto document =
+ DocumentBuilder()
+ .SetKey("icing", "uri2")
+ .SetSchema(type_with_non_string_properties.schema_type())
+ .AddInt64Property("int", 1)
+ .AddDoubleProperty("double", 0.2)
+ .AddBooleanProperty("boolean", true)
+ .AddBytesProperty("bytes", "attachment bytes")
+ .AddDocumentProperty("document", empty_document)
+ .Build();
+
+ // Extracts sections from 'Schema' document
+ ICING_ASSERT_OK_AND_ASSIGN(auto sections,
+ section_manager->ExtractSections(document));
+ EXPECT_THAT(sections.size(), Eq(0));
+}
+
+TEST_F(SectionManagerTest, AssignSectionsRecursivelyForDocumentFields) {
+ // Create the inner schema that the document property is.
+ SchemaTypeConfigProto document_type;
+ document_type.set_schema_type("DocumentSchema");
+
+ auto string_property = document_type.add_properties();
+ string_property->set_property_name("string");
+ string_property->set_data_type(PropertyConfigProto::DataType::STRING);
+ string_property->set_cardinality(PropertyConfigProto::Cardinality::REQUIRED);
+ string_property->mutable_string_indexing_config()->set_term_match_type(
+ TermMatchType::EXACT_ONLY);
+ string_property->mutable_string_indexing_config()->set_tokenizer_type(
+ StringIndexingConfig::TokenizerType::PLAIN);
+
+ // Create the outer schema which has the document property.
+ SchemaTypeConfigProto type;
+ type.set_schema_type("Schema");
+
+ auto document_property = type.add_properties();
+ document_property->set_property_name("document");
+ document_property->set_data_type(PropertyConfigProto::DataType::DOCUMENT);
+ document_property->set_schema_type(document_type.schema_type());
+ document_property->set_cardinality(
+ PropertyConfigProto::Cardinality::REQUIRED);
+
+ // Opt into recursing into the document fields.
+ document_property->mutable_document_indexing_config()
+ ->set_index_nested_properties(true);
+
+ // Create the inner document.
+ DocumentProto inner_document = DocumentBuilder()
+ .SetKey("icing", "uri1")
+ .SetSchema(document_type.schema_type())
+ .AddStringProperty("string", "foo")
+ .Build();
+
+ // Create the outer document that holds the inner document
+ DocumentProto outer_document =
+ DocumentBuilder()
+ .SetKey("icing", "uri2")
+ .SetSchema(type.schema_type())
+ .AddDocumentProperty("document", inner_document)
+ .Build();
+
+ // Setup classes to create the section manager
+ SchemaUtil::TypeConfigMap type_config_map;
+ type_config_map.emplace(type.schema_type(), type);
+ type_config_map.emplace(document_type.schema_type(), document_type);
+
+ // KeyMapper uses 3 internal arrays for bookkeeping. Give each one 128KiB so
+ // the total KeyMapper should get 384KiB
+ int key_mapper_size = 3 * 128 * 1024;
+ std::string dir = GetTestTempDir() + "/recurse_into_document";
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<KeyMapper<SchemaTypeId>> schema_type_mapper,
+ KeyMapper<SchemaTypeId>::Create(filesystem_, dir, key_mapper_size));
+ int type_schema_type_id = 0;
+ int document_type_schema_type_id = 1;
+ ICING_ASSERT_OK(
+ schema_type_mapper->Put(type.schema_type(), type_schema_type_id));
+ ICING_ASSERT_OK(schema_type_mapper->Put(document_type.schema_type(),
+ document_type_schema_type_id));
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ auto section_manager,
+ SectionManager::Create(type_config_map, schema_type_mapper.get()));
+
+ // Extracts sections from 'Schema' document; there should be the 1 string
+ // property inside the document.
+ ICING_ASSERT_OK_AND_ASSIGN(std::vector<Section> sections,
+ section_manager->ExtractSections(outer_document));
+ EXPECT_THAT(sections, SizeIs(1));
+}
+
+TEST_F(SectionManagerTest, DontAssignSectionsRecursivelyForDocumentFields) {
+ // Create the inner schema that the document property is.
+ SchemaTypeConfigProto document_type;
+ document_type.set_schema_type("DocumentSchema");
+
+ auto string_property = document_type.add_properties();
+ string_property->set_property_name("string");
+ string_property->set_data_type(PropertyConfigProto::DataType::STRING);
+ string_property->set_cardinality(PropertyConfigProto::Cardinality::REQUIRED);
+ string_property->mutable_string_indexing_config()->set_term_match_type(
+ TermMatchType::EXACT_ONLY);
+ string_property->mutable_string_indexing_config()->set_tokenizer_type(
+ StringIndexingConfig::TokenizerType::PLAIN);
+
+ // Create the outer schema which has the document property.
+ SchemaTypeConfigProto type;
+ type.set_schema_type("Schema");
+
+ auto document_property = type.add_properties();
+ document_property->set_property_name("document");
+ document_property->set_data_type(PropertyConfigProto::DataType::DOCUMENT);
+ document_property->set_schema_type(document_type.schema_type());
+ document_property->set_cardinality(
+ PropertyConfigProto::Cardinality::REQUIRED);
+
+ // Opt out of recursing into the document fields.
+ document_property->mutable_document_indexing_config()
+ ->set_index_nested_properties(false);
+
+ // Create the inner document.
+ DocumentProto inner_document = DocumentBuilder()
+ .SetKey("icing", "uri1")
+ .SetSchema(document_type.schema_type())
+ .AddStringProperty("string", "foo")
+ .Build();
+
+ // Create the outer document that holds the inner document
+ DocumentProto outer_document =
+ DocumentBuilder()
+ .SetKey("icing", "uri2")
+ .SetSchema(type.schema_type())
+ .AddDocumentProperty("document", inner_document)
+ .Build();
+
+ // Setup classes to create the section manager
+ SchemaUtil::TypeConfigMap type_config_map;
+ type_config_map.emplace(type.schema_type(), type);
+ type_config_map.emplace(document_type.schema_type(), document_type);
+
+ // KeyMapper uses 3 internal arrays for bookkeeping. Give each one 128KiB so
+ // the total KeyMapper should get 384KiB
+ int key_mapper_size = 3 * 128 * 1024;
+ std::string dir = GetTestTempDir() + "/recurse_into_document";
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<KeyMapper<SchemaTypeId>> schema_type_mapper,
+ KeyMapper<SchemaTypeId>::Create(filesystem_, dir, key_mapper_size));
+ int type_schema_type_id = 0;
+ int document_type_schema_type_id = 1;
+ ICING_ASSERT_OK(
+ schema_type_mapper->Put(type.schema_type(), type_schema_type_id));
+ ICING_ASSERT_OK(schema_type_mapper->Put(document_type.schema_type(),
+ document_type_schema_type_id));
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ auto section_manager,
+ SectionManager::Create(type_config_map, schema_type_mapper.get()));
+
+ // Extracts sections from 'Schema' document; there won't be any since we
+ // didn't recurse into the document to see the inner string property
+ ICING_ASSERT_OK_AND_ASSIGN(std::vector<Section> sections,
+ section_manager->ExtractSections(outer_document));
+ EXPECT_THAT(sections, IsEmpty());
+}
+
} // namespace lib
} // namespace icing
diff --git a/icing/schema/section.h b/icing/schema/section.h
index daf4fd0..7669c97 100644
--- a/icing/schema/section.h
+++ b/icing/schema/section.h
@@ -54,9 +54,9 @@ struct SectionMetadata {
// A unique id of property within a type config
SectionId id;
- // How content in this section should be tokenized. It is invalid for a
- // section to have tokenizer == 'NONE'.
- IndexingConfig::TokenizerType::Code tokenizer;
+ // How strings should be tokenized. It is invalid for a section to have
+ // tokenizer == 'NONE'.
+ StringIndexingConfig::TokenizerType::Code tokenizer;
// How tokens in this section should be matched.
//
@@ -71,7 +71,7 @@ struct SectionMetadata {
TermMatchType::Code term_match_type = TermMatchType::UNKNOWN;
SectionMetadata(SectionId id_in, TermMatchType::Code term_match_type_in,
- IndexingConfig::TokenizerType::Code tokenizer,
+ StringIndexingConfig::TokenizerType::Code tokenizer,
std::string&& path_in)
: path(std::move(path_in)),
id(id_in),
diff --git a/icing/store/document-store.cc b/icing/store/document-store.cc
index 1e47d59..8ddde14 100644
--- a/icing/store/document-store.cc
+++ b/icing/store/document-store.cc
@@ -35,6 +35,7 @@
#include "icing/legacy/core/icing-string-util.h"
#include "icing/proto/document.pb.h"
#include "icing/proto/document_wrapper.pb.h"
+#include "icing/proto/logging.pb.h"
#include "icing/schema/schema-store.h"
#include "icing/store/document-associated-score-data.h"
#include "icing/store/document-filter-data.h"
@@ -45,6 +46,7 @@
#include "icing/util/crc32.h"
#include "icing/util/logging.h"
#include "icing/util/status-macros.h"
+#include "icing/util/timer.h"
namespace icing {
namespace lib {
@@ -185,8 +187,8 @@ DocumentStore::DocumentStore(const Filesystem* filesystem,
document_validator_(schema_store) {}
libtextclassifier3::StatusOr<DocumentId> DocumentStore::Put(
- const DocumentProto& document) {
- return Put(DocumentProto(document));
+ const DocumentProto& document, NativePutDocumentStats* put_document_stats) {
+ return Put(DocumentProto(document), put_document_stats);
}
DocumentStore::~DocumentStore() {
@@ -200,18 +202,20 @@ DocumentStore::~DocumentStore() {
libtextclassifier3::StatusOr<std::unique_ptr<DocumentStore>>
DocumentStore::Create(const Filesystem* filesystem, const std::string& base_dir,
- const Clock* clock, const SchemaStore* schema_store) {
+ const Clock* clock, const SchemaStore* schema_store,
+ NativeInitializeStats* initialize_stats) {
ICING_RETURN_ERROR_IF_NULL(filesystem);
ICING_RETURN_ERROR_IF_NULL(clock);
ICING_RETURN_ERROR_IF_NULL(schema_store);
auto document_store = std::unique_ptr<DocumentStore>(
new DocumentStore(filesystem, base_dir, clock, schema_store));
- ICING_RETURN_IF_ERROR(document_store->Initialize());
+ ICING_RETURN_IF_ERROR(document_store->Initialize(initialize_stats));
return document_store;
}
-libtextclassifier3::Status DocumentStore::Initialize() {
+libtextclassifier3::Status DocumentStore::Initialize(
+ NativeInitializeStats* initialize_stats) {
auto create_result_or = FileBackedProtoLog<DocumentWrapper>::Create(
filesystem_, MakeDocumentLogFilename(base_dir_),
FileBackedProtoLog<DocumentWrapper>::Options(
@@ -227,10 +231,30 @@ libtextclassifier3::Status DocumentStore::Initialize() {
std::move(create_result_or).ValueOrDie();
document_log_ = std::move(create_result.proto_log);
- if (create_result.data_loss) {
+ if (create_result.has_data_loss()) {
ICING_LOG(WARNING)
<< "Data loss in document log, regenerating derived files.";
+ if (initialize_stats != nullptr) {
+ initialize_stats->set_document_store_recovery_cause(
+ NativeInitializeStats::DATA_LOSS);
+
+ if (create_result.data_status ==
+ FileBackedProtoLog<DocumentWrapper>::CreateResult::PARTIAL_LOSS) {
+ // Ground truth is partially lost.
+ initialize_stats->set_document_store_data_status(
+ NativeInitializeStats::PARTIAL_LOSS);
+ } else {
+ // Ground truth is completely lost.
+ initialize_stats->set_document_store_data_status(
+ NativeInitializeStats::COMPLETE_LOSS);
+ }
+ }
+ Timer document_recovery_timer;
libtextclassifier3::Status status = RegenerateDerivedFiles();
+ if (initialize_stats != nullptr) {
+ initialize_stats->set_document_store_recovery_latency_ms(
+ document_recovery_timer.GetElapsedMilliseconds());
+ }
if (!status.ok()) {
ICING_LOG(ERROR)
<< "Failed to regenerate derived files for DocumentStore";
@@ -241,7 +265,16 @@ libtextclassifier3::Status DocumentStore::Initialize() {
ICING_VLOG(1)
<< "Couldn't find derived files or failed to initialize them, "
"regenerating derived files for DocumentStore.";
+ if (initialize_stats != nullptr) {
+ initialize_stats->set_document_store_recovery_cause(
+ NativeInitializeStats::IO_ERROR);
+ }
+ Timer document_recovery_timer;
libtextclassifier3::Status status = RegenerateDerivedFiles();
+ if (initialize_stats != nullptr) {
+ initialize_stats->set_document_store_recovery_latency_ms(
+ document_recovery_timer.GetElapsedMilliseconds());
+ }
if (!status.ok()) {
ICING_LOG(ERROR)
<< "Failed to regenerate derived files for DocumentStore";
@@ -251,6 +284,9 @@ libtextclassifier3::Status DocumentStore::Initialize() {
}
initialized_ = true;
+ if (initialize_stats != nullptr) {
+ initialize_stats->set_num_documents(document_id_mapper_->num_elements());
+ }
return libtextclassifier3::Status::OK;
}
@@ -689,9 +725,14 @@ libtextclassifier3::Status DocumentStore::UpdateHeader(const Crc32& checksum) {
}
libtextclassifier3::StatusOr<DocumentId> DocumentStore::Put(
- DocumentProto&& document) {
+ DocumentProto&& document, NativePutDocumentStats* put_document_stats) {
+ Timer put_timer;
ICING_RETURN_IF_ERROR(document_validator_.Validate(document));
+ if (put_document_stats != nullptr) {
+ put_document_stats->set_document_size(document.ByteSizeLong());
+ }
+
// Copy fields needed before they are moved
std::string name_space = document.namespace_();
std::string uri = document.uri();
@@ -765,6 +806,11 @@ libtextclassifier3::StatusOr<DocumentId> DocumentStore::Put(
}
}
+ if (put_document_stats != nullptr) {
+ put_document_stats->set_document_store_latency_ms(
+ put_timer.GetElapsedMilliseconds());
+ }
+
return new_document_id;
}
diff --git a/icing/store/document-store.h b/icing/store/document-store.h
index 5c1b902..d6ffbaa 100644
--- a/icing/store/document-store.h
+++ b/icing/store/document-store.h
@@ -28,6 +28,7 @@
#include "icing/file/filesystem.h"
#include "icing/proto/document.pb.h"
#include "icing/proto/document_wrapper.pb.h"
+#include "icing/proto/logging.pb.h"
#include "icing/schema/schema-store.h"
#include "icing/store/document-associated-score-data.h"
#include "icing/store/document-filter-data.h"
@@ -81,8 +82,11 @@ class DocumentStore {
// previously initialized with this directory, it will reload the files saved
// by the last instance.
//
- // Does not take any ownership, and all pointers must refer to valid objects
- // that outlive the one constructed.
+ // If initialize_stats is present, the fields related to DocumentStore will be
+ // populated.
+ //
+ // Does not take any ownership, and all pointers except initialize_stats must
+ // refer to valid objects that outlive the one constructed.
//
// TODO(cassiewang): Consider returning a status indicating that derived files
// were regenerated. This may be helpful in logs.
@@ -93,29 +97,41 @@ class DocumentStore {
// INTERNAL_ERROR on IO error
static libtextclassifier3::StatusOr<std::unique_ptr<DocumentStore>> Create(
const Filesystem* filesystem, const std::string& base_dir,
- const Clock* clock, const SchemaStore* schema_store);
+ const Clock* clock, const SchemaStore* schema_store,
+ NativeInitializeStats* initialize_stats = nullptr);
// Returns the maximum DocumentId that the DocumentStore has assigned. If
// there has not been any DocumentIds assigned, i.e. the DocumentStore is
// empty, then kInvalidDocumentId is returned. This does not filter out
- // DocumentIds of deleted documents.
- const DocumentId last_added_document_id() const {
+ // DocumentIds of deleted or expired documents.
+ DocumentId last_added_document_id() const {
if (document_id_mapper_->num_elements() == 0) {
return kInvalidDocumentId;
}
return document_id_mapper_->num_elements() - 1;
}
+ // Returns the number of documents. The result does not filter out DocumentIds
+ // of deleted or expired documents.
+ int num_documents() const { return document_id_mapper_->num_elements(); }
+
// Puts the document into document store.
//
+ // If put_document_stats is present, the fields related to DocumentStore will
+ // be populated.
+ //
// Returns:
// A newly generated document id on success
// FAILED_PRECONDITION if schema hasn't been set yet
// NOT_FOUND if the schema_type or a property config of the document doesn't
// exist in schema
// INTERNAL_ERROR on IO error
- libtextclassifier3::StatusOr<DocumentId> Put(const DocumentProto& document);
- libtextclassifier3::StatusOr<DocumentId> Put(DocumentProto&& document);
+ libtextclassifier3::StatusOr<DocumentId> Put(
+ const DocumentProto& document,
+ NativePutDocumentStats* put_document_stats = nullptr);
+ libtextclassifier3::StatusOr<DocumentId> Put(
+ DocumentProto&& document,
+ NativePutDocumentStats* put_document_stats = nullptr);
// Finds and returns the document identified by the given key (namespace +
// uri)
@@ -422,7 +438,8 @@ class DocumentStore {
// worry about this field.
bool initialized_ = false;
- libtextclassifier3::Status Initialize();
+ libtextclassifier3::Status Initialize(
+ NativeInitializeStats* initialize_stats);
// Creates sub-components and verifies the integrity of each sub-component.
//
diff --git a/icing/store/document-store_test.cc b/icing/store/document-store_test.cc
index 301dbdd..d97ec46 100644
--- a/icing/store/document-store_test.cc
+++ b/icing/store/document-store_test.cc
@@ -111,19 +111,19 @@ class DocumentStoreTest : public ::testing::Test {
subject->set_property_name("subject");
subject->set_data_type(PropertyConfigProto::DataType::STRING);
subject->set_cardinality(PropertyConfigProto::Cardinality::OPTIONAL);
- subject->mutable_indexing_config()->set_term_match_type(
+ subject->mutable_string_indexing_config()->set_term_match_type(
TermMatchType::EXACT_ONLY);
- subject->mutable_indexing_config()->set_tokenizer_type(
- IndexingConfig::TokenizerType::PLAIN);
+ subject->mutable_string_indexing_config()->set_tokenizer_type(
+ StringIndexingConfig::TokenizerType::PLAIN);
auto body = type_config->add_properties();
body->set_property_name("body");
body->set_data_type(PropertyConfigProto::DataType::STRING);
body->set_cardinality(PropertyConfigProto::Cardinality::OPTIONAL);
- body->mutable_indexing_config()->set_term_match_type(
+ body->mutable_string_indexing_config()->set_term_match_type(
TermMatchType::EXACT_ONLY);
- body->mutable_indexing_config()->set_tokenizer_type(
- IndexingConfig::TokenizerType::PLAIN);
+ body->mutable_string_indexing_config()->set_tokenizer_type(
+ StringIndexingConfig::TokenizerType::PLAIN);
ICING_ASSERT_OK_AND_ASSIGN(
schema_store_, SchemaStore::Create(&filesystem_, schema_store_dir_));
@@ -1941,10 +1941,10 @@ TEST_F(DocumentStoreTest, UpdateSchemaStoreDeletesInvalidDocuments) {
property_config->set_property_name("subject");
property_config->set_data_type(PropertyConfigProto::DataType::STRING);
property_config->set_cardinality(PropertyConfigProto::Cardinality::OPTIONAL);
- property_config->mutable_indexing_config()->set_term_match_type(
+ property_config->mutable_string_indexing_config()->set_term_match_type(
TermMatchType::EXACT_ONLY);
- property_config->mutable_indexing_config()->set_tokenizer_type(
- IndexingConfig::TokenizerType::PLAIN);
+ property_config->mutable_string_indexing_config()->set_tokenizer_type(
+ StringIndexingConfig::TokenizerType::PLAIN);
ICING_ASSERT_OK_AND_ASSIGN(
std::unique_ptr<SchemaStore> schema_store,
@@ -2168,10 +2168,10 @@ TEST_F(DocumentStoreTest, OptimizedUpdateSchemaStoreDeletesInvalidDocuments) {
property_config->set_property_name("subject");
property_config->set_data_type(PropertyConfigProto::DataType::STRING);
property_config->set_cardinality(PropertyConfigProto::Cardinality::OPTIONAL);
- property_config->mutable_indexing_config()->set_term_match_type(
+ property_config->mutable_string_indexing_config()->set_term_match_type(
TermMatchType::EXACT_ONLY);
- property_config->mutable_indexing_config()->set_tokenizer_type(
- IndexingConfig::TokenizerType::PLAIN);
+ property_config->mutable_string_indexing_config()->set_tokenizer_type(
+ StringIndexingConfig::TokenizerType::PLAIN);
ICING_ASSERT_OK_AND_ASSIGN(
std::unique_ptr<SchemaStore> schema_store,
diff --git a/icing/testing/common-matchers.h b/icing/testing/common-matchers.h
index 7e14d0a..31d41fc 100644
--- a/icing/testing/common-matchers.h
+++ b/icing/testing/common-matchers.h
@@ -32,10 +32,11 @@ namespace lib {
// Used to match Token(Token::Type type, std::string_view text)
MATCHER_P2(EqualsToken, type, text, "") {
+ std::string arg_string(arg.text.data(), arg.text.length());
if (arg.type != type || arg.text != text) {
*result_listener << IcingStringUtil::StringPrintf(
"(Expected: type=%d, text=\"%s\". Actual: type=%d, text=\"%s\")", type,
- &text[0], arg.type, arg.text.data());
+ text, arg.type, arg_string.c_str());
return false;
}
return true;
diff --git a/icing/testing/schema-generator.h b/icing/testing/schema-generator.h
index e733612..863f43f 100644
--- a/icing/testing/schema-generator.h
+++ b/icing/testing/schema-generator.h
@@ -31,9 +31,11 @@ class ExactStringPropertyGenerator {
prop.set_property_name(name.data(), name.length());
prop.set_data_type(PropertyConfigProto::DataType::STRING);
prop.set_cardinality(PropertyConfigProto::Cardinality::OPTIONAL);
- IndexingConfig* indexing_config = prop.mutable_indexing_config();
- indexing_config->set_term_match_type(TermMatchType::EXACT_ONLY);
- indexing_config->set_tokenizer_type(IndexingConfig::TokenizerType::PLAIN);
+ StringIndexingConfig* string_indexing_config =
+ prop.mutable_string_indexing_config();
+ string_indexing_config->set_term_match_type(TermMatchType::EXACT_ONLY);
+ string_indexing_config->set_tokenizer_type(
+ StringIndexingConfig::TokenizerType::PLAIN);
return prop;
}
};
diff --git a/icing/tokenization/icu/icu-language-segmenter.cc b/icing/tokenization/icu/icu-language-segmenter.cc
index d43a78d..74d22cd 100644
--- a/icing/tokenization/icu/icu-language-segmenter.cc
+++ b/icing/tokenization/icu/icu-language-segmenter.cc
@@ -169,7 +169,7 @@ class IcuLanguageSegmenterIterator : public LanguageSegmenter::Iterator {
// Returns true on success
bool Initialize() {
UErrorCode status = U_ZERO_ERROR;
- utext_openUTF8(&u_text_, text_.data(), /*length=*/-1, &status);
+ utext_openUTF8(&u_text_, text_.data(), text_.length(), &status);
break_iterator_ = ubrk_open(UBRK_WORD, locale_.data(), /*text=*/nullptr,
/*textLength=*/0, &status);
ubrk_setUText(break_iterator_, &u_text_, &status);
diff --git a/icing/tokenization/plain-tokenizer_test.cc b/icing/tokenization/plain-tokenizer_test.cc
index d9db75a..df0981b 100644
--- a/icing/tokenization/plain-tokenizer_test.cc
+++ b/icing/tokenization/plain-tokenizer_test.cc
@@ -43,10 +43,10 @@ class PlainTokenizerTest : public ::testing::Test {
};
TEST_F(PlainTokenizerTest, CreationWithNullPointerShouldFail) {
- EXPECT_THAT(
- tokenizer_factory::CreateIndexingTokenizer(
- IndexingConfig::TokenizerType::PLAIN, /*lang_segmenter=*/nullptr),
- StatusIs(libtextclassifier3::StatusCode::FAILED_PRECONDITION));
+ EXPECT_THAT(tokenizer_factory::CreateIndexingTokenizer(
+ StringIndexingConfig::TokenizerType::PLAIN,
+ /*lang_segmenter=*/nullptr),
+ StatusIs(libtextclassifier3::StatusCode::FAILED_PRECONDITION));
}
TEST_F(PlainTokenizerTest, Simple) {
@@ -54,10 +54,10 @@ TEST_F(PlainTokenizerTest, Simple) {
ICING_ASSERT_OK_AND_ASSIGN(
auto language_segmenter,
language_segmenter_factory::Create(std::move(options)));
- ICING_ASSERT_OK_AND_ASSIGN(
- std::unique_ptr<Tokenizer> plain_tokenizer,
- tokenizer_factory::CreateIndexingTokenizer(
- IndexingConfig::TokenizerType::PLAIN, language_segmenter.get()));
+ ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<Tokenizer> plain_tokenizer,
+ tokenizer_factory::CreateIndexingTokenizer(
+ StringIndexingConfig::TokenizerType::PLAIN,
+ language_segmenter.get()));
EXPECT_THAT(plain_tokenizer->TokenizeAll(""), IsOkAndHolds(IsEmpty()));
@@ -88,10 +88,10 @@ TEST_F(PlainTokenizerTest, Whitespace) {
ICING_ASSERT_OK_AND_ASSIGN(
auto language_segmenter,
language_segmenter_factory::Create(std::move(options)));
- ICING_ASSERT_OK_AND_ASSIGN(
- std::unique_ptr<Tokenizer> plain_tokenizer,
- tokenizer_factory::CreateIndexingTokenizer(
- IndexingConfig::TokenizerType::PLAIN, language_segmenter.get()));
+ ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<Tokenizer> plain_tokenizer,
+ tokenizer_factory::CreateIndexingTokenizer(
+ StringIndexingConfig::TokenizerType::PLAIN,
+ language_segmenter.get()));
// There're many unicode characters that are whitespaces, here we choose tabs
// to represent others.
@@ -116,10 +116,10 @@ TEST_F(PlainTokenizerTest, Punctuation) {
ICING_ASSERT_OK_AND_ASSIGN(
auto language_segmenter,
language_segmenter_factory::Create(std::move(options)));
- ICING_ASSERT_OK_AND_ASSIGN(
- std::unique_ptr<Tokenizer> plain_tokenizer,
- tokenizer_factory::CreateIndexingTokenizer(
- IndexingConfig::TokenizerType::PLAIN, language_segmenter.get()));
+ ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<Tokenizer> plain_tokenizer,
+ tokenizer_factory::CreateIndexingTokenizer(
+ StringIndexingConfig::TokenizerType::PLAIN,
+ language_segmenter.get()));
// Half-width punctuation marks are filtered out.
EXPECT_THAT(plain_tokenizer->TokenizeAll(
@@ -147,10 +147,10 @@ TEST_F(PlainTokenizerTest, SpecialCharacters) {
ICING_ASSERT_OK_AND_ASSIGN(
auto language_segmenter,
language_segmenter_factory::Create(std::move(options)));
- ICING_ASSERT_OK_AND_ASSIGN(
- std::unique_ptr<Tokenizer> plain_tokenizer,
- tokenizer_factory::CreateIndexingTokenizer(
- IndexingConfig::TokenizerType::PLAIN, language_segmenter.get()));
+ ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<Tokenizer> plain_tokenizer,
+ tokenizer_factory::CreateIndexingTokenizer(
+ StringIndexingConfig::TokenizerType::PLAIN,
+ language_segmenter.get()));
// Right now we don't have special logic for these characters, just output
// them as tokens.
@@ -170,10 +170,10 @@ TEST_F(PlainTokenizerTest, CJKT) {
ICING_ASSERT_OK_AND_ASSIGN(
auto language_segmenter,
language_segmenter_factory::Create(std::move(options)));
- ICING_ASSERT_OK_AND_ASSIGN(
- std::unique_ptr<Tokenizer> plain_tokenizer,
- tokenizer_factory::CreateIndexingTokenizer(
- IndexingConfig::TokenizerType::PLAIN, language_segmenter.get()));
+ ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<Tokenizer> plain_tokenizer,
+ tokenizer_factory::CreateIndexingTokenizer(
+ StringIndexingConfig::TokenizerType::PLAIN,
+ language_segmenter.get()));
// In plain tokenizer, CJKT characters are handled the same way as non-CJKT
// characters, just add these tests as sanity checks.
@@ -224,10 +224,10 @@ TEST_F(PlainTokenizerTest, ResetToTokenAfterSimple) {
ICING_ASSERT_OK_AND_ASSIGN(
auto language_segmenter,
language_segmenter_factory::Create(std::move(options)));
- ICING_ASSERT_OK_AND_ASSIGN(
- std::unique_ptr<Tokenizer> plain_tokenizer,
- tokenizer_factory::CreateIndexingTokenizer(
- IndexingConfig::TokenizerType::PLAIN, language_segmenter.get()));
+ ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<Tokenizer> plain_tokenizer,
+ tokenizer_factory::CreateIndexingTokenizer(
+ StringIndexingConfig::TokenizerType::PLAIN,
+ language_segmenter.get()));
constexpr std::string_view kText = "f b";
auto iterator = plain_tokenizer->Tokenize(kText).ValueOrDie();
@@ -243,10 +243,10 @@ TEST_F(PlainTokenizerTest, ResetToTokenBeforeSimple) {
ICING_ASSERT_OK_AND_ASSIGN(
auto language_segmenter,
language_segmenter_factory::Create(std::move(options)));
- ICING_ASSERT_OK_AND_ASSIGN(
- std::unique_ptr<Tokenizer> plain_tokenizer,
- tokenizer_factory::CreateIndexingTokenizer(
- IndexingConfig::TokenizerType::PLAIN, language_segmenter.get()));
+ ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<Tokenizer> plain_tokenizer,
+ tokenizer_factory::CreateIndexingTokenizer(
+ StringIndexingConfig::TokenizerType::PLAIN,
+ language_segmenter.get()));
constexpr std::string_view kText = "f b";
auto iterator = plain_tokenizer->Tokenize(kText).ValueOrDie();
@@ -262,10 +262,10 @@ TEST_F(PlainTokenizerTest, ResetToTokenAfter) {
ICING_ASSERT_OK_AND_ASSIGN(
auto language_segmenter,
language_segmenter_factory::Create(std::move(options)));
- ICING_ASSERT_OK_AND_ASSIGN(
- std::unique_ptr<Tokenizer> plain_tokenizer,
- tokenizer_factory::CreateIndexingTokenizer(
- IndexingConfig::TokenizerType::PLAIN, language_segmenter.get()));
+ ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<Tokenizer> plain_tokenizer,
+ tokenizer_factory::CreateIndexingTokenizer(
+ StringIndexingConfig::TokenizerType::PLAIN,
+ language_segmenter.get()));
constexpr std::string_view kText = " foo . bar baz.. bat ";
EXPECT_THAT(plain_tokenizer->TokenizeAll(kText),
@@ -312,10 +312,10 @@ TEST_F(PlainTokenizerTest, ResetToTokenBefore) {
ICING_ASSERT_OK_AND_ASSIGN(
auto language_segmenter,
language_segmenter_factory::Create(std::move(options)));
- ICING_ASSERT_OK_AND_ASSIGN(
- std::unique_ptr<Tokenizer> plain_tokenizer,
- tokenizer_factory::CreateIndexingTokenizer(
- IndexingConfig::TokenizerType::PLAIN, language_segmenter.get()));
+ ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<Tokenizer> plain_tokenizer,
+ tokenizer_factory::CreateIndexingTokenizer(
+ StringIndexingConfig::TokenizerType::PLAIN,
+ language_segmenter.get()));
constexpr std::string_view kText = " foo . bar baz.. bat ";
EXPECT_THAT(plain_tokenizer->TokenizeAll(kText),
diff --git a/icing/tokenization/raw-query-tokenizer.cc b/icing/tokenization/raw-query-tokenizer.cc
index 8b2edc9..50b25c5 100644
--- a/icing/tokenization/raw-query-tokenizer.cc
+++ b/icing/tokenization/raw-query-tokenizer.cc
@@ -26,6 +26,9 @@
#include "icing/text_classifier/lib3/utils/base/status.h"
#include "icing/text_classifier/lib3/utils/base/statusor.h"
#include "icing/absl_ports/canonical_errors.h"
+#include "icing/absl_ports/str_join.h"
+#include "icing/schema/schema-util.h"
+#include "icing/schema/section-manager.h"
#include "icing/tokenization/language-segmenter.h"
#include "icing/tokenization/token.h"
#include "icing/tokenization/tokenizer.h"
@@ -70,7 +73,7 @@ constexpr char kColon = ':';
constexpr char kLeftParentheses = '(';
constexpr char kRightParentheses = ')';
constexpr char kExclusion = '-';
-constexpr char kOrOperator[] = "OR";
+constexpr std::string_view kOrOperator = "OR";
enum State {
// Ready to process any terms
@@ -100,10 +103,14 @@ enum State {
// When seeing right parentheses
CLOSING_PARENTHESES = 8,
+ PROCESSING_NON_ASCII_ALPHABETIC_TERM = 9,
+
+ PROCESSING_PROPERTY_TERM_APPENDING = 10,
+
// Valid state count
- STATE_COUNT = 9,
+ STATE_COUNT = 11,
- INVALID = 10
+ INVALID = 12
};
enum TermType {
@@ -111,27 +118,29 @@ enum TermType {
WHITESPACE = 0,
// A term that consists of unicode alphabetic and numeric characters
- ALPHANUMERIC_TERM = 1,
+ ASCII_ALPHANUMERIC_TERM = 1,
+
+ NON_ASCII_ALPHABETIC_TERM = 2,
// "("
- LEFT_PARENTHESES = 2,
+ LEFT_PARENTHESES = 3,
// ")"
- RIGHT_PARENTHESES = 3,
+ RIGHT_PARENTHESES = 4,
// "-"
- EXCLUSION_OPERATOR = 4,
+ EXCLUSION_OPERATOR = 5,
// "OR"
- OR_OPERATOR = 5,
+ OR_OPERATOR = 6,
// ":"
- COLON = 6,
+ COLON = 7,
// All the other characters seen that are not the types above
- OTHER = 7,
+ OTHER = 8,
- TYPE_COUNT = 8
+ TYPE_COUNT = 9
};
enum ActionOrError {
@@ -145,6 +154,9 @@ enum ActionOrError {
// Ignore / throw away the current term
IGNORE = 2,
+ // Concatenate with next term
+ CONCATENATE = 3,
+
// Errors
ERROR_UNKNOWN = 100,
ERROR_NO_WHITESPACE_AROUND_OR = 101,
@@ -154,6 +166,7 @@ enum ActionOrError {
ERROR_EXCLUSION_PROPERTY_TOGETHER = 105,
ERROR_EXCLUSION_OR_TOGETHER = 106,
ERROR_PROPERTY_OR_TOGETHER = 107,
+ ERROR_NON_ASCII_AS_PROPERTY_NAME = 108,
};
std::string_view GetErrorMessage(ActionOrError maybe_error) {
@@ -175,6 +188,8 @@ std::string_view GetErrorMessage(ActionOrError maybe_error) {
return "Exclusion and OR operators can't be used together";
case ERROR_PROPERTY_OR_TOGETHER:
return "Property restriction and OR operators can't be used together";
+ case ERROR_NON_ASCII_AS_PROPERTY_NAME:
+ return "Characters in property name must all be ASCII.";
default:
return "";
}
@@ -186,7 +201,7 @@ std::string_view GetErrorMessage(ActionOrError maybe_error) {
// States:
//
// READY = 0
-// PROCESSING_ALPHANUMERIC_TERM = 1
+// PROCESSING_ASCII_ALPHANUMERIC_TERM = 1
// PROCESSING_EXCLUSION = 2
// PROCESSING_EXCLUSION_TERM = 3
// PROCESSING_PROPERTY_RESTRICT = 4
@@ -194,24 +209,28 @@ std::string_view GetErrorMessage(ActionOrError maybe_error) {
// PROCESSING_OR = 6
// OPENING_PARENTHESES = 7
// CLOSING_PARENTHESES = 8
+// PROCESSING_NON_ASCII_ALPHABETIC_TERM = 9
+// PROCESSING_PROPERTY_TERM_APPENDING = 10
//
// Actions:
//
// OUTPUT = a
// KEEP = b
// IGNORE = c
+// CONCAT = d, concatenate the current term and the new term.
//
-// ========================================================
-// Transition Table || 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 |
-// ===========================================================================
-// WHITESPACE || 0,c | 0,a | 0,c | 0,a | 0,a | 0,a | 0,a | 0,a | 0,a |
-// ALPHANUMERIC_TERM || 1,c | 1,a | 3,a | 1,a | 5,a | 1,a |ERROR| 1,a | 1,a |
-// LEFT_PARENTHESES || 7,c | 7,a |ERROR| 7,a |ERROR| 7,a | 7,a | 7,a | 7,a |
-// RIGHT_PARENTHESES || 8,c | 8,a | 8,c | 8,a | 8,a | 8,a | 8,c | 8,a | 8,a |
-// EXCLUSION_OPERATOR || 2,c | 0,a | 2,c | 0,a |ERROR| 0,a |ERROR| 2,a | 2,a |
-// OR_OPERATOR || 6,c |ERROR|ERROR|ERROR|ERROR|ERROR|ERROR| 7,b | 6,a |
-// COLON || 0,c | 4,b |ERROR|ERROR| 4,b | 0,a |ERROR| 0,a |ERROR|
-// OTHER || 0,c | 0,a | 0,c | 0,a | 0,a | 0,a | 0,a | 0,a | 0,a |
+// =============================================================================
+// Transition || 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 |
+// =============================================================================
+// WHITESPACE || 0,c| 0,a| 0,c| 0,a| 0,a| 0,a| 0,a| 0,a| 0,a| 0,a| 0,a|
+// ASCII_ALPHA || 1,c| 1,d| 3,a| 1,a| 5,a| 1,a|ERR | 1,a| 1,a| 1,a|10,d|
+// NONASCII_ALPHA || 9,c| 9,a| 3,a| 9,a| 5,a| 9,a|ERR | 9,a| 9,a| 9,a|10,d|
+// LEFT_PAREN || 7,c| 7,a|ERR | 7,a|ERR | 7,a| 7,a| 7,a| 7,a| 7,a| 7,a|
+// RIGHT_PAREN || 8,c| 8,a| 8,c| 8,a| 8,a| 8,a| 8,c| 8,a| 8,a| 8,a| 8,a|
+// EXCLUSION_OP || 2,c| 0,a| 2,c| 0,a|ERR | 0,a|ERR | 2,a| 2,a| 0,a| 0,a|
+// OR_OPERATOR || 6,c|ERR |ERR |ERR |ERR |ERR |ERR | 7,b| 6,a|ERR |ERR |
+// COLON || 0,c| 4,b|ERR |ERR | 4,b|10,d|ERR | 0,a|ERR |ERR |10,d|
+// OTHER || 0,c| 0,a| 0,c| 0,a| 0,a| 0,a| 0,a| 0,a| 0,a| 0,a| 0,a|
//
// Each cell is a rule that consists of 4 things:
// [current state] + [next term type] -> [new state] + [action]
@@ -234,33 +253,46 @@ std::string_view GetErrorMessage(ActionOrError maybe_error) {
// like "+", "&", "@", "#" in indexing and query tokenizers.
constexpr State state_transition_rules[STATE_COUNT][TYPE_COUNT] = {
/*State: Ready*/
- {READY, PROCESSING_ALPHANUMERIC_TERM, OPENING_PARENTHESES,
- CLOSING_PARENTHESES, PROCESSING_EXCLUSION, PROCESSING_OR, READY, READY},
+ {READY, PROCESSING_ALPHANUMERIC_TERM, PROCESSING_NON_ASCII_ALPHABETIC_TERM,
+ OPENING_PARENTHESES, CLOSING_PARENTHESES, PROCESSING_EXCLUSION,
+ PROCESSING_OR, READY, READY},
/*State: PROCESSING_ALPHANUMERIC_TERM*/
- {READY, PROCESSING_ALPHANUMERIC_TERM, OPENING_PARENTHESES,
- CLOSING_PARENTHESES, READY, INVALID, PROCESSING_PROPERTY_RESTRICT, READY},
+ {READY, PROCESSING_ALPHANUMERIC_TERM, PROCESSING_NON_ASCII_ALPHABETIC_TERM,
+ OPENING_PARENTHESES, CLOSING_PARENTHESES, READY, INVALID,
+ PROCESSING_PROPERTY_RESTRICT, READY},
/*State: PROCESSING_EXCLUSION*/
- {READY, PROCESSING_EXCLUSION_TERM, INVALID, CLOSING_PARENTHESES,
- PROCESSING_EXCLUSION, INVALID, INVALID, READY},
+ {READY, PROCESSING_EXCLUSION_TERM, PROCESSING_EXCLUSION_TERM, INVALID,
+ CLOSING_PARENTHESES, PROCESSING_EXCLUSION, INVALID, INVALID, READY},
/*State: PROCESSING_EXCLUSION_TERM*/
- {READY, PROCESSING_ALPHANUMERIC_TERM, OPENING_PARENTHESES,
- CLOSING_PARENTHESES, READY, INVALID, INVALID, READY},
+ {READY, PROCESSING_ALPHANUMERIC_TERM, PROCESSING_NON_ASCII_ALPHABETIC_TERM,
+ OPENING_PARENTHESES, CLOSING_PARENTHESES, READY, INVALID, INVALID, READY},
/*State: PROCESSING_PROPERTY_RESTRICT*/
- {READY, PROCESSING_PROPERTY_TERM, INVALID, CLOSING_PARENTHESES, INVALID,
- INVALID, PROCESSING_PROPERTY_RESTRICT, READY},
+ {READY, PROCESSING_PROPERTY_TERM, PROCESSING_PROPERTY_TERM, INVALID,
+ CLOSING_PARENTHESES, INVALID, INVALID, PROCESSING_PROPERTY_RESTRICT,
+ READY},
/*State: PROCESSING_PROPERTY_TERM*/
- {READY, PROCESSING_ALPHANUMERIC_TERM, OPENING_PARENTHESES,
- CLOSING_PARENTHESES, READY, INVALID, READY, READY},
+ {READY, PROCESSING_ALPHANUMERIC_TERM, PROCESSING_NON_ASCII_ALPHABETIC_TERM,
+ OPENING_PARENTHESES, CLOSING_PARENTHESES, READY, INVALID,
+ PROCESSING_PROPERTY_TERM_APPENDING, READY},
/*State: PROCESSING_OR*/
- {READY, INVALID, OPENING_PARENTHESES, CLOSING_PARENTHESES, INVALID, INVALID,
- INVALID, READY},
+ {READY, INVALID, INVALID, OPENING_PARENTHESES, CLOSING_PARENTHESES, INVALID,
+ INVALID, INVALID, READY},
/*State: OPENING_PARENTHESES*/
- {READY, PROCESSING_ALPHANUMERIC_TERM, OPENING_PARENTHESES,
- CLOSING_PARENTHESES, PROCESSING_EXCLUSION, OPENING_PARENTHESES, READY,
- READY},
+ {READY, PROCESSING_ALPHANUMERIC_TERM, PROCESSING_NON_ASCII_ALPHABETIC_TERM,
+ OPENING_PARENTHESES, CLOSING_PARENTHESES, PROCESSING_EXCLUSION,
+ OPENING_PARENTHESES, READY, READY},
/*State: CLOSING_PARENTHESES*/
- {READY, PROCESSING_ALPHANUMERIC_TERM, OPENING_PARENTHESES,
- CLOSING_PARENTHESES, PROCESSING_EXCLUSION, PROCESSING_OR, INVALID, READY}};
+ {READY, PROCESSING_ALPHANUMERIC_TERM, PROCESSING_NON_ASCII_ALPHABETIC_TERM,
+ OPENING_PARENTHESES, CLOSING_PARENTHESES, PROCESSING_EXCLUSION,
+ PROCESSING_OR, INVALID, READY},
+ /*State: PROCESSING_NON_ASCII_ALPHABETIC_TERM*/
+ {READY, PROCESSING_ALPHANUMERIC_TERM, PROCESSING_NON_ASCII_ALPHABETIC_TERM,
+ OPENING_PARENTHESES, CLOSING_PARENTHESES, READY, INVALID, INVALID, READY},
+ /*State: PROCESSING_PROPERTY_TERM_APPENDING*/
+ {READY, PROCESSING_PROPERTY_TERM_APPENDING,
+ PROCESSING_PROPERTY_TERM_APPENDING, OPENING_PARENTHESES,
+ CLOSING_PARENTHESES, READY, INVALID, PROCESSING_PROPERTY_TERM_APPENDING,
+ READY}};
// We use a 2D array to encode the action rules,
// The value of action_rules[state1][term_type1] means "what action we need to
@@ -269,62 +301,121 @@ constexpr State state_transition_rules[STATE_COUNT][TYPE_COUNT] = {
// NOTE: Please update the state transition table above if this is updated.
constexpr ActionOrError action_rules[STATE_COUNT][TYPE_COUNT] = {
/*State: Ready*/
- {IGNORE, IGNORE, IGNORE, IGNORE, IGNORE, IGNORE, IGNORE, IGNORE},
+ {IGNORE, IGNORE, IGNORE, IGNORE, IGNORE, IGNORE, IGNORE, IGNORE, IGNORE},
/*State: PROCESSING_ALPHANUMERIC_TERM*/
- {OUTPUT, OUTPUT, OUTPUT, OUTPUT, OUTPUT, ERROR_NO_WHITESPACE_AROUND_OR,
- KEEP, OUTPUT},
+ {OUTPUT, OUTPUT, OUTPUT, OUTPUT, OUTPUT, OUTPUT,
+ ERROR_NO_WHITESPACE_AROUND_OR, KEEP, OUTPUT},
/*State: PROCESSING_EXCLUSION*/
- {IGNORE, OUTPUT, ERROR_GROUP_AFTER_EXCLUSION, IGNORE, IGNORE,
+ {IGNORE, OUTPUT, OUTPUT, ERROR_GROUP_AFTER_EXCLUSION, IGNORE, IGNORE,
ERROR_EXCLUSION_OR_TOGETHER, ERROR_EXCLUSION_PROPERTY_TOGETHER, IGNORE},
/*State: PROCESSING_EXCLUSION_TERM*/
- {OUTPUT, OUTPUT, OUTPUT, OUTPUT, OUTPUT, ERROR_NO_WHITESPACE_AROUND_OR,
- ERROR_EXCLUSION_PROPERTY_TOGETHER, OUTPUT},
+ {OUTPUT, OUTPUT, OUTPUT, OUTPUT, OUTPUT, OUTPUT,
+ ERROR_NO_WHITESPACE_AROUND_OR, ERROR_EXCLUSION_PROPERTY_TOGETHER, OUTPUT},
/*State: PROCESSING_PROPERTY_RESTRICT*/
- {OUTPUT, OUTPUT, ERROR_GROUP_AFTER_PROPERTY_RESTRICTION, OUTPUT,
+ {OUTPUT, OUTPUT, OUTPUT, ERROR_GROUP_AFTER_PROPERTY_RESTRICTION, OUTPUT,
ERROR_EXCLUSION_PROPERTY_TOGETHER, ERROR_PROPERTY_OR_TOGETHER, KEEP,
OUTPUT},
/*State: PROCESSING_PROPERTY_TERM*/
- {OUTPUT, OUTPUT, OUTPUT, OUTPUT, OUTPUT, ERROR_NO_WHITESPACE_AROUND_OR,
- OUTPUT, OUTPUT},
+ {OUTPUT, OUTPUT, OUTPUT, OUTPUT, OUTPUT, OUTPUT,
+ ERROR_NO_WHITESPACE_AROUND_OR, CONCATENATE, OUTPUT},
/*State: PROCESSING_OR*/
- {OUTPUT, ERROR_NO_WHITESPACE_AROUND_OR, OUTPUT, IGNORE,
- ERROR_NO_WHITESPACE_AROUND_OR, ERROR_NO_WHITESPACE_AROUND_OR,
- ERROR_NO_WHITESPACE_AROUND_OR, OUTPUT},
+ {OUTPUT, ERROR_NO_WHITESPACE_AROUND_OR, ERROR_NO_WHITESPACE_AROUND_OR,
+ OUTPUT, IGNORE, ERROR_NO_WHITESPACE_AROUND_OR,
+ ERROR_NO_WHITESPACE_AROUND_OR, ERROR_NO_WHITESPACE_AROUND_OR, OUTPUT},
/*State: OPENING_PARENTHESES*/
- {OUTPUT, OUTPUT, OUTPUT, OUTPUT, OUTPUT, KEEP, OUTPUT, OUTPUT},
+ {OUTPUT, OUTPUT, OUTPUT, OUTPUT, OUTPUT, OUTPUT, KEEP, OUTPUT, OUTPUT},
/*State: CLOSING_PARENTHESES*/
+ {OUTPUT, OUTPUT, OUTPUT, OUTPUT, OUTPUT, OUTPUT, OUTPUT,
+ ERROR_GROUP_AS_PROPERTY_NAME, OUTPUT},
+ /*State: PROCESSING_NON_ASCII_ALPHABETIC_TERM*/
{OUTPUT, OUTPUT, OUTPUT, OUTPUT, OUTPUT, OUTPUT,
- ERROR_GROUP_AS_PROPERTY_NAME, OUTPUT}};
-
-// Helper function to get the TermType of the input term.
-TermType GetTermType(std::string_view term) {
- if (term.length() == 1) {
- // Must be an ASCII char
- const char& first_term_char = term[0];
- if (first_term_char == kWhitespace) {
- return WHITESPACE;
- } else if (first_term_char == kColon) {
- return COLON;
- } else if (first_term_char == kLeftParentheses) {
- return LEFT_PARENTHESES;
- } else if (first_term_char == kRightParentheses) {
- return RIGHT_PARENTHESES;
- } else if (first_term_char == kExclusion) {
- return EXCLUSION_OPERATOR;
- }
- } else if (term.length() == 2 && term == kOrOperator) {
- return OR_OPERATOR;
+ ERROR_NO_WHITESPACE_AROUND_OR, ERROR_NON_ASCII_AS_PROPERTY_NAME, OUTPUT},
+ /*State: PROCESSING_PROPERTY_TERM_APPENDING*/
+ {OUTPUT, CONCATENATE, CONCATENATE, OUTPUT, OUTPUT, OUTPUT,
+ ERROR_NO_WHITESPACE_AROUND_OR, CONCATENATE, OUTPUT}};
+
+// Determines the length of the whitespace term beginning at text[pos] and
+// returns a pair with the WHITESPACE TermType and a string_view of the
+// whitespace term.
+std::pair<TermType, std::string_view> GetWhitespaceTerm(std::string_view text,
+ size_t pos) {
+ size_t cur = pos;
+ while (cur < text.length() && text[cur] == kWhitespace) {
+ ++cur;
}
+ return std::make_pair(WHITESPACE, text.substr(pos, cur - pos));
+}
+
+// Determines the length of the potential content term beginning at text[pos]
+// and returns a pair with the appropriate TermType and a string_view of the
+// content term.
+//
+// NOTE: The potential content term could multiple content terms (segmentation
+// is needed to determine this), a property restrict (depending on other
+// neighboring tokens). It could also be multiple content terms surrounding an
+// OR operator (segmentation is also needed to determine this).
+std::pair<TermType, std::string_view> GetContentTerm(std::string_view text,
+ size_t pos) {
+ size_t len = 0;
// Checks the first char to see if it's an ASCII term
- if (i18n_utils::IsAscii(term[0])) {
- if (std::isalnum(term[0])) {
- return ALPHANUMERIC_TERM;
+ TermType type = ASCII_ALPHANUMERIC_TERM;
+ if (!i18n_utils::IsAscii(text[pos])) {
+ type = NON_ASCII_ALPHABETIC_TERM;
+ } else if (std::isalnum(text[pos])) {
+ type = OTHER;
+ }
+ for (size_t cur = pos; cur < text.length() && len == 0; ++cur) {
+ switch (text[cur]) {
+ case kLeftParentheses:
+ [[fallthrough]];
+ case kRightParentheses:
+ [[fallthrough]];
+ case kExclusion:
+ [[fallthrough]];
+ case kWhitespace:
+ [[fallthrough]];
+ case kColon:
+ // If we reach any of our special characters (colon, exclusion or
+ // parentheses), then we've reached the end of the content term. Set len
+ // and exit the loop.
+ len = cur - pos;
+ break;
+ default:
+ break;
}
- return OTHER;
}
- // All non-ASCII terms are alphabetic since language segmenter already
- // filters out non-ASCII and non-alphabetic terms
- return ALPHANUMERIC_TERM;
+ if (len == 0) {
+ // If len isn't set, then we must have reached the end of the string.
+ len = text.length() - pos;
+ }
+ return std::make_pair(type, text.substr(pos, len));
+}
+
+// Determines the type and length of the term beginning at text[pos].
+std::pair<TermType, std::string_view> GetTerm(std::string_view text,
+ size_t pos) {
+ switch (text[pos]) {
+ case kLeftParentheses:
+ return std::make_pair(LEFT_PARENTHESES, text.substr(pos, 1));
+ case kRightParentheses:
+ return std::make_pair(RIGHT_PARENTHESES, text.substr(pos, 1));
+ case kExclusion:
+ return std::make_pair(EXCLUSION_OPERATOR, text.substr(pos, 1));
+ case kWhitespace:
+ // Get length of whitespace
+ return GetWhitespaceTerm(text, pos);
+ case kColon:
+ return std::make_pair(COLON, text.substr(pos, 1));
+ case kOrOperator[0]:
+ if (text.length() >= pos + kOrOperator.length() &&
+ text.substr(pos, kOrOperator.length()) == kOrOperator) {
+ return std::make_pair(OR_OPERATOR,
+ text.substr(pos, kOrOperator.length()));
+ }
+ [[fallthrough]];
+ default:
+ return GetContentTerm(text, pos);
+ }
}
// Helper function to remove the last token if it's OR operator. This is used to
@@ -378,12 +469,18 @@ libtextclassifier3::Status OutputToken(State new_state,
TermType current_term_type,
std::vector<Token>* tokens) {
switch (current_term_type) {
- case ALPHANUMERIC_TERM:
+ case ASCII_ALPHANUMERIC_TERM:
+ [[fallthrough]];
+ case NON_ASCII_ALPHABETIC_TERM:
if (new_state == PROCESSING_PROPERTY_TERM) {
- // Asserts extra rule 1: property name must be in ASCII
- if (!i18n_utils::IsAscii(current_term[0])) {
- return absl_ports::InvalidArgumentError(
- "Characters in property name must all be ASCII.");
+ // Asserts extra rule 1: each property name in the property path is a
+ // valid term.
+ for (std::string_view property :
+ absl_ports::StrSplit(current_term, kPropertySeparator)) {
+ if (!SchemaUtil::ValidatePropertyName(property).ok()) {
+ return absl_ports::InvalidArgumentError(
+ GetErrorMessage(ERROR_NON_ASCII_AS_PROPERTY_NAME));
+ }
}
tokens->emplace_back(Token::QUERY_PROPERTY, current_term);
} else {
@@ -416,13 +513,11 @@ libtextclassifier3::Status OutputToken(State new_state,
// Returns:
// OK on success
// INVALID_ARGUMENT with error message on invalid query syntax
-libtextclassifier3::Status ProcessTerm(State* current_state,
- std::string_view* current_term,
- TermType* current_term_type,
- int* unclosed_parentheses_count,
- const std::string_view next_term,
- TermType next_term_type,
- std::vector<Token>* tokens) {
+libtextclassifier3::Status ProcessTerm(
+ State* current_state, std::string_view* current_term,
+ TermType* current_term_type, int* unclosed_parentheses_count,
+ const std::string_view next_term, TermType next_term_type,
+ const LanguageSegmenter* language_segmenter, std::vector<Token>* tokens) {
// Asserts extra rule 4: parentheses must appear in pairs.
if (next_term_type == LEFT_PARENTHESES) {
++(*unclosed_parentheses_count);
@@ -440,8 +535,25 @@ libtextclassifier3::Status ProcessTerm(State* current_state,
}
switch (action_or_error) {
case OUTPUT:
- ICING_RETURN_IF_ERROR(
- OutputToken(new_state, *current_term, *current_term_type, tokens));
+ if (*current_state == PROCESSING_PROPERTY_TERM_APPENDING) {
+ // We appended multiple terms together in case they actually should have
+ // been connected by a colon connector.
+ ICING_ASSIGN_OR_RETURN(std::vector<std::string_view> content_terms,
+ language_segmenter->GetAllTerms(*current_term));
+ for (std::string_view term : content_terms) {
+ TermType type = ASCII_ALPHANUMERIC_TERM;
+ if (!i18n_utils::IsAscii(term[0])) {
+ type = NON_ASCII_ALPHABETIC_TERM;
+ } else if (!std::isalnum(term[0])) {
+ // Skip OTHER tokens here.
+ continue;
+ }
+ ICING_RETURN_IF_ERROR(OutputToken(new_state, term, type, tokens));
+ }
+ } else {
+ ICING_RETURN_IF_ERROR(
+ OutputToken(new_state, *current_term, *current_term_type, tokens));
+ }
[[fallthrough]];
case IGNORE:
*current_term = next_term;
@@ -449,6 +561,11 @@ libtextclassifier3::Status ProcessTerm(State* current_state,
break;
case KEEP:
break;
+ case CONCATENATE:
+ *current_term = std::string_view(
+ current_term->data(),
+ next_term.data() - current_term->data() + next_term.length());
+ break;
default:
return absl_ports::InvalidArgumentError(GetErrorMessage(ERROR_UNKNOWN));
}
@@ -463,56 +580,55 @@ libtextclassifier3::Status ProcessTerm(State* current_state,
// A list of tokens on success
// INVALID_ARGUMENT with error message on invalid query syntax
libtextclassifier3::StatusOr<std::vector<Token>> ProcessTerms(
- std::unique_ptr<LanguageSegmenter::Iterator> base_iterator) {
+ const LanguageSegmenter* language_segmenter,
+ std::vector<std::pair<TermType, std::string_view>> prescanned_terms) {
std::vector<Token> tokens;
State current_state = READY;
std::string_view current_term;
TermType current_term_type;
int unclosed_parentheses_count = 0;
- while (base_iterator->Advance()) {
- const std::string_view next_term = base_iterator->GetTerm();
- size_t colon_position = next_term.find(kColon);
- // Since colon ":" is a word connector per ICU's rule
- // (https://unicode.org/reports/tr29/#Word_Boundaries), strings like
- // "foo:bar" are returned by LanguageSegmenter as one term. Here we're
- // trying to find the first colon as it represents property restriction in
- // raw query.
- if (colon_position == std::string_view::npos) {
- // No colon found
- ICING_RETURN_IF_ERROR(ProcessTerm(&current_state, &current_term,
- &current_term_type,
- &unclosed_parentheses_count, next_term,
- GetTermType(next_term), &tokens));
- } else if (next_term.size() == 1 && next_term[0] == kColon) {
- // The whole term is a colon
+ for (int i = 0; i < prescanned_terms.size(); ++i) {
+ const std::pair<TermType, std::string_view>& prescanned_term =
+ prescanned_terms.at(i);
+ if (prescanned_term.first != ASCII_ALPHANUMERIC_TERM &&
+ prescanned_term.first != NON_ASCII_ALPHABETIC_TERM &&
+ prescanned_term.first != OTHER) {
+ // This can't be a property restrict. Just pass it in.
ICING_RETURN_IF_ERROR(
ProcessTerm(&current_state, &current_term, &current_term_type,
- &unclosed_parentheses_count, next_term, COLON, &tokens));
+ &unclosed_parentheses_count, prescanned_term.second,
+ prescanned_term.first, language_segmenter, &tokens));
} else {
- // String before the colon is the property name
- std::string_view property_name = next_term.substr(0, colon_position);
- ICING_RETURN_IF_ERROR(
- ProcessTerm(&current_state, &current_term, &current_term_type,
- &unclosed_parentheses_count, property_name,
- GetTermType(property_name), &tokens));
- ICING_RETURN_IF_ERROR(
- ProcessTerm(&current_state, &current_term, &current_term_type,
- &unclosed_parentheses_count, std::string_view(&kColon, 1),
- COLON, &tokens));
- // String after the colon is the term that property restriction is applied
- // on.
- std::string_view property_term = next_term.substr(colon_position + 1);
- ICING_RETURN_IF_ERROR(
- ProcessTerm(&current_state, &current_term, &current_term_type,
- &unclosed_parentheses_count, property_term,
- GetTermType(property_term), &tokens));
+ // There's no colon after this term. Now, we need to segment this.
+ ICING_ASSIGN_OR_RETURN(
+ std::vector<std::string_view> content_terms,
+ language_segmenter->GetAllTerms(prescanned_term.second));
+ for (std::string_view term : content_terms) {
+ TermType type = ASCII_ALPHANUMERIC_TERM;
+ if (term == kOrOperator) {
+ // TODO(tjbarron) Decide whether we should revise this and other
+ // handled syntax. This is used to allow queries like "term1,OR,term2"
+ // to succeed. It's not clear if we should allow this or require
+ // clients to ensure that OR operators are always surrounded by
+ // whitespace.
+ type = OR_OPERATOR;
+ } else if (!i18n_utils::IsAscii(term[0])) {
+ type = NON_ASCII_ALPHABETIC_TERM;
+ } else if (!std::isalnum(term[0])) {
+ type = OTHER;
+ }
+ ICING_RETURN_IF_ERROR(ProcessTerm(&current_state, &current_term,
+ &current_term_type,
+ &unclosed_parentheses_count, term,
+ type, language_segmenter, &tokens));
+ }
}
}
// Adds a fake whitespace at the end to flush the last term.
- ICING_RETURN_IF_ERROR(
- ProcessTerm(&current_state, &current_term, &current_term_type,
- &unclosed_parentheses_count,
- std::string_view(&kWhitespace, 1), WHITESPACE, &tokens));
+ ICING_RETURN_IF_ERROR(ProcessTerm(
+ &current_state, &current_term, &current_term_type,
+ &unclosed_parentheses_count, std::string_view(&kWhitespace, 1),
+ WHITESPACE, language_segmenter, &tokens));
if (unclosed_parentheses_count > 0) {
return absl_ports::InvalidArgumentError("Unclosed left parentheses.");
}
@@ -553,10 +669,16 @@ RawQueryTokenizer::Tokenize(std::string_view text) const {
libtextclassifier3::StatusOr<std::vector<Token>> RawQueryTokenizer::TokenizeAll(
std::string_view text) const {
- ICING_ASSIGN_OR_RETURN(
- std::unique_ptr<LanguageSegmenter::Iterator> base_iterator,
- language_segmenter_.Segment(text));
- return ProcessTerms(std::move(base_iterator));
+ // 1. Prescan all terms in the text, to determine which ones are potentially
+ // content and which ones are not.
+ std::vector<std::pair<TermType, std::string_view>> prescanned_terms;
+ for (size_t pos = 0; pos < text.length();) {
+ std::pair<TermType, std::string_view> term_pair = GetTerm(text, pos);
+ pos += term_pair.second.length();
+ prescanned_terms.push_back(term_pair);
+ }
+ // 2. Process the prescanned terms, segmenting content terms as needed.
+ return ProcessTerms(&language_segmenter_, std::move(prescanned_terms));
}
} // namespace lib
diff --git a/icing/tokenization/raw-query-tokenizer_test.cc b/icing/tokenization/raw-query-tokenizer_test.cc
index 9b71e8a..d4af9ed 100644
--- a/icing/tokenization/raw-query-tokenizer_test.cc
+++ b/icing/tokenization/raw-query-tokenizer_test.cc
@@ -59,6 +59,10 @@ TEST_F(RawQueryTokenizerTest, Simple) {
EXPECT_THAT(raw_query_tokenizer->TokenizeAll("Hello World!"),
IsOkAndHolds(ElementsAre(EqualsToken(Token::REGULAR, "Hello"),
EqualsToken(Token::REGULAR, "World"))));
+
+ EXPECT_THAT(raw_query_tokenizer->TokenizeAll("hElLo WORLD"),
+ IsOkAndHolds(ElementsAre(EqualsToken(Token::REGULAR, "hElLo"),
+ EqualsToken(Token::REGULAR, "WORLD"))));
}
TEST_F(RawQueryTokenizerTest, Parentheses) {
@@ -293,6 +297,12 @@ TEST_F(RawQueryTokenizerTest, PropertyRestriction) {
EqualsToken(Token::REGULAR, "term2"))));
EXPECT_THAT(
+ raw_query_tokenizer->TokenizeAll("property1:今天:天气"),
+ IsOkAndHolds(ElementsAre(EqualsToken(Token::QUERY_PROPERTY, "property1"),
+ EqualsToken(Token::REGULAR, "今天"),
+ EqualsToken(Token::REGULAR, "天气"))));
+
+ EXPECT_THAT(
raw_query_tokenizer->TokenizeAll("property1:term1-"),
IsOkAndHolds(ElementsAre(EqualsToken(Token::QUERY_PROPERTY, "property1"),
EqualsToken(Token::REGULAR, "term1"))));
diff --git a/icing/tokenization/tokenizer-factory.cc b/icing/tokenization/tokenizer-factory.cc
index 9ebbce5..9b59acf 100644
--- a/icing/tokenization/tokenizer-factory.cc
+++ b/icing/tokenization/tokenizer-factory.cc
@@ -31,14 +31,14 @@ namespace lib {
namespace tokenizer_factory {
libtextclassifier3::StatusOr<std::unique_ptr<Tokenizer>>
-CreateIndexingTokenizer(IndexingConfig::TokenizerType::Code type,
+CreateIndexingTokenizer(StringIndexingConfig::TokenizerType::Code type,
const LanguageSegmenter* lang_segmenter) {
ICING_RETURN_ERROR_IF_NULL(lang_segmenter);
switch (type) {
- case IndexingConfig::TokenizerType::PLAIN:
+ case StringIndexingConfig::TokenizerType::PLAIN:
return std::make_unique<PlainTokenizer>(lang_segmenter);
- case IndexingConfig::TokenizerType::NONE:
+ case StringIndexingConfig::TokenizerType::NONE:
[[fallthrough]];
default:
// This should never happen.
diff --git a/icing/tokenization/tokenizer-factory.h b/icing/tokenization/tokenizer-factory.h
index f81fd96..8b9226d 100644
--- a/icing/tokenization/tokenizer-factory.h
+++ b/icing/tokenization/tokenizer-factory.h
@@ -37,7 +37,7 @@ namespace tokenizer_factory {
// FAILED_PRECONDITION on any null pointer input
// INVALID_ARGUMENT if tokenizer type is invalid
libtextclassifier3::StatusOr<std::unique_ptr<Tokenizer>>
-CreateIndexingTokenizer(IndexingConfig::TokenizerType::Code type,
+CreateIndexingTokenizer(StringIndexingConfig::TokenizerType::Code type,
const LanguageSegmenter* lang_segmenter);
// All the supported query tokenizer types