// Copyright (C) 2019 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. #include "icing/icing-search-engine.h" #include #include #include #include #include #include "icing/text_classifier/lib3/utils/base/status.h" #include "gmock/gmock.h" #include "gtest/gtest.h" #include "icing/document-builder.h" #include "icing/file/filesystem.h" #include "icing/jni/jni-cache.h" #include "icing/legacy/index/icing-mock-filesystem.h" #include "icing/portable/endian.h" #include "icing/portable/equals-proto.h" #include "icing/portable/platform.h" #include "icing/proto/debug.pb.h" #include "icing/proto/document.pb.h" #include "icing/proto/document_wrapper.pb.h" #include "icing/proto/initialize.pb.h" #include "icing/proto/logging.pb.h" #include "icing/proto/optimize.pb.h" #include "icing/proto/persist.pb.h" #include "icing/proto/reset.pb.h" #include "icing/proto/schema.pb.h" #include "icing/proto/scoring.pb.h" #include "icing/proto/search.pb.h" #include "icing/proto/status.pb.h" #include "icing/proto/storage.pb.h" #include "icing/proto/term.pb.h" #include "icing/proto/usage.pb.h" #include "icing/schema-builder.h" #include "icing/testing/common-matchers.h" #include "icing/testing/fake-clock.h" #include "icing/testing/icu-data-file-helper.h" #include "icing/testing/jni-test-helpers.h" #include "icing/testing/random-string.h" #include "icing/testing/test-data.h" #include "icing/testing/tmp-directory.h" namespace icing { namespace lib { namespace { using ::testing::Eq; using ::testing::Ge; using ::testing::HasSubstr; using ::testing::IsEmpty; using ::testing::Le; using ::testing::SizeIs; constexpr std::string_view kIpsumText = "Lorem ipsum dolor sit amet, consectetur adipiscing elit. Nulla convallis " "scelerisque orci quis hendrerit. Sed augue turpis, sodales eu gravida " "nec, scelerisque nec leo. Maecenas accumsan interdum commodo. Aliquam " "mattis sapien est, sit amet interdum risus dapibus sed. Maecenas leo " "erat, fringilla in nisl a, venenatis gravida metus. Phasellus venenatis, " "orci in aliquet mattis, lectus sapien volutpat arcu, sed hendrerit ligula " "arcu nec mauris. Integer dolor mi, rhoncus eget gravida et, pulvinar et " "nunc. Aliquam ac sollicitudin nisi. Vivamus sit amet urna vestibulum, " "tincidunt eros sed, efficitur nisl. Fusce non neque accumsan, sagittis " "nisi eget, sagittis turpis. Ut pulvinar nibh eu purus feugiat faucibus. " "Donec tellus nulla, tincidunt vel lacus id, bibendum fermentum turpis. " "Nullam ultrices sed nibh vitae aliquet. Ut risus neque, consectetur " "vehicula posuere vitae, convallis eu lorem. Donec semper augue eu nibh " "placerat semper."; // For mocking purpose, we allow tests to provide a custom Filesystem. class TestIcingSearchEngine : public IcingSearchEngine { public: TestIcingSearchEngine(const IcingSearchEngineOptions& options, std::unique_ptr filesystem, std::unique_ptr icing_filesystem, std::unique_ptr clock, std::unique_ptr jni_cache) : IcingSearchEngine(options, std::move(filesystem), std::move(icing_filesystem), std::move(clock), std::move(jni_cache)) {} }; std::string GetTestBaseDir() { return GetTestTempDir() + "/icing"; } // This test is meant to cover all tests relating to IcingSearchEngine::Put. class IcingSearchEnginePutTest : public testing::Test { protected: void SetUp() override { if (!IsCfStringTokenization() && !IsReverseJniTokenization()) { // If we've specified using the reverse-JNI method for segmentation (i.e. // not ICU), then we won't have the ICU data file included to set up. // Technically, we could choose to use reverse-JNI for segmentation AND // include an ICU data file, but that seems unlikely and our current BUILD // setup doesn't do this. // File generated via icu_data_file rule in //icing/BUILD. std::string icu_data_file_path = GetTestFilePath("icing/icu.dat"); ICING_ASSERT_OK( icu_data_file_helper::SetUpICUDataFile(icu_data_file_path)); } filesystem_.CreateDirectoryRecursively(GetTestBaseDir().c_str()); } void TearDown() override { filesystem_.DeleteDirectoryRecursively(GetTestBaseDir().c_str()); } const Filesystem* filesystem() const { return &filesystem_; } private: Filesystem filesystem_; }; constexpr int kMaxSupportedDocumentSize = (1u << 24) - 1; // Non-zero value so we don't override it to be the current time constexpr int64_t kDefaultCreationTimestampMs = 1575492852000; std::string GetIndexDir() { return GetTestBaseDir() + "/index_dir"; } IcingSearchEngineOptions GetDefaultIcingOptions() { IcingSearchEngineOptions icing_options; icing_options.set_base_dir(GetTestBaseDir()); return icing_options; } DocumentProto CreateMessageDocument(std::string name_space, std::string uri) { return DocumentBuilder() .SetKey(std::move(name_space), std::move(uri)) .SetSchema("Message") .AddStringProperty("body", "message body") .SetCreationTimestampMs(kDefaultCreationTimestampMs) .Build(); } SchemaProto CreateMessageSchema() { return SchemaBuilder() .AddType(SchemaTypeConfigBuilder().SetType("Message").AddProperty( PropertyConfigBuilder() .SetName("body") .SetDataTypeString(TERM_MATCH_PREFIX, TOKENIZER_PLAIN) .SetCardinality(CARDINALITY_REQUIRED))) .Build(); } ScoringSpecProto GetDefaultScoringSpec() { ScoringSpecProto scoring_spec; scoring_spec.set_rank_by(ScoringSpecProto::RankingStrategy::DOCUMENT_SCORE); return scoring_spec; } TEST_F(IcingSearchEnginePutTest, MaxTokenLenReturnsOkAndTruncatesTokens) { IcingSearchEngineOptions options = GetDefaultIcingOptions(); // A length of 1 is allowed - even though it would be strange to want // this. options.set_max_token_length(1); IcingSearchEngine icing(options, GetTestJniCache()); EXPECT_THAT(icing.Initialize().status(), ProtoIsOk()); EXPECT_THAT(icing.SetSchema(CreateMessageSchema()).status(), ProtoIsOk()); DocumentProto document = CreateMessageDocument("namespace", "uri"); EXPECT_THAT(icing.Put(document).status(), ProtoIsOk()); // "message" should have been truncated to "m" SearchSpecProto search_spec; search_spec.set_term_match_type(TermMatchType::PREFIX); // The indexed tokens were truncated to length of 1, so "m" will match search_spec.set_query("m"); SearchResultProto expected_search_result_proto; expected_search_result_proto.mutable_status()->set_code(StatusProto::OK); *expected_search_result_proto.mutable_results()->Add()->mutable_document() = document; SearchResultProto actual_results = icing.Search(search_spec, GetDefaultScoringSpec(), ResultSpecProto::default_instance()); EXPECT_THAT(actual_results, EqualsSearchResultIgnoreStatsAndScores( expected_search_result_proto)); // The query token is also truncated to length of 1, so "me"->"m" matches "m" search_spec.set_query("me"); actual_results = icing.Search(search_spec, GetDefaultScoringSpec(), ResultSpecProto::default_instance()); EXPECT_THAT(actual_results, EqualsSearchResultIgnoreStatsAndScores( expected_search_result_proto)); // The query token is still truncated to length of 1, so "massage"->"m" // matches "m" search_spec.set_query("massage"); actual_results = icing.Search(search_spec, GetDefaultScoringSpec(), ResultSpecProto::default_instance()); EXPECT_THAT(actual_results, EqualsSearchResultIgnoreStatsAndScores( expected_search_result_proto)); } TEST_F(IcingSearchEnginePutTest, MaxIntMaxTokenLenReturnsOkTooLargeTokenReturnsResourceExhausted) { IcingSearchEngineOptions options = GetDefaultIcingOptions(); // Set token length to max. This is allowed (it just means never to // truncate tokens). However, this does mean that tokens that exceed the // size of the lexicon will cause indexing to fail. options.set_max_token_length(std::numeric_limits::max()); IcingSearchEngine icing(options, GetTestJniCache()); EXPECT_THAT(icing.Initialize().status(), ProtoIsOk()); EXPECT_THAT(icing.SetSchema(CreateMessageSchema()).status(), ProtoIsOk()); // Add a document that just barely fits under the max document limit. // This will still fail to index because we won't actually have enough // room in the lexicon to fit this content. std::string enormous_string(kMaxSupportedDocumentSize - 256, 'p'); DocumentProto document = DocumentBuilder() .SetKey("namespace", "uri") .SetSchema("Message") .AddStringProperty("body", std::move(enormous_string)) .Build(); EXPECT_THAT(icing.Put(document).status(), ProtoStatusIs(StatusProto::OUT_OF_SPACE)); SearchSpecProto search_spec; search_spec.set_query("p"); search_spec.set_term_match_type(TermMatchType::PREFIX); SearchResultProto expected_search_result_proto; expected_search_result_proto.mutable_status()->set_code(StatusProto::OK); SearchResultProto actual_results = icing.Search(search_spec, GetDefaultScoringSpec(), ResultSpecProto::default_instance()); EXPECT_THAT(actual_results, EqualsSearchResultIgnoreStatsAndScores( expected_search_result_proto)); } TEST_F(IcingSearchEnginePutTest, PutWithoutSchemaFailedPrecondition) { IcingSearchEngine icing(GetDefaultIcingOptions(), GetTestJniCache()); ASSERT_THAT(icing.Initialize().status(), ProtoIsOk()); DocumentProto document = CreateMessageDocument("namespace", "uri"); PutResultProto put_result_proto = icing.Put(document); EXPECT_THAT(put_result_proto.status(), ProtoStatusIs(StatusProto::FAILED_PRECONDITION)); EXPECT_THAT(put_result_proto.status().message(), HasSubstr("Schema not set")); } TEST_F(IcingSearchEnginePutTest, IndexingDocMergeFailureResets) { DocumentProto document = DocumentBuilder() .SetKey("icing", "fake_type/0") .SetSchema("Message") .AddStringProperty("body", kIpsumText) .Build(); // 1. Create an index with a LiteIndex that will only allow one document // before needing a merge. { IcingSearchEngineOptions options = GetDefaultIcingOptions(); options.set_index_merge_size(document.ByteSizeLong()); IcingSearchEngine icing(options, GetTestJniCache()); ASSERT_THAT(icing.Initialize().status(), ProtoIsOk()); ASSERT_THAT(icing.SetSchema(CreateMessageSchema()).status(), ProtoIsOk()); // Add two documents. These should get merged into the main index. EXPECT_THAT(icing.Put(document).status(), ProtoIsOk()); document = DocumentBuilder(document).SetUri("fake_type/1").Build(); EXPECT_THAT(icing.Put(document).status(), ProtoIsOk()); // Add one document. This one should get remain in the lite index. document = DocumentBuilder(document).SetUri("fake_type/2").Build(); EXPECT_THAT(icing.Put(document).status(), ProtoIsOk()); } // 2. Delete the index file to trigger RestoreIndexIfNeeded. std::string idx_subdir = GetIndexDir() + "/idx"; filesystem()->DeleteDirectoryRecursively(idx_subdir.c_str()); // 3. Setup a mock filesystem to fail to grow the main index once. bool has_failed_already = false; auto open_write_lambda = [this, &has_failed_already](const char* filename) { std::string main_lexicon_suffix = "/main-lexicon.prop.2"; std::string filename_string(filename); if (!has_failed_already && filename_string.length() >= main_lexicon_suffix.length() && filename_string.substr( filename_string.length() - main_lexicon_suffix.length(), main_lexicon_suffix.length()) == main_lexicon_suffix) { has_failed_already = true; return -1; } return this->filesystem()->OpenForWrite(filename); }; auto mock_icing_filesystem = std::make_unique(); ON_CALL(*mock_icing_filesystem, OpenForWrite) .WillByDefault(open_write_lambda); // 4. Create the index again. This should trigger index restoration. { IcingSearchEngineOptions options = GetDefaultIcingOptions(); options.set_index_merge_size(document.ByteSizeLong()); TestIcingSearchEngine icing(options, std::make_unique(), std::move(mock_icing_filesystem), std::make_unique(), GetTestJniCache()); ASSERT_THAT(icing.Initialize().status(), ProtoStatusIs(StatusProto::WARNING_DATA_LOSS)); SearchSpecProto search_spec; search_spec.set_query("consectetur"); search_spec.set_term_match_type(TermMatchType::EXACT_ONLY); SearchResultProto results = icing.Search(search_spec, ScoringSpecProto::default_instance(), ResultSpecProto::default_instance()); EXPECT_THAT(results.status(), ProtoIsOk()); EXPECT_THAT(results.next_page_token(), Eq(0)); // Only the last document that was added should still be retrievable. ASSERT_THAT(results.results(), SizeIs(1)); EXPECT_THAT(results.results(0).document().uri(), Eq("fake_type/2")); } } TEST_F(IcingSearchEnginePutTest, PutDocumentShouldLogFunctionLatency) { DocumentProto document = DocumentBuilder() .SetKey("icing", "fake_type/0") .SetSchema("Message") .AddStringProperty("body", "message body") .Build(); auto fake_clock = std::make_unique(); fake_clock->SetTimerElapsedMilliseconds(10); TestIcingSearchEngine icing(GetDefaultIcingOptions(), std::make_unique(), std::make_unique(), std::move(fake_clock), GetTestJniCache()); ASSERT_THAT(icing.Initialize().status(), ProtoIsOk()); ASSERT_THAT(icing.SetSchema(CreateMessageSchema()).status(), ProtoIsOk()); PutResultProto put_result_proto = icing.Put(document); EXPECT_THAT(put_result_proto.status(), ProtoIsOk()); EXPECT_THAT(put_result_proto.put_document_stats().latency_ms(), Eq(10)); } TEST_F(IcingSearchEnginePutTest, PutDocumentShouldLogDocumentStoreStats) { DocumentProto document = DocumentBuilder() .SetKey("icing", "fake_type/0") .SetSchema("Message") .SetCreationTimestampMs(kDefaultCreationTimestampMs) .AddStringProperty("body", "message body") .Build(); auto fake_clock = std::make_unique(); fake_clock->SetTimerElapsedMilliseconds(10); TestIcingSearchEngine icing(GetDefaultIcingOptions(), std::make_unique(), std::make_unique(), std::move(fake_clock), GetTestJniCache()); ASSERT_THAT(icing.Initialize().status(), ProtoIsOk()); ASSERT_THAT(icing.SetSchema(CreateMessageSchema()).status(), ProtoIsOk()); PutResultProto put_result_proto = icing.Put(document); EXPECT_THAT(put_result_proto.status(), ProtoIsOk()); EXPECT_THAT(put_result_proto.put_document_stats().document_store_latency_ms(), Eq(10)); size_t document_size = put_result_proto.put_document_stats().document_size(); EXPECT_THAT(document_size, Ge(document.ByteSizeLong())); EXPECT_THAT(document_size, Le(document.ByteSizeLong() + sizeof(DocumentProto::InternalFields))); } TEST_F(IcingSearchEnginePutTest, PutDocumentShouldLogIndexingStats) { DocumentProto document = DocumentBuilder() .SetKey("icing", "fake_type/0") .SetSchema("Message") .AddStringProperty("body", "message body") .Build(); auto fake_clock = std::make_unique(); fake_clock->SetTimerElapsedMilliseconds(10); TestIcingSearchEngine icing(GetDefaultIcingOptions(), std::make_unique(), std::make_unique(), std::move(fake_clock), GetTestJniCache()); ASSERT_THAT(icing.Initialize().status(), ProtoIsOk()); ASSERT_THAT(icing.SetSchema(CreateMessageSchema()).status(), ProtoIsOk()); PutResultProto put_result_proto = icing.Put(document); EXPECT_THAT(put_result_proto.status(), ProtoIsOk()); EXPECT_THAT(put_result_proto.put_document_stats().index_latency_ms(), Eq(10)); // No merge should happen. EXPECT_THAT(put_result_proto.put_document_stats().index_merge_latency_ms(), Eq(0)); // The input document has 2 tokens. EXPECT_THAT(put_result_proto.put_document_stats() .tokenization_stats() .num_tokens_indexed(), Eq(2)); } TEST_F(IcingSearchEnginePutTest, PutDocumentShouldLogIndexMergeLatency) { DocumentProto document1 = DocumentBuilder() .SetKey("icing", "fake_type/1") .SetSchema("Message") .AddStringProperty("body", kIpsumText) .Build(); DocumentProto document2 = DocumentBuilder() .SetKey("icing", "fake_type/2") .SetSchema("Message") .AddStringProperty("body", kIpsumText) .Build(); // Create an icing instance with index_merge_size = document1's size. IcingSearchEngineOptions icing_options = GetDefaultIcingOptions(); icing_options.set_index_merge_size(document1.ByteSizeLong()); auto fake_clock = std::make_unique(); fake_clock->SetTimerElapsedMilliseconds(10); TestIcingSearchEngine icing(icing_options, std::make_unique(), std::make_unique(), std::move(fake_clock), GetTestJniCache()); ASSERT_THAT(icing.Initialize().status(), ProtoIsOk()); ASSERT_THAT(icing.SetSchema(CreateMessageSchema()).status(), ProtoIsOk()); EXPECT_THAT(icing.Put(document1).status(), ProtoIsOk()); // Putting document2 should trigger an index merge. PutResultProto put_result_proto = icing.Put(document2); EXPECT_THAT(put_result_proto.status(), ProtoIsOk()); EXPECT_THAT(put_result_proto.put_document_stats().index_merge_latency_ms(), Eq(10)); } TEST_F(IcingSearchEnginePutTest, PutDocumentIndexFailureDeletion) { IcingSearchEngine icing(GetDefaultIcingOptions(), GetTestJniCache()); ASSERT_THAT(icing.Initialize().status(), ProtoIsOk()); ASSERT_THAT(icing.SetSchema(CreateMessageSchema()).status(), ProtoIsOk()); // Testing has shown that adding ~600,000 terms generated this way will // fill up the hit buffer. std::vector terms = GenerateUniqueTerms(600000); std::string content = absl_ports::StrJoin(terms, " "); DocumentProto document = DocumentBuilder() .SetKey("namespace", "uri1") .SetSchema("Message") .AddStringProperty("body", "foo " + content) .Build(); // We failed to add the document to the index fully. This means that we should // reject the document from Icing entirely. ASSERT_THAT(icing.Put(document).status(), ProtoStatusIs(StatusProto::OUT_OF_SPACE)); // Make sure that the document isn't searchable. SearchSpecProto search_spec; search_spec.set_query("foo"); search_spec.set_term_match_type(TERM_MATCH_PREFIX); SearchResultProto search_results = icing.Search(search_spec, ScoringSpecProto::default_instance(), ResultSpecProto::default_instance()); ASSERT_THAT(search_results.status(), ProtoIsOk()); ASSERT_THAT(search_results.results(), IsEmpty()); // Make sure that the document isn't retrievable. GetResultProto get_result = icing.Get("namespace", "uri1", GetResultSpecProto::default_instance()); ASSERT_THAT(get_result.status(), ProtoStatusIs(StatusProto::NOT_FOUND)); } } // namespace } // namespace lib } // namespace icing