// Copyright (C) 2019 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. #include "icing/index/index-processor.h" #include #include #include #include #include #include #include #include #include "icing/text_classifier/lib3/utils/base/status.h" #include "gmock/gmock.h" #include "gtest/gtest.h" #include "icing/absl_ports/str_cat.h" #include "icing/absl_ports/str_join.h" #include "icing/document-builder.h" #include "icing/file/filesystem.h" #include "icing/index/hit/doc-hit-info.h" #include "icing/index/index.h" #include "icing/index/iterator/doc-hit-info-iterator-test-util.h" #include "icing/index/iterator/doc-hit-info-iterator.h" #include "icing/index/numeric/integer-index.h" #include "icing/index/numeric/numeric-index.h" #include "icing/index/term-property-id.h" #include "icing/legacy/index/icing-filesystem.h" #include "icing/legacy/index/icing-mock-filesystem.h" #include "icing/portable/platform.h" #include "icing/proto/document.pb.h" #include "icing/proto/schema.pb.h" #include "icing/proto/term.pb.h" #include "icing/schema-builder.h" #include "icing/schema/schema-store.h" #include "icing/schema/schema-util.h" #include "icing/schema/section.h" #include "icing/store/document-id.h" #include "icing/testing/common-matchers.h" #include "icing/testing/fake-clock.h" #include "icing/testing/icu-data-file-helper.h" #include "icing/testing/random-string.h" #include "icing/testing/test-data.h" #include "icing/testing/tmp-directory.h" #include "icing/tokenization/language-segmenter-factory.h" #include "icing/tokenization/language-segmenter.h" #include "icing/transform/normalizer-factory.h" #include "icing/transform/normalizer.h" #include "icing/util/tokenized-document.h" #include "unicode/uloc.h" namespace icing { namespace lib { namespace { constexpr std::string_view kIpsumText = "Lorem ipsum dolor sit amet, consectetur adipiscing elit. Nulla convallis " "scelerisque orci quis hendrerit. Sed augue turpis, sodales eu gravida " "nec, scelerisque nec leo. Maecenas accumsan interdum commodo. Aliquam " "mattis sapien est, sit amet interdum risus dapibus sed. Maecenas leo " "erat, fringilla in nisl a, venenatis gravida metus. Phasellus venenatis, " "orci in aliquet mattis, lectus sapien volutpat arcu, sed hendrerit ligula " "arcu nec mauris. Integer dolor mi, rhoncus eget gravida et, pulvinar et " "nunc. Aliquam ac sollicitudin nisi. Vivamus sit amet urna vestibulum, " "tincidunt eros sed, efficitur nisl. Fusce non neque accumsan, sagittis " "nisi eget, sagittis turpis. Ut pulvinar nibh eu purus feugiat faucibus. " "Donec tellus nulla, tincidunt vel lacus id, bibendum fermentum turpis. " "Nullam ultrices sed nibh vitae aliquet. Ut risus neque, consectetur " "vehicula posuere vitae, convallis eu lorem. Donec semper augue eu nibh " "placerat semper."; // schema types constexpr std::string_view kFakeType = "FakeType"; constexpr std::string_view kNestedType = "NestedType"; // Indexable properties and section Id. Section Id is determined by the // lexicographical order of indexable property path. constexpr std::string_view kExactProperty = "exact"; constexpr std::string_view kIndexableIntegerProperty = "indexableInteger"; constexpr std::string_view kPrefixedProperty = "prefixed"; constexpr std::string_view kRepeatedProperty = "repeated"; constexpr std::string_view kRfc822Property = "rfc822"; constexpr std::string_view kSubProperty = "submessage"; // submessage.nested constexpr std::string_view kNestedProperty = "nested"; // submessage.nested // TODO (b/246964044): remove ifdef guard when url-tokenizer is ready for export // to Android. #ifdef ENABLE_URL_TOKENIZER constexpr std::string_view kUrlExactProperty = "urlExact"; constexpr std::string_view kUrlPrefixedProperty = "urlPrefixed"; #endif // ENABLE_URL_TOKENIZER constexpr std::string_view kVerbatimExactProperty = "verbatimExact"; constexpr std::string_view kVerbatimPrefixedProperty = "verbatimPrefixed"; constexpr SectionId kExactSectionId = 0; constexpr SectionId kIndexableIntegerSectionId = 1; constexpr SectionId kPrefixedSectionId = 2; constexpr SectionId kRepeatedSectionId = 3; constexpr SectionId kRfc822SectionId = 4; constexpr SectionId kNestedSectionId = 5; // submessage.nested #ifdef ENABLE_URL_TOKENIZER constexpr SectionId kUrlExactSectionId = 6; constexpr SectionId kUrlPrefixedSectionId = 7; constexpr SectionId kVerbatimExactSectionId = 8; constexpr SectionId kVerbatimPrefixedSectionId = 9; #else // !ENABLE_URL_TOKENIZER constexpr SectionId kVerbatimExactSectionId = 6; constexpr SectionId kVerbatimPrefixedSectionId = 7; #endif // ENABLE_URL_TOKENIZER // Other non-indexable properties. constexpr std::string_view kUnindexedProperty1 = "unindexed1"; constexpr std::string_view kUnindexedProperty2 = "unindexed2"; constexpr DocumentId kDocumentId0 = 0; constexpr DocumentId kDocumentId1 = 1; using Cardinality = PropertyConfigProto::Cardinality; using DataType = PropertyConfigProto::DataType; using ::testing::ElementsAre; using ::testing::Eq; using ::testing::IsEmpty; using ::testing::IsTrue; using ::testing::SizeIs; using ::testing::Test; #ifdef ENABLE_URL_TOKENIZER constexpr StringIndexingConfig::TokenizerType::Code TOKENIZER_URL = StringIndexingConfig::TokenizerType::URL; #endif // ENABLE_URL_TOKENIZER class IndexProcessorTest : public Test { protected: void SetUp() override { if (!IsCfStringTokenization() && !IsReverseJniTokenization()) { ICING_ASSERT_OK( // File generated via icu_data_file rule in //icing/BUILD. icu_data_file_helper::SetUpICUDataFile( GetTestFilePath("icing/icu.dat"))); } base_dir_ = GetTestTempDir() + "/index_processor_test"; ASSERT_THAT(filesystem_.CreateDirectoryRecursively(base_dir_.c_str()), IsTrue()); index_dir_ = base_dir_ + "/index"; integer_index_dir_ = base_dir_ + "/integer_index"; schema_store_dir_ = base_dir_ + "/schema_store"; Index::Options options(index_dir_, /*index_merge_size=*/1024 * 1024); ICING_ASSERT_OK_AND_ASSIGN( index_, Index::Create(options, &filesystem_, &icing_filesystem_)); ICING_ASSERT_OK_AND_ASSIGN( integer_index_, IntegerIndex::Create(filesystem_, integer_index_dir_)); language_segmenter_factory::SegmenterOptions segmenter_options(ULOC_US); ICING_ASSERT_OK_AND_ASSIGN( lang_segmenter_, language_segmenter_factory::Create(std::move(segmenter_options))); ICING_ASSERT_OK_AND_ASSIGN( normalizer_, normalizer_factory::Create( /*max_term_byte_size=*/std::numeric_limits::max())); ASSERT_TRUE( filesystem_.CreateDirectoryRecursively(schema_store_dir_.c_str())); ICING_ASSERT_OK_AND_ASSIGN( schema_store_, SchemaStore::Create(&filesystem_, schema_store_dir_, &fake_clock_)); SchemaProto schema = SchemaBuilder() .AddType( SchemaTypeConfigBuilder() .SetType(kFakeType) .AddProperty(PropertyConfigBuilder() .SetName(kExactProperty) .SetDataTypeString(TERM_MATCH_EXACT, TOKENIZER_PLAIN) .SetCardinality(CARDINALITY_OPTIONAL)) .AddProperty(PropertyConfigBuilder() .SetName(kPrefixedProperty) .SetDataTypeString(TERM_MATCH_PREFIX, TOKENIZER_PLAIN) .SetCardinality(CARDINALITY_OPTIONAL)) .AddProperty(PropertyConfigBuilder() .SetName(kUnindexedProperty1) .SetDataType(TYPE_STRING) .SetCardinality(CARDINALITY_OPTIONAL)) .AddProperty(PropertyConfigBuilder() .SetName(kUnindexedProperty2) .SetDataType(TYPE_BYTES) .SetCardinality(CARDINALITY_OPTIONAL)) .AddProperty(PropertyConfigBuilder() .SetName(kRepeatedProperty) .SetDataTypeString(TERM_MATCH_PREFIX, TOKENIZER_PLAIN) .SetCardinality(CARDINALITY_REPEATED)) .AddProperty(PropertyConfigBuilder() .SetName(kVerbatimExactProperty) .SetDataTypeString(TERM_MATCH_EXACT, TOKENIZER_VERBATIM) .SetCardinality(CARDINALITY_REPEATED)) .AddProperty(PropertyConfigBuilder() .SetName(kVerbatimPrefixedProperty) .SetDataTypeString(TERM_MATCH_PREFIX, TOKENIZER_VERBATIM) .SetCardinality(CARDINALITY_REPEATED)) .AddProperty(PropertyConfigBuilder() .SetName(kRfc822Property) .SetDataTypeString(TERM_MATCH_PREFIX, TOKENIZER_RFC822) .SetCardinality(CARDINALITY_REPEATED)) #ifdef ENABLE_URL_TOKENIZER .AddProperty( PropertyConfigBuilder() .SetName(kUrlExactProperty) .SetDataTypeString(TERM_MATCH_EXACT, TOKENIZER_URL) .SetCardinality(CARDINALITY_REPEATED)) .AddProperty( PropertyConfigBuilder() .SetName(kUrlPrefixedProperty) .SetDataTypeString(TERM_MATCH_PREFIX, TOKENIZER_URL) .SetCardinality(CARDINALITY_REPEATED)) #endif // ENABLE_URL_TOKENIZER .AddProperty(PropertyConfigBuilder() .SetName(kIndexableIntegerProperty) .SetDataTypeInt64(NUMERIC_MATCH_RANGE) .SetCardinality(CARDINALITY_REPEATED)) .AddProperty( PropertyConfigBuilder() .SetName(kSubProperty) .SetDataTypeDocument( kNestedType, /*index_nested_properties=*/true) .SetCardinality(CARDINALITY_OPTIONAL))) .AddType( SchemaTypeConfigBuilder() .SetType(kNestedType) .AddProperty(PropertyConfigBuilder() .SetName(kNestedProperty) .SetDataTypeString(TERM_MATCH_PREFIX, TOKENIZER_PLAIN) .SetCardinality(CARDINALITY_OPTIONAL))) .Build(); ICING_ASSERT_OK(schema_store_->SetSchema(schema)); ICING_ASSERT_OK_AND_ASSIGN( index_processor_, IndexProcessor::Create(normalizer_.get(), index_.get(), integer_index_.get(), &fake_clock_)); mock_icing_filesystem_ = std::make_unique(); } void TearDown() override { index_processor_.reset(); schema_store_.reset(); normalizer_.reset(); lang_segmenter_.reset(); integer_index_.reset(); index_.reset(); filesystem_.DeleteDirectoryRecursively(base_dir_.c_str()); } std::unique_ptr mock_icing_filesystem_; Filesystem filesystem_; IcingFilesystem icing_filesystem_; FakeClock fake_clock_; std::string base_dir_; std::string index_dir_; std::string integer_index_dir_; std::string schema_store_dir_; std::unique_ptr index_; std::unique_ptr> integer_index_; std::unique_ptr lang_segmenter_; std::unique_ptr normalizer_; std::unique_ptr schema_store_; std::unique_ptr index_processor_; }; std::vector GetHits(std::unique_ptr iterator) { std::vector infos; while (iterator->Advance().ok()) { infos.push_back(iterator->doc_hit_info()); } return infos; } std::vector GetHitsWithTermFrequency( std::unique_ptr iterator) { std::vector infos; while (iterator->Advance().ok()) { std::vector matched_terms_stats; iterator->PopulateMatchedTermsStats(&matched_terms_stats); for (const TermMatchInfo& term_match_info : matched_terms_stats) { infos.push_back(DocHitInfoTermFrequencyPair( iterator->doc_hit_info(), term_match_info.term_frequencies)); } } return infos; } TEST_F(IndexProcessorTest, CreationWithNullPointerShouldFail) { EXPECT_THAT(IndexProcessor::Create(/*normalizer=*/nullptr, index_.get(), integer_index_.get(), &fake_clock_), StatusIs(libtextclassifier3::StatusCode::FAILED_PRECONDITION)); EXPECT_THAT(IndexProcessor::Create(normalizer_.get(), /*index=*/nullptr, integer_index_.get(), &fake_clock_), StatusIs(libtextclassifier3::StatusCode::FAILED_PRECONDITION)); } TEST_F(IndexProcessorTest, NoTermMatchTypeContent) { DocumentProto document = DocumentBuilder() .SetKey("icing", "fake_type/1") .SetSchema(std::string(kFakeType)) .AddStringProperty(std::string(kUnindexedProperty1), "foo bar baz") .AddBytesProperty(std::string(kUnindexedProperty2), "attachment bytes") .Build(); ICING_ASSERT_OK_AND_ASSIGN( TokenizedDocument tokenized_document, TokenizedDocument::Create(schema_store_.get(), lang_segmenter_.get(), document)); EXPECT_THAT(index_processor_->IndexDocument(tokenized_document, kDocumentId0), IsOk()); EXPECT_THAT(index_->last_added_document_id(), Eq(kDocumentId0)); } TEST_F(IndexProcessorTest, NoValidContent) { DocumentProto document = DocumentBuilder() .SetKey("icing", "fake_type/1") .SetSchema(std::string(kFakeType)) .AddStringProperty(std::string(kExactProperty), "?...!") .Build(); ICING_ASSERT_OK_AND_ASSIGN( TokenizedDocument tokenized_document, TokenizedDocument::Create(schema_store_.get(), lang_segmenter_.get(), document)); EXPECT_THAT(index_processor_->IndexDocument(tokenized_document, kDocumentId0), IsOk()); EXPECT_THAT(index_->last_added_document_id(), Eq(kDocumentId0)); } TEST_F(IndexProcessorTest, OneDoc) { DocumentProto document = DocumentBuilder() .SetKey("icing", "fake_type/1") .SetSchema(std::string(kFakeType)) .AddStringProperty(std::string(kExactProperty), "hello world") .Build(); ICING_ASSERT_OK_AND_ASSIGN( TokenizedDocument tokenized_document, TokenizedDocument::Create(schema_store_.get(), lang_segmenter_.get(), document)); EXPECT_THAT(index_processor_->IndexDocument(tokenized_document, kDocumentId0), IsOk()); EXPECT_THAT(index_->last_added_document_id(), Eq(kDocumentId0)); ICING_ASSERT_OK_AND_ASSIGN( std::unique_ptr itr, index_->GetIterator("hello", /*term_start_index=*/0, /*unnormalized_term_length=*/0, kSectionIdMaskAll, TermMatchType::EXACT_ONLY)); std::vector hits = GetHitsWithTermFrequency(std::move(itr)); std::unordered_map expectedMap{ {kExactSectionId, 1}}; EXPECT_THAT(hits, ElementsAre(EqualsDocHitInfoWithTermFrequency( kDocumentId0, expectedMap))); ICING_ASSERT_OK_AND_ASSIGN( itr, index_->GetIterator( "hello", /*term_start_index=*/0, /*unnormalized_term_length=*/0, 1U << kPrefixedSectionId, TermMatchType::EXACT_ONLY)); EXPECT_THAT(GetHits(std::move(itr)), IsEmpty()); } TEST_F(IndexProcessorTest, MultipleDocs) { DocumentProto document = DocumentBuilder() .SetKey("icing", "fake_type/1") .SetSchema(std::string(kFakeType)) .AddStringProperty(std::string(kExactProperty), "hello world") .AddStringProperty(std::string(kPrefixedProperty), "good night moon!") .Build(); ICING_ASSERT_OK_AND_ASSIGN( TokenizedDocument tokenized_document, TokenizedDocument::Create(schema_store_.get(), lang_segmenter_.get(), document)); EXPECT_THAT(index_processor_->IndexDocument(tokenized_document, kDocumentId0), IsOk()); EXPECT_THAT(index_->last_added_document_id(), Eq(kDocumentId0)); std::string coffeeRepeatedString = "coffee"; for (int i = 0; i < Hit::kMaxTermFrequency + 1; i++) { coffeeRepeatedString += " coffee"; } document = DocumentBuilder() .SetKey("icing", "fake_type/2") .SetSchema(std::string(kFakeType)) .AddStringProperty(std::string(kExactProperty), coffeeRepeatedString) .AddStringProperty(std::string(kPrefixedProperty), "mr. world world wide") .Build(); ICING_ASSERT_OK_AND_ASSIGN( tokenized_document, TokenizedDocument::Create(schema_store_.get(), lang_segmenter_.get(), document)); EXPECT_THAT(index_processor_->IndexDocument(tokenized_document, kDocumentId1), IsOk()); EXPECT_THAT(index_->last_added_document_id(), Eq(kDocumentId1)); ICING_ASSERT_OK_AND_ASSIGN( std::unique_ptr itr, index_->GetIterator("world", /*term_start_index=*/0, /*unnormalized_term_length=*/0, kSectionIdMaskAll, TermMatchType::EXACT_ONLY)); std::vector hits = GetHitsWithTermFrequency(std::move(itr)); std::unordered_map expectedMap1{ {kPrefixedSectionId, 2}}; std::unordered_map expectedMap2{ {kExactSectionId, 1}}; EXPECT_THAT( hits, ElementsAre( EqualsDocHitInfoWithTermFrequency(kDocumentId1, expectedMap1), EqualsDocHitInfoWithTermFrequency(kDocumentId0, expectedMap2))); ICING_ASSERT_OK_AND_ASSIGN( itr, index_->GetIterator( "world", /*term_start_index=*/0, /*unnormalized_term_length=*/0, 1U << kPrefixedSectionId, TermMatchType::EXACT_ONLY)); hits = GetHitsWithTermFrequency(std::move(itr)); std::unordered_map expectedMap{ {kPrefixedSectionId, 2}}; EXPECT_THAT(hits, ElementsAre(EqualsDocHitInfoWithTermFrequency( kDocumentId1, expectedMap))); ICING_ASSERT_OK_AND_ASSIGN( itr, index_->GetIterator("coffee", /*term_start_index=*/0, /*unnormalized_term_length=*/0, kSectionIdMaskAll, TermMatchType::EXACT_ONLY)); hits = GetHitsWithTermFrequency(std::move(itr)); expectedMap = {{kExactSectionId, Hit::kMaxTermFrequency}}; EXPECT_THAT(hits, ElementsAre(EqualsDocHitInfoWithTermFrequency( kDocumentId1, expectedMap))); } TEST_F(IndexProcessorTest, DocWithNestedProperty) { DocumentProto document = DocumentBuilder() .SetKey("icing", "fake_type/1") .SetSchema(std::string(kFakeType)) .AddStringProperty(std::string(kExactProperty), "hello world") .AddDocumentProperty( std::string(kSubProperty), DocumentBuilder() .SetKey("icing", "nested_type/1") .SetSchema(std::string(kNestedType)) .AddStringProperty(std::string(kNestedProperty), "rocky raccoon") .Build()) .Build(); ICING_ASSERT_OK_AND_ASSIGN( TokenizedDocument tokenized_document, TokenizedDocument::Create(schema_store_.get(), lang_segmenter_.get(), document)); EXPECT_THAT(index_processor_->IndexDocument(tokenized_document, kDocumentId0), IsOk()); EXPECT_THAT(index_->last_added_document_id(), Eq(kDocumentId0)); ICING_ASSERT_OK_AND_ASSIGN( std::unique_ptr itr, index_->GetIterator("rocky", /*term_start_index=*/0, /*unnormalized_term_length=*/0, kSectionIdMaskAll, TermMatchType::EXACT_ONLY)); EXPECT_THAT(GetHits(std::move(itr)), ElementsAre(EqualsDocHitInfo( kDocumentId0, std::vector{kNestedSectionId}))); } TEST_F(IndexProcessorTest, DocWithRepeatedProperty) { DocumentProto document = DocumentBuilder() .SetKey("icing", "fake_type/1") .SetSchema(std::string(kFakeType)) .AddStringProperty(std::string(kExactProperty), "hello world") .AddStringProperty(std::string(kRepeatedProperty), "rocky", "italian stallion") .Build(); ICING_ASSERT_OK_AND_ASSIGN( TokenizedDocument tokenized_document, TokenizedDocument::Create(schema_store_.get(), lang_segmenter_.get(), document)); EXPECT_THAT(index_processor_->IndexDocument(tokenized_document, kDocumentId0), IsOk()); EXPECT_THAT(index_->last_added_document_id(), Eq(kDocumentId0)); ICING_ASSERT_OK_AND_ASSIGN( std::unique_ptr itr, index_->GetIterator("italian", /*term_start_index=*/0, /*unnormalized_term_length=*/0, kSectionIdMaskAll, TermMatchType::EXACT_ONLY)); EXPECT_THAT(GetHits(std::move(itr)), ElementsAre(EqualsDocHitInfo( kDocumentId0, std::vector{kRepeatedSectionId}))); } // TODO(b/196771754) This test is disabled on Android because it takes too long // to generate all of the unique terms and the test times out. Try storing these // unique terms in a file that the test can read from. #ifndef __ANDROID__ TEST_F(IndexProcessorTest, HitBufferExhaustedTest) { // Testing has shown that adding ~600,000 hits will fill up the hit buffer. std::vector unique_terms_ = GenerateUniqueTerms(200000); std::string content = absl_ports::StrJoin(unique_terms_, " "); DocumentProto document = DocumentBuilder() .SetKey("icing", "fake_type/1") .SetSchema(std::string(kFakeType)) .AddStringProperty(std::string(kExactProperty), content) .AddStringProperty(std::string(kPrefixedProperty), content) .AddStringProperty(std::string(kRepeatedProperty), content) .Build(); ICING_ASSERT_OK_AND_ASSIGN( TokenizedDocument tokenized_document, TokenizedDocument::Create(schema_store_.get(), lang_segmenter_.get(), document)); EXPECT_THAT(index_processor_->IndexDocument(tokenized_document, kDocumentId0), StatusIs(libtextclassifier3::StatusCode::RESOURCE_EXHAUSTED, testing::HasSubstr("Hit buffer is full!"))); EXPECT_THAT(index_->last_added_document_id(), Eq(kDocumentId0)); } TEST_F(IndexProcessorTest, LexiconExhaustedTest) { // Testing has shown that adding ~300,000 terms generated this way will // fill up the lexicon. std::vector unique_terms_ = GenerateUniqueTerms(300000); std::string content = absl_ports::StrJoin(unique_terms_, " "); DocumentProto document = DocumentBuilder() .SetKey("icing", "fake_type/1") .SetSchema(std::string(kFakeType)) .AddStringProperty(std::string(kExactProperty), content) .Build(); ICING_ASSERT_OK_AND_ASSIGN( TokenizedDocument tokenized_document, TokenizedDocument::Create(schema_store_.get(), lang_segmenter_.get(), document)); EXPECT_THAT(index_processor_->IndexDocument(tokenized_document, kDocumentId0), StatusIs(libtextclassifier3::StatusCode::RESOURCE_EXHAUSTED)); EXPECT_THAT(index_->last_added_document_id(), Eq(kDocumentId0)); } #endif // __ANDROID__ TEST_F(IndexProcessorTest, TooLongTokens) { // Only allow the tokens of length four, truncating "hello", "world" and // "night". ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr normalizer, normalizer_factory::Create( /*max_term_byte_size=*/4)); ICING_ASSERT_OK_AND_ASSIGN( index_processor_, IndexProcessor::Create(normalizer.get(), index_.get(), integer_index_.get(), &fake_clock_)); DocumentProto document = DocumentBuilder() .SetKey("icing", "fake_type/1") .SetSchema(std::string(kFakeType)) .AddStringProperty(std::string(kExactProperty), "hello world") .AddStringProperty(std::string(kPrefixedProperty), "good night moon!") .Build(); ICING_ASSERT_OK_AND_ASSIGN( TokenizedDocument tokenized_document, TokenizedDocument::Create(schema_store_.get(), lang_segmenter_.get(), document)); EXPECT_THAT(index_processor_->IndexDocument(tokenized_document, kDocumentId0), IsOk()); EXPECT_THAT(index_->last_added_document_id(), Eq(kDocumentId0)); // "good" should have been indexed normally. ICING_ASSERT_OK_AND_ASSIGN( std::unique_ptr itr, index_->GetIterator("good", /*term_start_index=*/0, /*unnormalized_term_length=*/0, kSectionIdMaskAll, TermMatchType::EXACT_ONLY)); EXPECT_THAT(GetHits(std::move(itr)), ElementsAre(EqualsDocHitInfo( kDocumentId0, std::vector{kPrefixedSectionId}))); // "night" should not have been. ICING_ASSERT_OK_AND_ASSIGN( itr, index_->GetIterator("night", /*term_start_index=*/0, /*unnormalized_term_length=*/0, kSectionIdMaskAll, TermMatchType::EXACT_ONLY)); EXPECT_THAT(GetHits(std::move(itr)), IsEmpty()); // "night" should have been truncated to "nigh". ICING_ASSERT_OK_AND_ASSIGN( itr, index_->GetIterator("nigh", /*term_start_index=*/0, /*unnormalized_term_length=*/0, kSectionIdMaskAll, TermMatchType::EXACT_ONLY)); EXPECT_THAT(GetHits(std::move(itr)), ElementsAre(EqualsDocHitInfo( kDocumentId0, std::vector{kPrefixedSectionId}))); } TEST_F(IndexProcessorTest, NonPrefixedContentPrefixQuery) { DocumentProto document = DocumentBuilder() .SetKey("icing", "fake_type/1") .SetSchema(std::string(kFakeType)) .AddStringProperty(std::string(kExactProperty), "best rocky movies") .Build(); ICING_ASSERT_OK_AND_ASSIGN( TokenizedDocument tokenized_document, TokenizedDocument::Create(schema_store_.get(), lang_segmenter_.get(), document)); EXPECT_THAT(index_processor_->IndexDocument(tokenized_document, kDocumentId0), IsOk()); EXPECT_THAT(index_->last_added_document_id(), Eq(kDocumentId0)); document = DocumentBuilder() .SetKey("icing", "fake_type/2") .SetSchema(std::string(kFakeType)) .AddStringProperty(std::string(kPrefixedProperty), "rocky raccoon") .Build(); ICING_ASSERT_OK_AND_ASSIGN( tokenized_document, TokenizedDocument::Create(schema_store_.get(), lang_segmenter_.get(), document)); EXPECT_THAT(index_processor_->IndexDocument(tokenized_document, kDocumentId1), IsOk()); EXPECT_THAT(index_->last_added_document_id(), Eq(kDocumentId1)); // Only document_id 1 should surface in a prefix query for "Rock" ICING_ASSERT_OK_AND_ASSIGN( std::unique_ptr itr, index_->GetIterator("rock", /*term_start_index=*/0, /*unnormalized_term_length=*/0, kSectionIdMaskAll, TermMatchType::PREFIX)); EXPECT_THAT(GetHits(std::move(itr)), ElementsAre(EqualsDocHitInfo( kDocumentId1, std::vector{kPrefixedSectionId}))); } TEST_F(IndexProcessorTest, TokenNormalization) { DocumentProto document = DocumentBuilder() .SetKey("icing", "fake_type/1") .SetSchema(std::string(kFakeType)) .AddStringProperty(std::string(kExactProperty), "ALL UPPER CASE") .Build(); ICING_ASSERT_OK_AND_ASSIGN( TokenizedDocument tokenized_document, TokenizedDocument::Create(schema_store_.get(), lang_segmenter_.get(), document)); EXPECT_THAT(index_processor_->IndexDocument(tokenized_document, kDocumentId0), IsOk()); EXPECT_THAT(index_->last_added_document_id(), Eq(kDocumentId0)); document = DocumentBuilder() .SetKey("icing", "fake_type/2") .SetSchema(std::string(kFakeType)) .AddStringProperty(std::string(kExactProperty), "all lower case") .Build(); ICING_ASSERT_OK_AND_ASSIGN( tokenized_document, TokenizedDocument::Create(schema_store_.get(), lang_segmenter_.get(), document)); EXPECT_THAT(index_processor_->IndexDocument(tokenized_document, kDocumentId1), IsOk()); EXPECT_THAT(index_->last_added_document_id(), Eq(kDocumentId1)); ICING_ASSERT_OK_AND_ASSIGN( std::unique_ptr itr, index_->GetIterator("case", /*term_start_index=*/0, /*unnormalized_term_length=*/0, kSectionIdMaskAll, TermMatchType::EXACT_ONLY)); EXPECT_THAT( GetHits(std::move(itr)), ElementsAre(EqualsDocHitInfo(kDocumentId1, std::vector{kExactSectionId}), EqualsDocHitInfo(kDocumentId0, std::vector{kExactSectionId}))); } TEST_F(IndexProcessorTest, OutOfOrderDocumentIds) { DocumentProto document = DocumentBuilder() .SetKey("icing", "fake_type/1") .SetSchema(std::string(kFakeType)) .AddStringProperty(std::string(kExactProperty), "ALL UPPER CASE") .AddInt64Property(std::string(kIndexableIntegerProperty), 123) .Build(); ICING_ASSERT_OK_AND_ASSIGN( TokenizedDocument tokenized_document, TokenizedDocument::Create(schema_store_.get(), lang_segmenter_.get(), document)); EXPECT_THAT(index_processor_->IndexDocument(tokenized_document, kDocumentId1), IsOk()); EXPECT_THAT(index_->last_added_document_id(), Eq(kDocumentId1)); ICING_ASSERT_OK_AND_ASSIGN(int64_t index_element_size, index_->GetElementsSize()); ICING_ASSERT_OK_AND_ASSIGN(Crc32 integer_index_crc, integer_index_->UpdateChecksums()); // Indexing a document with document_id <= last_added_document_id should cause // a failure. document = DocumentBuilder() .SetKey("icing", "fake_type/2") .SetSchema(std::string(kFakeType)) .AddStringProperty(std::string(kExactProperty), "all lower case") .AddInt64Property(std::string(kIndexableIntegerProperty), 456) .Build(); ICING_ASSERT_OK_AND_ASSIGN( tokenized_document, TokenizedDocument::Create(schema_store_.get(), lang_segmenter_.get(), document)); EXPECT_THAT(index_processor_->IndexDocument(tokenized_document, kDocumentId0), StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT)); // Verify that both index_ and integer_index_ are unchanged. EXPECT_THAT(index_->last_added_document_id(), Eq(kDocumentId1)); EXPECT_THAT(index_->GetElementsSize(), IsOkAndHolds(index_element_size)); EXPECT_THAT(integer_index_->last_added_document_id(), Eq(kDocumentId1)); EXPECT_THAT(integer_index_->UpdateChecksums(), IsOkAndHolds(integer_index_crc)); // As should indexing a document document_id == last_added_document_id. EXPECT_THAT(index_processor_->IndexDocument(tokenized_document, kDocumentId1), StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT)); // Verify that both index_ and integer_index_ are unchanged. EXPECT_THAT(index_->last_added_document_id(), Eq(kDocumentId1)); EXPECT_THAT(index_->GetElementsSize(), IsOkAndHolds(index_element_size)); EXPECT_THAT(integer_index_->last_added_document_id(), Eq(kDocumentId1)); EXPECT_THAT(integer_index_->UpdateChecksums(), IsOkAndHolds(integer_index_crc)); } TEST_F(IndexProcessorTest, OutOfOrderDocumentIdsInRecoveryMode) { ICING_ASSERT_OK_AND_ASSIGN( std::unique_ptr index_processor, IndexProcessor::Create(normalizer_.get(), index_.get(), integer_index_.get(), &fake_clock_, /*recovery_mode=*/true)); DocumentProto document = DocumentBuilder() .SetKey("icing", "fake_type/1") .SetSchema(std::string(kFakeType)) .AddStringProperty(std::string(kExactProperty), "ALL UPPER CASE") .AddInt64Property(std::string(kIndexableIntegerProperty), 123) .Build(); ICING_ASSERT_OK_AND_ASSIGN( TokenizedDocument tokenized_document, TokenizedDocument::Create(schema_store_.get(), lang_segmenter_.get(), document)); EXPECT_THAT(index_processor->IndexDocument(tokenized_document, kDocumentId1), IsOk()); EXPECT_THAT(index_->last_added_document_id(), Eq(kDocumentId1)); ICING_ASSERT_OK_AND_ASSIGN(int64_t index_element_size, index_->GetElementsSize()); ICING_ASSERT_OK_AND_ASSIGN(Crc32 integer_index_crc, integer_index_->UpdateChecksums()); // Indexing a document with document_id <= last_added_document_id in recovery // mode should not get any error, but IndexProcessor should still ignore it // and index data should remain unchanged. document = DocumentBuilder() .SetKey("icing", "fake_type/2") .SetSchema(std::string(kFakeType)) .AddStringProperty(std::string(kExactProperty), "all lower case") .AddInt64Property(std::string(kIndexableIntegerProperty), 456) .Build(); ICING_ASSERT_OK_AND_ASSIGN( tokenized_document, TokenizedDocument::Create(schema_store_.get(), lang_segmenter_.get(), document)); EXPECT_THAT(index_processor->IndexDocument(tokenized_document, kDocumentId0), IsOk()); // Verify that both index_ and integer_index_ are unchanged. EXPECT_THAT(index_->last_added_document_id(), Eq(kDocumentId1)); EXPECT_THAT(index_->GetElementsSize(), IsOkAndHolds(index_element_size)); EXPECT_THAT(integer_index_->last_added_document_id(), Eq(kDocumentId1)); EXPECT_THAT(integer_index_->UpdateChecksums(), IsOkAndHolds(integer_index_crc)); // As should indexing a document document_id == last_added_document_id. EXPECT_THAT(index_processor->IndexDocument(tokenized_document, kDocumentId1), IsOk()); // Verify that both index_ and integer_index_ are unchanged. EXPECT_THAT(index_->last_added_document_id(), Eq(kDocumentId1)); EXPECT_THAT(index_->GetElementsSize(), IsOkAndHolds(index_element_size)); EXPECT_THAT(integer_index_->last_added_document_id(), Eq(kDocumentId1)); EXPECT_THAT(integer_index_->UpdateChecksums(), IsOkAndHolds(integer_index_crc)); } TEST_F(IndexProcessorTest, NonAsciiIndexing) { language_segmenter_factory::SegmenterOptions segmenter_options( ULOC_SIMPLIFIED_CHINESE); ICING_ASSERT_OK_AND_ASSIGN( lang_segmenter_, language_segmenter_factory::Create(std::move(segmenter_options))); DocumentProto document = DocumentBuilder() .SetKey("icing", "fake_type/1") .SetSchema(std::string(kFakeType)) .AddStringProperty(std::string(kExactProperty), "你好,世界!你好:世界。“你好”世界?") .Build(); ICING_ASSERT_OK_AND_ASSIGN( TokenizedDocument tokenized_document, TokenizedDocument::Create(schema_store_.get(), lang_segmenter_.get(), document)); EXPECT_THAT(index_processor_->IndexDocument(tokenized_document, kDocumentId0), IsOk()); EXPECT_THAT(index_->last_added_document_id(), Eq(kDocumentId0)); ICING_ASSERT_OK_AND_ASSIGN( std::unique_ptr itr, index_->GetIterator("你好", /*term_start_index=*/0, /*unnormalized_term_length=*/0, kSectionIdMaskAll, TermMatchType::EXACT_ONLY)); EXPECT_THAT(GetHits(std::move(itr)), ElementsAre(EqualsDocHitInfo( kDocumentId0, std::vector{kExactSectionId}))); } TEST_F(IndexProcessorTest, LexiconFullIndexesSmallerTokensReturnsResourceExhausted) { // This is the maximum token length that an empty lexicon constructed for a // lite index with merge size of 1MiB can support. constexpr int kMaxTokenLength = 16777217; // Create a string "ppppppp..." with a length that is too large to fit into // the lexicon. std::string enormous_string(kMaxTokenLength + 1, 'p'); DocumentProto document_one = DocumentBuilder() .SetKey("icing", "fake_type/1") .SetSchema(std::string(kFakeType)) .AddStringProperty(std::string(kExactProperty), absl_ports::StrCat(enormous_string, " foo")) .AddStringProperty(std::string(kPrefixedProperty), "bar baz") .Build(); ICING_ASSERT_OK_AND_ASSIGN( TokenizedDocument tokenized_document, TokenizedDocument::Create(schema_store_.get(), lang_segmenter_.get(), document_one)); EXPECT_THAT(index_processor_->IndexDocument(tokenized_document, kDocumentId0), StatusIs(libtextclassifier3::StatusCode::RESOURCE_EXHAUSTED)); EXPECT_THAT(index_->last_added_document_id(), Eq(kDocumentId0)); } TEST_F(IndexProcessorTest, IndexingDocAutomaticMerge) { // Create the index with a smaller index_merge_size - merging every time we // add 101 documents. This will result in a small LiteIndex, which will be // easier to fill up. The LiteIndex itself will have a size larger than the // index_merge_size because it adds extra buffer to ensure that it always has // room to fit whatever document will trigger the merge. DocumentProto document = DocumentBuilder() .SetKey("icing", "fake_type/1") .SetSchema(std::string(kFakeType)) .AddStringProperty(std::string(kExactProperty), kIpsumText) .Build(); ICING_ASSERT_OK_AND_ASSIGN( TokenizedDocument tokenized_document, TokenizedDocument::Create(schema_store_.get(), lang_segmenter_.get(), document)); Index::Options options(index_dir_, /*index_merge_size=*/document.ByteSizeLong() * 100); ICING_ASSERT_OK_AND_ASSIGN( index_, Index::Create(options, &filesystem_, &icing_filesystem_)); ICING_ASSERT_OK_AND_ASSIGN( index_processor_, IndexProcessor::Create(normalizer_.get(), index_.get(), integer_index_.get(), &fake_clock_)); DocumentId doc_id = 0; // Have determined experimentally that indexing 3373 documents with this text // will cause the LiteIndex to fill up. Further indexing will fail unless the // index processor properly merges the LiteIndex into the MainIndex and // empties the LiteIndex. constexpr int kNumDocsLiteIndexExhaustion = 3373; for (; doc_id < kNumDocsLiteIndexExhaustion; ++doc_id) { EXPECT_THAT(index_processor_->IndexDocument(tokenized_document, doc_id), IsOk()); EXPECT_THAT(index_->last_added_document_id(), Eq(doc_id)); } EXPECT_THAT(index_processor_->IndexDocument(tokenized_document, doc_id), IsOk()); EXPECT_THAT(index_->last_added_document_id(), Eq(doc_id)); } TEST_F(IndexProcessorTest, IndexingDocMergeFailureResets) { // 1. Setup a mock filesystem to fail to grow the main index. auto open_write_lambda = [this](const char* filename) { std::string main_lexicon_suffix = "/main-lexicon.prop." + std::to_string(GetHasHitsInPrefixSectionPropertyId()); std::string filename_string(filename); if (filename_string.length() >= main_lexicon_suffix.length() && filename_string.substr( filename_string.length() - main_lexicon_suffix.length(), main_lexicon_suffix.length()) == main_lexicon_suffix) { return -1; } return this->filesystem_.OpenForWrite(filename); }; ON_CALL(*mock_icing_filesystem_, OpenForWrite) .WillByDefault(open_write_lambda); DocumentProto document = DocumentBuilder() .SetKey("icing", "fake_type/1") .SetSchema(std::string(kFakeType)) .AddStringProperty(std::string(kPrefixedProperty), kIpsumText) .Build(); ICING_ASSERT_OK_AND_ASSIGN( TokenizedDocument tokenized_document, TokenizedDocument::Create(schema_store_.get(), lang_segmenter_.get(), document)); // 2. Recreate the index with the mock filesystem and a merge size that will // only allow one document to be added before requiring a merge. Index::Options options(index_dir_, /*index_merge_size=*/document.ByteSizeLong()); ICING_ASSERT_OK_AND_ASSIGN( index_, Index::Create(options, &filesystem_, mock_icing_filesystem_.get())); ICING_ASSERT_OK_AND_ASSIGN( index_processor_, IndexProcessor::Create(normalizer_.get(), index_.get(), integer_index_.get(), &fake_clock_)); // 3. Index one document. This should fit in the LiteIndex without requiring a // merge. DocumentId doc_id = 0; EXPECT_THAT(index_processor_->IndexDocument(tokenized_document, doc_id), IsOk()); EXPECT_THAT(index_->last_added_document_id(), Eq(doc_id)); // 4. Add one more document to trigger a merge, which should fail and result // in a Reset. ++doc_id; EXPECT_THAT(index_processor_->IndexDocument(tokenized_document, doc_id), StatusIs(libtextclassifier3::StatusCode::DATA_LOSS)); EXPECT_THAT(index_->last_added_document_id(), Eq(kInvalidDocumentId)); // 5. Indexing a new document should succeed. EXPECT_THAT(index_processor_->IndexDocument(tokenized_document, doc_id), IsOk()); EXPECT_THAT(index_->last_added_document_id(), Eq(doc_id)); } TEST_F(IndexProcessorTest, ExactVerbatimProperty) { DocumentProto document = DocumentBuilder() .SetKey("icing", "fake_type/1") .SetSchema(std::string(kFakeType)) .AddStringProperty(std::string(kVerbatimExactProperty), "Hello, world!") .Build(); ICING_ASSERT_OK_AND_ASSIGN( TokenizedDocument tokenized_document, TokenizedDocument::Create(schema_store_.get(), lang_segmenter_.get(), document)); EXPECT_THAT(tokenized_document.num_string_tokens(), Eq(1)); EXPECT_THAT(index_processor_->IndexDocument(tokenized_document, kDocumentId0), IsOk()); EXPECT_THAT(index_->last_added_document_id(), Eq(kDocumentId0)); ICING_ASSERT_OK_AND_ASSIGN( std::unique_ptr itr, index_->GetIterator("Hello, world!", /*term_start_index=*/0, /*unnormalized_term_length=*/0, kSectionIdMaskAll, TermMatchType::EXACT_ONLY)); std::vector hits = GetHitsWithTermFrequency(std::move(itr)); std::unordered_map expectedMap{ {kVerbatimExactSectionId, 1}}; EXPECT_THAT(hits, ElementsAre(EqualsDocHitInfoWithTermFrequency( kDocumentId0, expectedMap))); } TEST_F(IndexProcessorTest, PrefixVerbatimProperty) { DocumentProto document = DocumentBuilder() .SetKey("icing", "fake_type/1") .SetSchema(std::string(kFakeType)) .AddStringProperty(std::string(kVerbatimPrefixedProperty), "Hello, world!") .Build(); ICING_ASSERT_OK_AND_ASSIGN( TokenizedDocument tokenized_document, TokenizedDocument::Create(schema_store_.get(), lang_segmenter_.get(), document)); EXPECT_THAT(tokenized_document.num_string_tokens(), Eq(1)); EXPECT_THAT(index_processor_->IndexDocument(tokenized_document, kDocumentId0), IsOk()); EXPECT_THAT(index_->last_added_document_id(), Eq(kDocumentId0)); // We expect to match the document we indexed as "Hello, w" is a prefix // of "Hello, world!" ICING_ASSERT_OK_AND_ASSIGN( std::unique_ptr itr, index_->GetIterator("Hello, w", /*term_start_index=*/0, /*unnormalized_term_length=*/0, kSectionIdMaskAll, TermMatchType::PREFIX)); std::vector hits = GetHitsWithTermFrequency(std::move(itr)); std::unordered_map expectedMap{ {kVerbatimPrefixedSectionId, 1}}; EXPECT_THAT(hits, ElementsAre(EqualsDocHitInfoWithTermFrequency( kDocumentId0, expectedMap))); } TEST_F(IndexProcessorTest, VerbatimPropertyDoesntMatchSubToken) { DocumentProto document = DocumentBuilder() .SetKey("icing", "fake_type/1") .SetSchema(std::string(kFakeType)) .AddStringProperty(std::string(kVerbatimPrefixedProperty), "Hello, world!") .Build(); ICING_ASSERT_OK_AND_ASSIGN( TokenizedDocument tokenized_document, TokenizedDocument::Create(schema_store_.get(), lang_segmenter_.get(), document)); EXPECT_THAT(tokenized_document.num_string_tokens(), Eq(1)); EXPECT_THAT(index_processor_->IndexDocument(tokenized_document, kDocumentId0), IsOk()); EXPECT_THAT(index_->last_added_document_id(), Eq(kDocumentId0)); ICING_ASSERT_OK_AND_ASSIGN( std::unique_ptr itr, index_->GetIterator("world", /*term_start_index=*/0, /*unnormalized_term_length=*/0, kSectionIdMaskAll, TermMatchType::PREFIX)); std::vector hits = GetHits(std::move(itr)); // We should not have hits for term "world" as the index processor should // create a sole token "Hello, world! for the document. EXPECT_THAT(hits, IsEmpty()); } // Some phrases that should match exactly to RFC822 tokens. We normalize the // tokens, so the case of the string property shouldn't matter. TEST_F(IndexProcessorTest, Rfc822PropertyExact) { DocumentProto document = DocumentBuilder() .SetKey("icing", "fake_type/1") .SetSchema(std::string(kFakeType)) .AddStringProperty(std::string(kRfc822Property), "") .Build(); ICING_ASSERT_OK_AND_ASSIGN( TokenizedDocument tokenized_document, TokenizedDocument::Create(schema_store_.get(), lang_segmenter_.get(), document)); EXPECT_THAT(tokenized_document.num_string_tokens(), Eq(7)); EXPECT_THAT(index_processor_->IndexDocument(tokenized_document, kDocumentId0), IsOk()); EXPECT_THAT(index_->last_added_document_id(), Eq(kDocumentId0)); std::unordered_map expected_map{ {kRfc822SectionId, 2}}; ICING_ASSERT_OK_AND_ASSIGN( std::unique_ptr itr, index_->GetIterator("alexsav", /*term_start_index=*/0, /*unnormalized_term_length=*/0, kSectionIdMaskAll, TermMatchType::EXACT_ONLY)); std::vector hits = GetHitsWithTermFrequency(std::move(itr)); EXPECT_THAT(hits, ElementsAre(EqualsDocHitInfoWithTermFrequency( kDocumentId0, expected_map))); expected_map = {{kRfc822SectionId, 1}}; ICING_ASSERT_OK_AND_ASSIGN( itr, index_->GetIterator("com", /*term_start_index=*/0, /*unnormalized_term_length=*/0, kSectionIdMaskAll, TermMatchType::EXACT_ONLY)); hits = GetHitsWithTermFrequency(std::move(itr)); EXPECT_THAT(hits, ElementsAre(EqualsDocHitInfoWithTermFrequency( kDocumentId0, expected_map))); ICING_ASSERT_OK_AND_ASSIGN( itr, index_->GetIterator("alexsav@google.com", /*term_start_index=*/0, /*unnormalized_term_length=*/0, kSectionIdMaskAll, TermMatchType::EXACT_ONLY)); hits = GetHitsWithTermFrequency(std::move(itr)); EXPECT_THAT(hits, ElementsAre(EqualsDocHitInfoWithTermFrequency( kDocumentId0, expected_map))); } TEST_F(IndexProcessorTest, Rfc822PropertyExactShouldNotReturnPrefix) { DocumentProto document = DocumentBuilder() .SetKey("icing", "fake_type/1") .SetSchema(std::string(kFakeType)) .AddStringProperty(std::string(kRfc822Property), "") .Build(); ICING_ASSERT_OK_AND_ASSIGN( TokenizedDocument tokenized_document, TokenizedDocument::Create(schema_store_.get(), lang_segmenter_.get(), document)); EXPECT_THAT(tokenized_document.num_string_tokens(), Eq(7)); EXPECT_THAT(index_processor_->IndexDocument(tokenized_document, kDocumentId0), IsOk()); EXPECT_THAT(index_->last_added_document_id(), Eq(kDocumentId0)); std::unordered_map expected_map{ {kRfc822SectionId, 2}}; ICING_ASSERT_OK_AND_ASSIGN( std::unique_ptr itr, index_->GetIterator("alexsa", /*term_start_index=*/0, /*unnormalized_term_length=*/0, kSectionIdMaskAll, TermMatchType::EXACT_ONLY)); std::vector hits = GetHits(std::move(itr)); EXPECT_THAT(hits, IsEmpty()); } // Some prefixes of generated RFC822 tokens. #ifdef ENABLE_RFC822_PROPERTY_PREFIX_TEST // ENABLE_RFC822_PROPERTY_PREFIX_TEST won't be defined, so this test will not be // compiled. // TODO(b/250648165): Remove #ifdef to enable this test after fixing the // indeterministic behavior of prefix query term frequency in // lite index. // TEST_F(IndexProcessorTest, Rfc822PropertyPrefix) { DocumentProto document = DocumentBuilder() .SetKey("icing", "fake_type/1") .SetSchema(std::string(kFakeType)) .AddStringProperty(std::string(kRfc822Property), "") .Build(); ICING_ASSERT_OK_AND_ASSIGN( TokenizedDocument tokenized_document, TokenizedDocument::Create(schema_store_.get(), lang_segmenter_.get(), document)); EXPECT_THAT(tokenized_document.num_string_tokens(), Eq(7)); EXPECT_THAT(index_processor_->IndexDocument(tokenized_document, kDocumentId0), IsOk()); EXPECT_THAT(index_->last_added_document_id(), Eq(kDocumentId0)); std::unordered_map expected_map{ {kRfc822SectionId, 1}}; ICING_ASSERT_OK_AND_ASSIGN( std::unique_ptr itr, index_->GetIterator("alexsav@", /*term_start_index=*/0, /*unnormalized_term_length=*/0, kSectionIdMaskAll, TermMatchType::PREFIX)); std::vector hits = GetHitsWithTermFrequency(std::move(itr)); EXPECT_THAT(hits, ElementsAre(EqualsDocHitInfoWithTermFrequency( kDocumentId0, expected_map))); ICING_ASSERT_OK_AND_ASSIGN( itr, index_->GetIterator("goog", /*term_start_index=*/0, /*unnormalized_term_length=*/0, kSectionIdMaskAll, TermMatchType::PREFIX)); hits = GetHitsWithTermFrequency(std::move(itr)); EXPECT_THAT(hits, ElementsAre(EqualsDocHitInfoWithTermFrequency( kDocumentId0, expected_map))); ICING_ASSERT_OK_AND_ASSIGN( itr, index_->GetIterator("ale", /*term_start_index=*/0, /*unnormalized_term_length=*/0, kSectionIdMaskAll, TermMatchType::PREFIX)); hits = GetHitsWithTermFrequency(std::move(itr)); EXPECT_THAT(hits, ElementsAre(EqualsDocHitInfoWithTermFrequency( kDocumentId0, expected_map))); } #endif // ENABLE_RFC822_PROPERTY_PREFIX_TEST TEST_F(IndexProcessorTest, Rfc822PropertyNoMatch) { DocumentProto document = DocumentBuilder() .SetKey("icing", "fake_type/1") .SetSchema(std::string(kFakeType)) .AddStringProperty(std::string(kRfc822Property), "") .Build(); ICING_ASSERT_OK_AND_ASSIGN( TokenizedDocument tokenized_document, TokenizedDocument::Create(schema_store_.get(), lang_segmenter_.get(), document)); EXPECT_THAT(tokenized_document.num_string_tokens(), Eq(7)); EXPECT_THAT(index_processor_->IndexDocument(tokenized_document, kDocumentId0), IsOk()); EXPECT_THAT(index_->last_added_document_id(), Eq(kDocumentId0)); std::unordered_map expect_map{{}}; ICING_ASSERT_OK_AND_ASSIGN( std::unique_ptr itr, index_->GetIterator("abc.xyz", /*term_start_index=*/0, /*unnormalized_term_length=*/0, kSectionIdMaskAll, TermMatchType::PREFIX)); std::vector hits = GetHits(std::move(itr)); EXPECT_THAT(hits, IsEmpty()); } #ifdef ENABLE_URL_TOKENIZER TEST_F(IndexProcessorTest, ExactUrlProperty) { DocumentProto document = DocumentBuilder() .SetKey("icing", "fake_type/1") .SetSchema(std::string(kFakeType)) .AddStringProperty(std::string(kUrlExactProperty), "http://www.google.com") .Build(); ICING_ASSERT_OK_AND_ASSIGN( TokenizedDocument tokenized_document, TokenizedDocument::Create(schema_store_.get(), lang_segmenter_.get(), document)); EXPECT_THAT(tokenized_document.num_string_tokens(), Eq(7)); EXPECT_THAT(index_processor_->IndexDocument(tokenized_document, kDocumentId0), IsOk()); EXPECT_THAT(index_->last_added_document_id(), Eq(kDocumentId0)); ICING_ASSERT_OK_AND_ASSIGN( std::unique_ptr itr, index_->GetIterator("google", /*term_start_index=*/0, /*unnormalized_term_length=*/0, kSectionIdMaskAll, TermMatchType::EXACT_ONLY)); std::vector hits = GetHitsWithTermFrequency(std::move(itr)); std::unordered_map expected_map{ {kUrlExactSectionId, 1}}; EXPECT_THAT(hits, ElementsAre(EqualsDocHitInfoWithTermFrequency( kDocumentId0, expected_map))); ICING_ASSERT_OK_AND_ASSIGN( itr, index_->GetIterator("http", /*term_start_index=*/0, /*unnormalized_term_length=*/0, kSectionIdMaskAll, TermMatchType::EXACT_ONLY)); hits = GetHitsWithTermFrequency(std::move(itr)); expected_map = {{kUrlExactSectionId, 1}}; EXPECT_THAT(hits, ElementsAre(EqualsDocHitInfoWithTermFrequency( kDocumentId0, expected_map))); ICING_ASSERT_OK_AND_ASSIGN( itr, index_->GetIterator("www.google.com", /*term_start_index=*/0, /*unnormalized_term_length=*/0, kSectionIdMaskAll, TermMatchType::EXACT_ONLY)); hits = GetHitsWithTermFrequency(std::move(itr)); expected_map = {{kUrlExactSectionId, 1}}; EXPECT_THAT(hits, ElementsAre(EqualsDocHitInfoWithTermFrequency( kDocumentId0, expected_map))); ICING_ASSERT_OK_AND_ASSIGN( itr, index_->GetIterator("http://www.google.com", /*term_start_index=*/0, /*unnormalized_term_length=*/0, kSectionIdMaskAll, TermMatchType::EXACT_ONLY)); hits = GetHitsWithTermFrequency(std::move(itr)); expected_map = {{kUrlExactSectionId, 1}}; EXPECT_THAT(hits, ElementsAre(EqualsDocHitInfoWithTermFrequency( kDocumentId0, expected_map))); } TEST_F(IndexProcessorTest, ExactUrlPropertyDoesNotMatchPrefix) { DocumentProto document = DocumentBuilder() .SetKey("icing", "fake_type/1") .SetSchema(std::string(kFakeType)) .AddStringProperty(std::string(kUrlExactProperty), "https://mail.google.com/calendar/render") .Build(); ICING_ASSERT_OK_AND_ASSIGN( TokenizedDocument tokenized_document, TokenizedDocument::Create(schema_store_.get(), lang_segmenter_.get(), document)); EXPECT_THAT(tokenized_document.num_string_tokens(), Eq(8)); EXPECT_THAT(index_processor_->IndexDocument(tokenized_document, kDocumentId0), IsOk()); EXPECT_THAT(index_->last_added_document_id(), Eq(kDocumentId0)); ICING_ASSERT_OK_AND_ASSIGN( std::unique_ptr itr, index_->GetIterator("co", /*term_start_index=*/0, /*unnormalized_term_length=*/0, kSectionIdMaskAll, TermMatchType::EXACT_ONLY)); std::vector hits = GetHitsWithTermFrequency(std::move(itr)); EXPECT_THAT(hits, IsEmpty()); ICING_ASSERT_OK_AND_ASSIGN( itr, index_->GetIterator("mail.go", /*term_start_index=*/0, /*unnormalized_term_length=*/0, kSectionIdMaskAll, TermMatchType::EXACT_ONLY)); hits = GetHitsWithTermFrequency(std::move(itr)); EXPECT_THAT(hits, IsEmpty()); ICING_ASSERT_OK_AND_ASSIGN( itr, index_->GetIterator("mail.google.com", /*term_start_index=*/0, /*unnormalized_term_length=*/0, kSectionIdMaskAll, TermMatchType::EXACT_ONLY)); hits = GetHitsWithTermFrequency(std::move(itr)); EXPECT_THAT(hits, IsEmpty()); } TEST_F(IndexProcessorTest, PrefixUrlProperty) { DocumentProto document = DocumentBuilder() .SetKey("icing", "fake_type/1") .SetSchema(std::string(kFakeType)) .AddStringProperty(std::string(kUrlPrefixedProperty), "http://www.google.com") .Build(); ICING_ASSERT_OK_AND_ASSIGN( TokenizedDocument tokenized_document, TokenizedDocument::Create(schema_store_.get(), lang_segmenter_.get(), document)); EXPECT_THAT(tokenized_document.num_string_tokens(), Eq(7)); EXPECT_THAT(index_processor_->IndexDocument(tokenized_document, kDocumentId0), IsOk()); EXPECT_THAT(index_->last_added_document_id(), Eq(kDocumentId0)); // "goo" is a prefix of "google" and "google.com" ICING_ASSERT_OK_AND_ASSIGN( std::unique_ptr itr, index_->GetIterator("goo", /*term_start_index=*/0, /*unnormalized_term_length=*/0, kSectionIdMaskAll, TermMatchType::PREFIX)); std::vector hits = GetHitsWithTermFrequency(std::move(itr)); std::unordered_map expected_map{ {kUrlPrefixedSectionId, 1}}; EXPECT_THAT(hits, ElementsAre(EqualsDocHitInfoWithTermFrequency( kDocumentId0, expected_map))); // "http" is a prefix of "http" and "http://www.google.com" ICING_ASSERT_OK_AND_ASSIGN( itr, index_->GetIterator("http", /*term_start_index=*/0, /*unnormalized_term_length=*/0, kSectionIdMaskAll, TermMatchType::PREFIX)); hits = GetHitsWithTermFrequency(std::move(itr)); expected_map = {{kUrlPrefixedSectionId, 1}}; EXPECT_THAT(hits, ElementsAre(EqualsDocHitInfoWithTermFrequency( kDocumentId0, expected_map))); // "www.go" is a prefix of "www.google.com" ICING_ASSERT_OK_AND_ASSIGN( itr, index_->GetIterator("www.go", /*term_start_index=*/0, /*unnormalized_term_length=*/0, kSectionIdMaskAll, TermMatchType::PREFIX)); hits = GetHitsWithTermFrequency(std::move(itr)); expected_map = {{kUrlPrefixedSectionId, 1}}; EXPECT_THAT(hits, ElementsAre(EqualsDocHitInfoWithTermFrequency( kDocumentId0, expected_map))); } TEST_F(IndexProcessorTest, PrefixUrlPropertyNoMatch) { DocumentProto document = DocumentBuilder() .SetKey("icing", "fake_type/1") .SetSchema(std::string(kFakeType)) .AddStringProperty(std::string(kUrlPrefixedProperty), "https://mail.google.com/calendar/render") .Build(); ICING_ASSERT_OK_AND_ASSIGN( TokenizedDocument tokenized_document, TokenizedDocument::Create(schema_store_.get(), lang_segmenter_.get(), document)); EXPECT_THAT(tokenized_document.num_string_tokens(), Eq(8)); EXPECT_THAT(index_processor_->IndexDocument(tokenized_document, kDocumentId0), IsOk()); EXPECT_THAT(index_->last_added_document_id(), Eq(kDocumentId0)); // no token starts with "gle", so we should have no hits ICING_ASSERT_OK_AND_ASSIGN( std::unique_ptr itr, index_->GetIterator("gle", /*term_start_index=*/0, /*unnormalized_term_length=*/0, kSectionIdMaskAll, TermMatchType::PREFIX)); std::vector hits = GetHitsWithTermFrequency(std::move(itr)); EXPECT_THAT(hits, IsEmpty()); ICING_ASSERT_OK_AND_ASSIGN( itr, index_->GetIterator("w.goo", /*term_start_index=*/0, /*unnormalized_term_length=*/0, kSectionIdMaskAll, TermMatchType::PREFIX)); hits = GetHitsWithTermFrequency(std::move(itr)); EXPECT_THAT(hits, IsEmpty()); // tokens have separators removed, so no hits here ICING_ASSERT_OK_AND_ASSIGN( itr, index_->GetIterator(".com", /*term_start_index=*/0, /*unnormalized_term_length=*/0, kSectionIdMaskAll, TermMatchType::PREFIX)); hits = GetHitsWithTermFrequency(std::move(itr)); EXPECT_THAT(hits, IsEmpty()); ICING_ASSERT_OK_AND_ASSIGN( itr, index_->GetIterator("calendar/render", /*term_start_index=*/0, /*unnormalized_term_length=*/0, kSectionIdMaskAll, TermMatchType::PREFIX)); hits = GetHitsWithTermFrequency(std::move(itr)); EXPECT_THAT(hits, IsEmpty()); } #endif // ENABLE_URL_TOKENIZER TEST_F(IndexProcessorTest, IndexableIntegerProperty) { DocumentProto document = DocumentBuilder() .SetKey("icing", "fake_type/1") .SetSchema(std::string(kFakeType)) .AddInt64Property(std::string(kIndexableIntegerProperty), 1, 2, 3, 4, 5) .Build(); ICING_ASSERT_OK_AND_ASSIGN( TokenizedDocument tokenized_document, TokenizedDocument::Create(schema_store_.get(), lang_segmenter_.get(), document)); // Expected to have 1 integer section. EXPECT_THAT(tokenized_document.integer_sections(), SizeIs(1)); EXPECT_THAT(index_processor_->IndexDocument(tokenized_document, kDocumentId0), IsOk()); ICING_ASSERT_OK_AND_ASSIGN( std::unique_ptr itr, integer_index_->GetIterator(kIndexableIntegerProperty, /*key_lower=*/1, /*key_upper=*/5)); EXPECT_THAT( GetHits(std::move(itr)), ElementsAre(EqualsDocHitInfo( kDocumentId0, std::vector{kIndexableIntegerSectionId}))); } TEST_F(IndexProcessorTest, IndexableIntegerPropertyNoMatch) { DocumentProto document = DocumentBuilder() .SetKey("icing", "fake_type/1") .SetSchema(std::string(kFakeType)) .AddInt64Property(std::string(kIndexableIntegerProperty), 1, 2, 3, 4, 5) .Build(); ICING_ASSERT_OK_AND_ASSIGN( TokenizedDocument tokenized_document, TokenizedDocument::Create(schema_store_.get(), lang_segmenter_.get(), document)); // Expected to have 1 integer section. EXPECT_THAT(tokenized_document.integer_sections(), SizeIs(1)); EXPECT_THAT(index_processor_->IndexDocument(tokenized_document, kDocumentId0), IsOk()); ICING_ASSERT_OK_AND_ASSIGN( std::unique_ptr itr, integer_index_->GetIterator(kIndexableIntegerProperty, /*key_lower=*/-1, /*key_upper=*/0)); EXPECT_THAT(GetHits(std::move(itr)), IsEmpty()); } } // namespace } // namespace lib } // namespace icing