From 7edd8a42d2ad4b80e95557a7e4c615f97450b968 Mon Sep 17 00:00:00 2001 From: Yogesh Singh Date: Sat, 5 Aug 2023 21:06:22 +0000 Subject: Update Icing from upstream. Descriptions: ======================================================================== Add join test for SetSchemaNewIndexedDocumentPropertyTriggersIndexRestorationAndReturnsOk ======================================================================== Support different sets of properties to search for different schema types. ======================================================================== Do not cache the section mask for a schema type if no property filters apply while processing the type property filters. ======================================================================== Bug: 288310393 Bug: 291019114 Change-Id: Icb867b40a603c0d63e2d96a0e2ec641e424aad7e --- icing/icing-search-engine_schema_test.cc | 134 +++-- icing/icing-search-engine_search_test.cc | 541 +++++++++++++++++++++ .../doc-hit-info-iterator-section-restrict.cc | 179 +++++-- .../doc-hit-info-iterator-section-restrict.h | 52 +- .../doc-hit-info-iterator-section-restrict_test.cc | 4 +- icing/query/query-processor.cc | 7 + icing/query/query-processor_test.cc | 255 ++++++++++ icing/schema/schema-store.cc | 11 + icing/schema/schema-store.h | 9 + proto/icing/proto/search.proto | 12 +- synced_AOSP_CL_number.txt | 2 +- 11 files changed, 1138 insertions(+), 68 deletions(-) diff --git a/icing/icing-search-engine_schema_test.cc b/icing/icing-search-engine_schema_test.cc index 5076d50..0e88c5a 100644 --- a/icing/icing-search-engine_schema_test.cc +++ b/icing/icing-search-engine_schema_test.cc @@ -897,8 +897,17 @@ TEST_F( // - "age": integer type, indexed. Section id = 0 // - "name": string type, indexed. Section id = 1. // - "worksFor.name": string type, (nested) indexed. Section id = 2. + // + // Joinable property id assignment for 'Person': + // - "worksFor.listRef": string type, Qualified Id type joinable. Joinable + // property id = 0. 
SchemaProto schema_one = SchemaBuilder() + .AddType(SchemaTypeConfigBuilder().SetType("List").AddProperty( + PropertyConfigBuilder() + .SetName("title") + .SetDataTypeString(TERM_MATCH_PREFIX, TOKENIZER_PLAIN) + .SetCardinality(CARDINALITY_REQUIRED))) .AddType(SchemaTypeConfigBuilder() .SetType("Person") .AddProperty(PropertyConfigBuilder() @@ -922,38 +931,47 @@ TEST_F( .SetName("name") .SetDataTypeString(TERM_MATCH_PREFIX, TOKENIZER_PLAIN) + .SetCardinality(CARDINALITY_REQUIRED)) + .AddProperty(PropertyConfigBuilder() + .SetName("listRef") + .SetDataTypeJoinableString( + JOINABLE_VALUE_TYPE_QUALIFIED_ID) .SetCardinality(CARDINALITY_REQUIRED))) .Build(); - - SetSchemaResultProto set_schema_result = icing.SetSchema(schema_one); - // Ignore latency numbers. They're covered elsewhere. - set_schema_result.clear_latency_ms(); - SetSchemaResultProto expected_set_schema_result; - expected_set_schema_result.mutable_status()->set_code(StatusProto::OK); - expected_set_schema_result.mutable_new_schema_types()->Add("Organization"); - expected_set_schema_result.mutable_new_schema_types()->Add("Person"); - EXPECT_THAT(set_schema_result, EqualsProto(expected_set_schema_result)); - - DocumentProto personDocument = + ASSERT_THAT(icing.SetSchema(schema_one).status(), ProtoIsOk()); + + DocumentProto list_document = DocumentBuilder() + .SetKey("namespace", "list/1") + .SetSchema("List") + .SetCreationTimestampMs(1000) + .AddStringProperty("title", "title") + .Build(); + DocumentProto person_document = DocumentBuilder() .SetKey("namespace", "person/2") .SetSchema("Person") .SetCreationTimestampMs(1000) .AddStringProperty("name", "John") .AddInt64Property("age", 20) - .AddDocumentProperty("worksFor", - DocumentBuilder() - .SetKey("namespace", "org/1") - .SetSchema("Organization") - .AddStringProperty("name", "Google") - .Build()) + .AddDocumentProperty( + "worksFor", DocumentBuilder() + .SetKey("namespace", "org/1") + .SetSchema("Organization") + .AddStringProperty("name", "Google") + 
.AddStringProperty("listRef", "namespace#list/1") + .Build()) .Build(); - EXPECT_THAT(icing.Put(personDocument).status(), ProtoIsOk()); + EXPECT_THAT(icing.Put(list_document).status(), ProtoIsOk()); + EXPECT_THAT(icing.Put(person_document).status(), ProtoIsOk()); + + ResultSpecProto result_spec = ResultSpecProto::default_instance(); + result_spec.set_max_joined_children_per_parent_to_return( + std::numeric_limits::max()); SearchResultProto expected_search_result_proto; expected_search_result_proto.mutable_status()->set_code(StatusProto::OK); *expected_search_result_proto.mutable_results()->Add()->mutable_document() = - personDocument; + person_document; SearchResultProto empty_result; empty_result.mutable_status()->set_code(StatusProto::OK); @@ -964,8 +982,7 @@ TEST_F( search_spec1.set_term_match_type(TermMatchType::EXACT_ONLY); SearchResultProto actual_results = - icing.Search(search_spec1, GetDefaultScoringSpec(), - ResultSpecProto::default_instance()); + icing.Search(search_spec1, GetDefaultScoringSpec(), result_spec); EXPECT_THAT(actual_results, EqualsSearchResultIgnoreStatsAndScores( expected_search_result_proto)); @@ -976,11 +993,44 @@ TEST_F( SearchSpecProto::SearchType::EXPERIMENTAL_ICING_ADVANCED_QUERY); search_spec2.add_enabled_features(std::string(kNumericSearchFeature)); - actual_results = icing.Search(search_spec2, GetDefaultScoringSpec(), - ResultSpecProto::default_instance()); + actual_results = + icing.Search(search_spec2, GetDefaultScoringSpec(), result_spec); EXPECT_THAT(actual_results, EqualsSearchResultIgnoreStatsAndScores( expected_search_result_proto)); + // Verify join search: join a query for `title:title` (which will get + // list_document) with a child query for `name:John` (which will get + // person_document) based on the child's `worksFor.listRef` field. 
+ SearchSpecProto search_spec_with_join; + search_spec_with_join.set_query("title:title"); + search_spec_with_join.set_term_match_type(TermMatchType::EXACT_ONLY); + JoinSpecProto* join_spec = search_spec_with_join.mutable_join_spec(); + join_spec->set_parent_property_expression( + std::string(JoinProcessor::kQualifiedIdExpr)); + join_spec->set_child_property_expression("worksFor.listRef"); + join_spec->set_aggregation_scoring_strategy( + JoinSpecProto::AggregationScoringStrategy::COUNT); + JoinSpecProto::NestedSpecProto* nested_spec = + join_spec->mutable_nested_spec(); + SearchSpecProto* nested_search_spec = nested_spec->mutable_search_spec(); + nested_search_spec->set_term_match_type(TermMatchType::EXACT_ONLY); + nested_search_spec->set_query("name:John"); + *nested_spec->mutable_scoring_spec() = GetDefaultScoringSpec(); + *nested_spec->mutable_result_spec() = result_spec; + + SearchResultProto expected_join_search_result_proto; + expected_join_search_result_proto.mutable_status()->set_code(StatusProto::OK); + SearchResultProto::ResultProto* result_proto = + expected_join_search_result_proto.mutable_results()->Add(); + *result_proto->mutable_document() = list_document; + *result_proto->mutable_joined_results()->Add()->mutable_document() = + person_document; + + actual_results = + icing.Search(search_spec_with_join, GetDefaultScoringSpec(), result_spec); + EXPECT_THAT(actual_results, EqualsSearchResultIgnoreStatsAndScores( + expected_join_search_result_proto)); + // Change the schema to add another nested document property to 'Person' // // New section id assignment for 'Person': @@ -988,8 +1038,19 @@ TEST_F( // - "almaMater.name", string type, indexed. Section id = 1 // - "name": string type, indexed. Section id = 2 // - "worksFor.name": string type, (nested) indexed. Section id = 3 + // + // New joinable property id assignment for 'Person': + // - "almaMater.listRef": string type, Qualified Id type joinable. Joinable + // property id = 0. 
+ // - "worksFor.listRef": string type, Qualified Id type joinable. Joinable + // property id = 1. SchemaProto schema_two = SchemaBuilder() + .AddType(SchemaTypeConfigBuilder().SetType("List").AddProperty( + PropertyConfigBuilder() + .SetName("title") + .SetDataTypeString(TERM_MATCH_PREFIX, TOKENIZER_PLAIN) + .SetCardinality(CARDINALITY_REQUIRED))) .AddType(SchemaTypeConfigBuilder() .SetType("Person") .AddProperty(PropertyConfigBuilder() @@ -1019,6 +1080,11 @@ TEST_F( .SetName("name") .SetDataTypeString(TERM_MATCH_PREFIX, TOKENIZER_PLAIN) + .SetCardinality(CARDINALITY_REQUIRED)) + .AddProperty(PropertyConfigBuilder() + .SetName("listRef") + .SetDataTypeJoinableString( + JOINABLE_VALUE_TYPE_QUALIFIED_ID) .SetCardinality(CARDINALITY_REQUIRED))) .Build(); @@ -1028,10 +1094,10 @@ TEST_F( // Index restoration should be triggered here because new schema requires more // properties to be indexed. Also new section ids will be reassigned and index // restoration should use new section ids to rebuild. - set_schema_result = icing.SetSchema(schema_two); + SetSchemaResultProto set_schema_result = icing.SetSchema(schema_two); // Ignore latency numbers. They're covered elsewhere. 
set_schema_result.clear_latency_ms(); - expected_set_schema_result = SetSchemaResultProto(); + SetSchemaResultProto expected_set_schema_result = SetSchemaResultProto(); expected_set_schema_result.mutable_status()->set_code(StatusProto::OK); expected_set_schema_result.mutable_index_incompatible_changed_schema_types() ->Add("Person"); @@ -1041,8 +1107,8 @@ TEST_F( // Verify term search: // Searching for "worksFor.name:Google" should still match document - actual_results = icing.Search(search_spec1, GetDefaultScoringSpec(), - ResultSpecProto::default_instance()); + actual_results = + icing.Search(search_spec1, GetDefaultScoringSpec(), result_spec); EXPECT_THAT(actual_results, EqualsSearchResultIgnoreStatsAndScores( expected_search_result_proto)); @@ -1051,16 +1117,22 @@ TEST_F( // rebuild was not triggered and Icing is still searching the old index, where // 'worksFor.name' was indexed at section id 2. search_spec1.set_query("name:Google"); - actual_results = icing.Search(search_spec1, GetDefaultScoringSpec(), - ResultSpecProto::default_instance()); + actual_results = + icing.Search(search_spec1, GetDefaultScoringSpec(), result_spec); EXPECT_THAT(actual_results, EqualsSearchResultIgnoreStatsAndScores(empty_result)); // Verify numeric (integer) search: should still match document - actual_results = icing.Search(search_spec2, GetDefaultScoringSpec(), - ResultSpecProto::default_instance()); + actual_results = + icing.Search(search_spec2, GetDefaultScoringSpec(), result_spec); EXPECT_THAT(actual_results, EqualsSearchResultIgnoreStatsAndScores( expected_search_result_proto)); + + // Verify join search: should still able to join by `worksFor.listRef` + actual_results = + icing.Search(search_spec_with_join, GetDefaultScoringSpec(), result_spec); + EXPECT_THAT(actual_results, EqualsSearchResultIgnoreStatsAndScores( + expected_join_search_result_proto)); } TEST_F(IcingSearchEngineSchemaTest, diff --git a/icing/icing-search-engine_search_test.cc 
b/icing/icing-search-engine_search_test.cc index fd2c939..451c9ce 100644 --- a/icing/icing-search-engine_search_test.cc +++ b/icing/icing-search-engine_search_test.cc @@ -3565,6 +3565,547 @@ TEST_P(IcingSearchEngineSearchTest, SearchWithProjectionMultipleFieldPaths) { EqualsProto(projected_document_one)); } +TEST_P(IcingSearchEngineSearchTest, SearchWithPropertyFilters) { + IcingSearchEngine icing(GetDefaultIcingOptions(), GetTestJniCache()); + ASSERT_THAT(icing.Initialize().status(), ProtoIsOk()); + ASSERT_THAT(icing.SetSchema(CreatePersonAndEmailSchema()).status(), + ProtoIsOk()); + + // 1. Add two email documents + DocumentProto document_one = + DocumentBuilder() + .SetKey("namespace", "uri1") + .SetCreationTimestampMs(1000) + .SetSchema("Email") + .AddDocumentProperty( + "sender", + DocumentBuilder() + .SetKey("namespace", "uri1") + .SetSchema("Person") + .AddStringProperty("name", "Meg Ryan") + .AddStringProperty("emailAddress", "hellogirl@aol.com") + .Build()) + .AddStringProperty("subject", "Hello World!") + .AddStringProperty( + "body", "Oh what a beautiful morning! Oh what a beautiful day!") + .Build(); + ASSERT_THAT(icing.Put(document_one).status(), ProtoIsOk()); + + DocumentProto document_two = + DocumentBuilder() + .SetKey("namespace", "uri2") + .SetCreationTimestampMs(1000) + .SetSchema("Email") + .AddDocumentProperty( + "sender", DocumentBuilder() + .SetKey("namespace", "uri2") + .SetSchema("Person") + .AddStringProperty("name", "Tom Hanks") + .AddStringProperty("emailAddress", "ny152@aol.com") + .Build()) + .AddStringProperty("subject", "Goodnight Moon!") + .AddStringProperty("body", + "Count all the sheep and tell them 'Hello'.") + .Build(); + ASSERT_THAT(icing.Put(document_two).status(), ProtoIsOk()); + + // 2. Issue a query with property filters of sender.name and subject for the + // Email schema type. 
+ auto search_spec = std::make_unique(); + search_spec->set_term_match_type(TermMatchType::PREFIX); + search_spec->set_query("hello"); + search_spec->set_search_type(GetParam()); + TypePropertyMask* email_property_filters = + search_spec->add_type_property_filters(); + email_property_filters->set_schema_type("Email"); + email_property_filters->add_paths("sender.name"); + email_property_filters->add_paths("subject"); + + auto result_spec = std::make_unique(); + + auto scoring_spec = std::make_unique(); + *scoring_spec = GetDefaultScoringSpec(); + SearchResultProto results = + icing.Search(*search_spec, *scoring_spec, *result_spec); + EXPECT_THAT(results.status(), ProtoIsOk()); + EXPECT_THAT(results.results(), SizeIs(1)); + + // 3. Verify that only the first document is returned. Although 'hello' is + // present in document_two, it shouldn't be in the result since 'hello' is not + // in the specified property filter. + EXPECT_THAT(results.results(0).document(), + EqualsProto(document_one)); +} + +TEST_P(IcingSearchEngineSearchTest, SearchWithPropertyFiltersOnMultipleSchema) { + IcingSearchEngine icing(GetDefaultIcingOptions(), GetTestJniCache()); + ASSERT_THAT(icing.Initialize().status(), ProtoIsOk()); + // Add Person and Organization schema with a property 'name' in both. 
+ SchemaProto schema = SchemaBuilder() + .AddType(SchemaTypeConfigBuilder() + .SetType("Person") + .AddProperty(PropertyConfigBuilder() + .SetName("name") + .SetDataTypeString(TERM_MATCH_PREFIX, + TOKENIZER_PLAIN) + .SetCardinality(CARDINALITY_OPTIONAL)) + .AddProperty(PropertyConfigBuilder() + .SetName("emailAddress") + .SetDataTypeString(TERM_MATCH_PREFIX, + TOKENIZER_PLAIN) + .SetCardinality(CARDINALITY_OPTIONAL))) + .AddType(SchemaTypeConfigBuilder() + .SetType("Organization") + .AddProperty(PropertyConfigBuilder() + .SetName("name") + .SetDataTypeString(TERM_MATCH_PREFIX, + TOKENIZER_PLAIN) + .SetCardinality(CARDINALITY_OPTIONAL)) + .AddProperty(PropertyConfigBuilder() + .SetName("address") + .SetDataTypeString(TERM_MATCH_PREFIX, + TOKENIZER_PLAIN) + .SetCardinality(CARDINALITY_OPTIONAL))) + .Build(); + ASSERT_THAT(icing.SetSchema(schema).status(), + ProtoIsOk()); + + // 1. Add person document + DocumentProto person_document = + DocumentBuilder() + .SetKey("namespace", "uri1") + .SetCreationTimestampMs(1000) + .SetSchema("Person") + .AddStringProperty("name", "Meg Ryan") + .AddStringProperty("emailAddress", "hellogirl@aol.com") + .Build(); + ASSERT_THAT(icing.Put(person_document).status(), ProtoIsOk()); + + // 1. Add organization document + DocumentProto organization_document = + DocumentBuilder() + .SetKey("namespace", "uri2") + .SetCreationTimestampMs(1000) + .SetSchema("Organization") + .AddStringProperty("name", "Meg Corp") + .AddStringProperty("address", "Universal street") + .Build(); + ASSERT_THAT(icing.Put(organization_document).status(), ProtoIsOk()); + + // 2. Issue a query with property filters. Person schema has name in it's + // property filter but Organization schema doesn't. 
+ auto search_spec = std::make_unique(); + search_spec->set_term_match_type(TermMatchType::PREFIX); + search_spec->set_query("Meg"); + search_spec->set_search_type(GetParam()); + TypePropertyMask* person_property_filters = + search_spec->add_type_property_filters(); + person_property_filters->set_schema_type("Person"); + person_property_filters->add_paths("name"); + TypePropertyMask* organization_property_filters = + search_spec->add_type_property_filters(); + organization_property_filters->set_schema_type("Organization"); + organization_property_filters->add_paths("address"); + + auto result_spec = std::make_unique(); + + auto scoring_spec = std::make_unique(); + *scoring_spec = GetDefaultScoringSpec(); + SearchResultProto results = + icing.Search(*search_spec, *scoring_spec, *result_spec); + EXPECT_THAT(results.status(), ProtoIsOk()); + EXPECT_THAT(results.results(), SizeIs(1)); + + // 3. Verify that only the person document is returned. Although 'Meg' is + // present in organization document, it shouldn't be in the result since + // the name field is not specified in the Organization property filter. + EXPECT_THAT(results.results(0).document(), + EqualsProto(person_document)); +} + +TEST_P(IcingSearchEngineSearchTest, SearchWithWildcardPropertyFilters) { + IcingSearchEngine icing(GetDefaultIcingOptions(), GetTestJniCache()); + ASSERT_THAT(icing.Initialize().status(), ProtoIsOk()); + ASSERT_THAT(icing.SetSchema(CreatePersonAndEmailSchema()).status(), + ProtoIsOk()); + + // 1. Add two email documents + DocumentProto document_one = + DocumentBuilder() + .SetKey("namespace", "uri1") + .SetCreationTimestampMs(1000) + .SetSchema("Email") + .AddDocumentProperty( + "sender", + DocumentBuilder() + .SetKey("namespace", "uri1") + .SetSchema("Person") + .AddStringProperty("name", "Meg Ryan") + .AddStringProperty("emailAddress", "hellogirl@aol.com") + .Build()) + .AddStringProperty("subject", "Hello World!") + .AddStringProperty( + "body", "Oh what a beautiful morning! 
Oh what a beautiful day!") + .Build(); + ASSERT_THAT(icing.Put(document_one).status(), ProtoIsOk()); + + DocumentProto document_two = + DocumentBuilder() + .SetKey("namespace", "uri2") + .SetCreationTimestampMs(1000) + .SetSchema("Email") + .AddDocumentProperty( + "sender", DocumentBuilder() + .SetKey("namespace", "uri2") + .SetSchema("Person") + .AddStringProperty("name", "Tom Hanks") + .AddStringProperty("emailAddress", "ny152@aol.com") + .Build()) + .AddStringProperty("subject", "Goodnight Moon!") + .AddStringProperty("body", + "Count all the sheep and tell them 'Hello'.") + .Build(); + ASSERT_THAT(icing.Put(document_two).status(), ProtoIsOk()); + + // 2. Issue a query with property filters of sender.name and subject for the + // wildcard(*) schema type. + auto search_spec = std::make_unique(); + search_spec->set_term_match_type(TermMatchType::PREFIX); + search_spec->set_query("hello"); + search_spec->set_search_type(GetParam()); + TypePropertyMask* wildcard_property_filters = + search_spec->add_type_property_filters(); + wildcard_property_filters->set_schema_type("*"); + wildcard_property_filters->add_paths("sender.name"); + wildcard_property_filters->add_paths("subject"); + + auto result_spec = std::make_unique(); + + auto scoring_spec = std::make_unique(); + *scoring_spec = GetDefaultScoringSpec(); + SearchResultProto results = + icing.Search(*search_spec, *scoring_spec, *result_spec); + EXPECT_THAT(results.status(), ProtoIsOk()); + EXPECT_THAT(results.results(), SizeIs(1)); + + // 3. Verify that only the first document is returned since the second + // document doesn't contain the word 'hello' in either of fields specified in + // the property filter. This confirms that the property filters for the + // wildcard entry have been applied to the Email schema as well. 
+ EXPECT_THAT(results.results(0).document(), + EqualsProto(document_one)); +} + +TEST_P(IcingSearchEngineSearchTest, SearchWithMixedPropertyFilters) { + IcingSearchEngine icing(GetDefaultIcingOptions(), GetTestJniCache()); + ASSERT_THAT(icing.Initialize().status(), ProtoIsOk()); + ASSERT_THAT(icing.SetSchema(CreatePersonAndEmailSchema()).status(), + ProtoIsOk()); + + // 1. Add two email documents + DocumentProto document_one = + DocumentBuilder() + .SetKey("namespace", "uri1") + .SetCreationTimestampMs(1000) + .SetSchema("Email") + .AddDocumentProperty( + "sender", + DocumentBuilder() + .SetKey("namespace", "uri1") + .SetSchema("Person") + .AddStringProperty("name", "Meg Ryan") + .AddStringProperty("emailAddress", "hellogirl@aol.com") + .Build()) + .AddStringProperty("subject", "Hello World!") + .AddStringProperty( + "body", "Oh what a beautiful morning! Oh what a beautiful day!") + .Build(); + ASSERT_THAT(icing.Put(document_one).status(), ProtoIsOk()); + + DocumentProto document_two = + DocumentBuilder() + .SetKey("namespace", "uri2") + .SetCreationTimestampMs(1000) + .SetSchema("Email") + .AddDocumentProperty( + "sender", DocumentBuilder() + .SetKey("namespace", "uri2") + .SetSchema("Person") + .AddStringProperty("name", "Tom Hanks") + .AddStringProperty("emailAddress", "ny152@aol.com") + .Build()) + .AddStringProperty("subject", "Goodnight Moon!") + .AddStringProperty("body", + "Count all the sheep and tell them 'Hello'.") + .Build(); + ASSERT_THAT(icing.Put(document_two).status(), ProtoIsOk()); + + // 2. Issue a query with property filters of sender.name and subject for the + // wildcard(*) schema type plus property filters of sender.name and body for + // the Email schema type. 
+ auto search_spec = std::make_unique(); + search_spec->set_term_match_type(TermMatchType::PREFIX); + search_spec->set_query("hello"); + search_spec->set_search_type(GetParam()); + TypePropertyMask* wildcard_property_filters = + search_spec->add_type_property_filters(); + wildcard_property_filters->set_schema_type("*"); + wildcard_property_filters->add_paths("sender.name"); + wildcard_property_filters->add_paths("subject"); + TypePropertyMask* email_property_filters = + search_spec->add_type_property_filters(); + email_property_filters->set_schema_type("Email"); + email_property_filters->add_paths("sender.name"); + email_property_filters->add_paths("body"); + + auto result_spec = std::make_unique(); + + auto scoring_spec = std::make_unique(); + *scoring_spec = GetDefaultScoringSpec(); + SearchResultProto results = + icing.Search(*search_spec, *scoring_spec, *result_spec); + EXPECT_THAT(results.status(), ProtoIsOk()); + EXPECT_THAT(results.results(), SizeIs(1)); + + // 3. Verify that only the second document is returned since the first + // document doesn't contain the word 'hello' in either of fields sender.name + // or body. This confirms that the property filters specified for Email schema + // have been applied and the ones specified for wildcard entry have been + // ignored. + EXPECT_THAT(results.results(0).document(), + EqualsProto(document_two)); +} + +TEST_P(IcingSearchEngineSearchTest, SearchWithNonApplicablePropertyFilters) { + IcingSearchEngine icing(GetDefaultIcingOptions(), GetTestJniCache()); + ASSERT_THAT(icing.Initialize().status(), ProtoIsOk()); + ASSERT_THAT(icing.SetSchema(CreatePersonAndEmailSchema()).status(), + ProtoIsOk()); + + // 1. 
Add two email documents + DocumentProto document_one = + DocumentBuilder() + .SetKey("namespace", "uri1") + .SetCreationTimestampMs(1000) + .SetSchema("Email") + .AddDocumentProperty( + "sender", + DocumentBuilder() + .SetKey("namespace", "uri1") + .SetSchema("Person") + .AddStringProperty("name", "Meg Ryan") + .AddStringProperty("emailAddress", "hellogirl@aol.com") + .Build()) + .AddStringProperty("subject", "Hello World!") + .AddStringProperty( + "body", "Oh what a beautiful morning! Oh what a beautiful day!") + .Build(); + ASSERT_THAT(icing.Put(document_one).status(), ProtoIsOk()); + + DocumentProto document_two = + DocumentBuilder() + .SetKey("namespace", "uri2") + .SetCreationTimestampMs(1000) + .SetSchema("Email") + .AddDocumentProperty( + "sender", DocumentBuilder() + .SetKey("namespace", "uri2") + .SetSchema("Person") + .AddStringProperty("name", "Tom Hanks") + .AddStringProperty("emailAddress", "ny152@aol.com") + .Build()) + .AddStringProperty("subject", "Goodnight Moon!") + .AddStringProperty("body", + "Count all the sheep and tell them 'Hello'.") + .Build(); + ASSERT_THAT(icing.Put(document_two).status(), ProtoIsOk()); + + // 2. Issue a query with property filters of sender.name and subject for an + // unknown schema type. + auto search_spec = std::make_unique(); + search_spec->set_term_match_type(TermMatchType::PREFIX); + search_spec->set_query("hello"); + search_spec->set_search_type(GetParam()); + TypePropertyMask* email_property_filters = + search_spec->add_type_property_filters(); + email_property_filters->set_schema_type("unknown"); + email_property_filters->add_paths("sender.name"); + email_property_filters->add_paths("subject"); + + auto result_spec = std::make_unique(); + + auto scoring_spec = std::make_unique(); + *scoring_spec = GetDefaultScoringSpec(); + SearchResultProto results = + icing.Search(*search_spec, *scoring_spec, *result_spec); + EXPECT_THAT(results.status(), ProtoIsOk()); + EXPECT_THAT(results.results(), SizeIs(2)); + + // 3. 
Verify that both the documents are returned since each of them have the + // word 'hello' in at least 1 property. The second document being returned + // confirms that the body field was searched and the specified property + // filters were not applied to the Email schema type. + EXPECT_THAT(results.results(0).document(), + EqualsProto(document_two)); + EXPECT_THAT(results.results(1).document(), + EqualsProto(document_one)); +} + +TEST_P(IcingSearchEngineSearchTest, SearchWithEmptyPropertyFilter) { + IcingSearchEngine icing(GetDefaultIcingOptions(), GetTestJniCache()); + ASSERT_THAT(icing.Initialize().status(), ProtoIsOk()); + ASSERT_THAT(icing.SetSchema(CreateMessageSchema()).status(), + ProtoIsOk()); + + // 1. Add two email documents + DocumentProto document_one = + DocumentBuilder() + .SetKey("namespace", "uri1") + .SetCreationTimestampMs(1000) + .SetSchema("Message") + .AddStringProperty("body", "Hello World!") + .Build(); + ASSERT_THAT(icing.Put(document_one).status(), ProtoIsOk()); + + // 2. Issue a query with empty property filter for Message schema. + auto search_spec = std::make_unique(); + search_spec->set_term_match_type(TermMatchType::PREFIX); + search_spec->set_query("hello"); + search_spec->set_search_type(GetParam()); + TypePropertyMask* message_property_filters = + search_spec->add_type_property_filters(); + message_property_filters->set_schema_type("Message"); + + auto result_spec = std::make_unique(); + + auto scoring_spec = std::make_unique(); + *scoring_spec = GetDefaultScoringSpec(); + SearchResultProto results = + icing.Search(*search_spec, *scoring_spec, *result_spec); + EXPECT_THAT(results.status(), ProtoIsOk()); + + // 3. Verify that no documents are returned. Although 'hello' is present in + // the indexed document, it shouldn't be returned since the Message property + // filter doesn't allow any properties to be searched. 
+ ASSERT_THAT(results.results(), IsEmpty()); +} + +TEST_P(IcingSearchEngineSearchTest, + SearchWithPropertyFilterHavingInvalidProperty) { + IcingSearchEngine icing(GetDefaultIcingOptions(), GetTestJniCache()); + ASSERT_THAT(icing.Initialize().status(), ProtoIsOk()); + ASSERT_THAT(icing.SetSchema(CreateMessageSchema()).status(), + ProtoIsOk()); + + // 1. Add two email documents + DocumentProto document_one = + DocumentBuilder() + .SetKey("namespace", "uri1") + .SetCreationTimestampMs(1000) + .SetSchema("Message") + .AddStringProperty("body", "Hello World!") + .Build(); + ASSERT_THAT(icing.Put(document_one).status(), ProtoIsOk()); + + // 2. Issue a query with property filter having invalid/unknown property for + // Message schema. + auto search_spec = std::make_unique(); + search_spec->set_term_match_type(TermMatchType::PREFIX); + search_spec->set_query("hello"); + search_spec->set_search_type(GetParam()); + TypePropertyMask* message_property_filters = + search_spec->add_type_property_filters(); + message_property_filters->set_schema_type("Message"); + message_property_filters->add_paths("unknown"); + + auto result_spec = std::make_unique(); + + auto scoring_spec = std::make_unique(); + *scoring_spec = GetDefaultScoringSpec(); + SearchResultProto results = + icing.Search(*search_spec, *scoring_spec, *result_spec); + EXPECT_THAT(results.status(), ProtoIsOk()); + + // 3. Verify that no documents are returned. Although 'hello' is present in + // the indexed document, it shouldn't be returned since the Message property + // filter doesn't allow any valid properties to be searched. Any + // invalid/unknown properties specified in the property filters will be + // ignored while searching. 
+ ASSERT_THAT(results.results(), IsEmpty()); +} + +TEST_P(IcingSearchEngineSearchTest, SearchWithPropertyFiltersWithNesting) { + IcingSearchEngine icing(GetDefaultIcingOptions(), GetTestJniCache()); + ASSERT_THAT(icing.Initialize().status(), ProtoIsOk()); + ASSERT_THAT(icing.SetSchema(CreatePersonAndEmailSchema()).status(), + ProtoIsOk()); + + // 1. Add two email documents + DocumentProto document_one = + DocumentBuilder() + .SetKey("namespace", "uri1") + .SetCreationTimestampMs(1000) + .SetSchema("Email") + .AddDocumentProperty( + "sender", + DocumentBuilder() + .SetKey("namespace", "uri1") + .SetSchema("Person") + .AddStringProperty("name", "Meg Ryan") + .AddStringProperty("emailAddress", "hellogirl@aol.com") + .Build()) + .AddStringProperty("subject", "Hello World!") + .AddStringProperty( + "body", "Oh what a beautiful morning! Oh what a beautiful day!") + .Build(); + ASSERT_THAT(icing.Put(document_one).status(), ProtoIsOk()); + + DocumentProto document_two = + DocumentBuilder() + .SetKey("namespace", "uri2") + .SetCreationTimestampMs(1000) + .SetSchema("Email") + .AddDocumentProperty( + "sender", DocumentBuilder() + .SetKey("namespace", "uri2") + .SetSchema("Person") + .AddStringProperty("name", "Tom Hanks") + .AddStringProperty("emailAddress", "ny152@aol.com") + .Build()) + .AddStringProperty("subject", "Goodnight Moon!") + .AddStringProperty("body", + "Count all the sheep and tell them 'Hello'.") + .Build(); + ASSERT_THAT(icing.Put(document_two).status(), ProtoIsOk()); + + // 2. Issue a query with property filter of sender.emailAddress for the Email + // schema type. 
+ auto search_spec = std::make_unique(); + search_spec->set_term_match_type(TermMatchType::PREFIX); + search_spec->set_query("hello"); + search_spec->set_search_type(GetParam()); + TypePropertyMask* email_property_filters = + search_spec->add_type_property_filters(); + email_property_filters->set_schema_type("Email"); + email_property_filters->add_paths("sender.emailAddress"); + + auto result_spec = std::make_unique(); + + auto scoring_spec = std::make_unique(); + *scoring_spec = GetDefaultScoringSpec(); + SearchResultProto results = + icing.Search(*search_spec, *scoring_spec, *result_spec); + EXPECT_THAT(results.status(), ProtoIsOk()); + EXPECT_THAT(results.results(), SizeIs(1)); + + // 3. Verify that only the first document is returned since the second + // document doesn't contain the word 'hello' in sender.emailAddress. The first + // document being returned confirms that the nested property + // sender.emailAddress was actually searched. + EXPECT_THAT(results.results(0).document(), + EqualsProto(document_one)); +} + TEST_P(IcingSearchEngineSearchTest, QueryStatsProtoTest) { auto fake_clock = std::make_unique(); fake_clock->SetTimerElapsedMilliseconds(5); diff --git a/icing/index/iterator/doc-hit-info-iterator-section-restrict.cc b/icing/index/iterator/doc-hit-info-iterator-section-restrict.cc index 227a185..b850a9b 100644 --- a/icing/index/iterator/doc-hit-info-iterator-section-restrict.cc +++ b/icing/index/iterator/doc-hit-info-iterator-section-restrict.cc @@ -42,8 +42,99 @@ DocHitInfoIteratorSectionRestrict::DocHitInfoIteratorSectionRestrict( : delegate_(std::move(delegate)), document_store_(*document_store), schema_store_(*schema_store), - target_sections_(std::move(target_sections)), - current_time_ms_(current_time_ms) {} + current_time_ms_(current_time_ms) { + type_property_filters_[std::string(SchemaStore::kSchemaTypeWildcard)] = + std::move(target_sections); +} + +DocHitInfoIteratorSectionRestrict::DocHitInfoIteratorSectionRestrict( + std::unique_ptr 
delegate, + const DocumentStore* document_store, const SchemaStore* schema_store, + const SearchSpecProto& search_spec, + int64_t current_time_ms) + : delegate_(std::move(delegate)), + document_store_(*document_store), + schema_store_(*schema_store), + current_time_ms_(current_time_ms) { + // TODO(b/294274922): Add support for polymorphism in type property filters. + for (const TypePropertyMask& type_property_mask : + search_spec.type_property_filters()) { + type_property_filters_[type_property_mask.schema_type()] = + std::set(type_property_mask.paths().begin(), + type_property_mask.paths().end()); + } +} + +DocHitInfoIteratorSectionRestrict::DocHitInfoIteratorSectionRestrict( + std::unique_ptr delegate, + const DocumentStore* document_store, const SchemaStore* schema_store, + std::unordered_map> + type_property_filters, + std::unordered_map type_property_masks, + int64_t current_time_ms) + : delegate_(std::move(delegate)), + document_store_(*document_store), + schema_store_(*schema_store), + current_time_ms_(current_time_ms), + type_property_filters_(std::move(type_property_filters)), + type_property_masks_(std::move(type_property_masks)) {} + +SectionIdMask DocHitInfoIteratorSectionRestrict::GenerateSectionMask( + const std::string& schema_type, + const std::set& target_sections) const { + SectionIdMask section_mask = kSectionIdMaskNone; + auto section_metadata_list_or = + schema_store_.GetSectionMetadata(schema_type); + if (!section_metadata_list_or.ok()) { + // The current schema doesn't have section metadata. 
+ return kSectionIdMaskNone; + } + const std::vector* section_metadata_list = + section_metadata_list_or.ValueOrDie(); + for (const SectionMetadata& section_metadata : *section_metadata_list) { + if (target_sections.find(section_metadata.path) != + target_sections.end()) { + section_mask |= UINT64_C(1) << section_metadata.id; + } + } + return section_mask; +} + +SectionIdMask DocHitInfoIteratorSectionRestrict:: + ComputeAndCacheSchemaTypeAllowedSectionsMask( + const std::string& schema_type) { + if (const auto type_property_mask_itr = + type_property_masks_.find(schema_type); + type_property_mask_itr != type_property_masks_.end()) { + return type_property_mask_itr->second; + } + + // Section id mask of schema_type is never calculated before, so + // calculate it here and put it into type_property_masks_. + // - If type property filters of schema_type or wildcard (*) are + // specified, then create a mask according to the filters. + // - Otherwise, create a mask to match all properties. + SectionIdMask new_section_id_mask = kSectionIdMaskAll; + if (const auto itr = type_property_filters_.find(schema_type); + itr != type_property_filters_.end()) { + // Property filters defined for given schema type + new_section_id_mask = GenerateSectionMask( + schema_type, itr->second); + } else if (const auto wildcard_itr = type_property_filters_.find( + std::string(SchemaStore::kSchemaTypeWildcard)); + wildcard_itr != type_property_filters_.end()) { + // Property filters defined for wildcard entry + new_section_id_mask = GenerateSectionMask( + schema_type, wildcard_itr->second); + } else { + // Do not cache the section mask if no property filters apply to this schema + // type to avoid taking up unnecessary space. 
+ return kSectionIdMaskAll; + } + + type_property_masks_[schema_type] = new_section_id_mask; + return new_section_id_mask; +} libtextclassifier3::Status DocHitInfoIteratorSectionRestrict::Advance() { doc_hit_info_ = DocHitInfo(kInvalidDocumentId); @@ -63,32 +154,32 @@ libtextclassifier3::Status DocHitInfoIteratorSectionRestrict::Advance() { // Guaranteed that the DocumentFilterData exists at this point SchemaTypeId schema_type_id = data_optional.value().schema_type_id(); - - // A hit can be in multiple sections at once, need to check which of the - // section ids match the target sections - while (section_id_mask != 0) { - // There was a hit in this section id - SectionId section_id = __builtin_ctzll(section_id_mask); - - auto section_metadata_or = - schema_store_.GetSectionMetadata(schema_type_id, section_id); - - if (section_metadata_or.ok()) { - const SectionMetadata* section_metadata = - section_metadata_or.ValueOrDie(); - - if (target_sections_.find(section_metadata->path) != - target_sections_.end()) { - // The hit was in the target section name, return OK/found - hit_intersect_section_ids_mask_ |= UINT64_C(1) << section_id; - } - } - - // Mark this section as checked - section_id_mask &= ~(UINT64_C(1) << section_id); + auto schema_type_or = schema_store_.GetSchemaType(schema_type_id); + if (!schema_type_or.ok()) { + // Ran into error retrieving schema type, skip + continue; } + const std::string* schema_type = std::move(schema_type_or).ValueOrDie(); + SectionIdMask allowed_sections_mask = + ComputeAndCacheSchemaTypeAllowedSectionsMask(*schema_type); - if (hit_intersect_section_ids_mask_ != kSectionIdMaskNone) { + // A hit can be in multiple sections at once, need to check which of the + // section ids match the sections allowed by type_property_masks_. This can + // be done by doing a bitwise and of the section_id_mask in the doc hit and + // the allowed_sections_mask. 
+ hit_intersect_section_ids_mask_ = section_id_mask & allowed_sections_mask; + + // Return this document if: + // - the sectionIdMask is not empty after applying property filters, or + // - no property filters apply for its schema type (allowed_sections_mask + // == kSectionIdMaskAll). This is needed to ensure that in case of empty + // query (which uses doc-hit-info-iterator-all-document-id), where + // section_id_mask (and hence hit_intersect_section_ids_mask_) is + // kSectionIdMaskNone, doc hits with no property restrictions don't get + // filtered out. Doc hits for schema types for whom property filters are + // specified will still get filtered out. + if (allowed_sections_mask == kSectionIdMaskAll + || hit_intersect_section_ids_mask_ != kSectionIdMaskNone) { doc_hit_info_ = delegate_->doc_hit_info(); doc_hit_info_.set_hit_section_ids_mask(hit_intersect_section_ids_mask_); return libtextclassifier3::Status::OK; @@ -104,16 +195,36 @@ libtextclassifier3::StatusOr DocHitInfoIteratorSectionRestrict::TrimRightMostNode() && { ICING_ASSIGN_OR_RETURN(TrimmedNode trimmed_delegate, std::move(*delegate_).TrimRightMostNode()); + // TrimRightMostNode is only used by suggestion processor to process query + // expression, so an entry for wildcard should always be present in + // type_property_filters_ when code flow reaches here. If the InternalError + // below is returned, that means TrimRightMostNode hasn't been called in the + // right context. 
+ const auto it = type_property_filters_.find("*"); + if (it == type_property_filters_.end()) { + return absl_ports::InternalError( + "A wildcard entry should always be present in type property filters " + "whenever TrimRightMostNode() is called for " + "DocHitInfoIteratorSectionRestrict"); + } + std::set& target_sections = it->second; + if (target_sections.empty()) { + return absl_ports::InternalError( + "Target sections should not be empty whenever TrimRightMostNode() is " + "called for DocHitInfoIteratorSectionRestrict"); + } if (trimmed_delegate.iterator_ == nullptr) { // TODO(b/228240987): Update TrimmedNode and downstream code to handle // multiple section restricts. - trimmed_delegate.target_section_ = std::move(*target_sections_.begin()); + trimmed_delegate.target_section_ = std::move(*target_sections.begin()); return trimmed_delegate; } trimmed_delegate.iterator_ = - std::make_unique( + std::unique_ptr( + new DocHitInfoIteratorSectionRestrict( std::move(trimmed_delegate.iterator_), &document_store_, - &schema_store_, std::move(target_sections_), current_time_ms_); + &schema_store_, std::move(type_property_filters_), + std::move(type_property_masks_), current_time_ms_)); return std::move(trimmed_delegate); } @@ -126,8 +237,14 @@ int32_t DocHitInfoIteratorSectionRestrict::GetNumLeafAdvanceCalls() const { } std::string DocHitInfoIteratorSectionRestrict::ToString() const { - return absl_ports::StrCat("(", absl_ports::StrJoin(target_sections_, ","), - "): ", delegate_->ToString()); + std::string output = ""; + for (auto it = type_property_filters_.cbegin(); + it != type_property_filters_.cend(); it++) { + std::string paths = absl_ports::StrJoin(it->second, ","); + output += (it->first) + ":" + (paths) + "; "; + } + std::string result = "{" + output.substr(0, output.size() - 2) + "}: "; + return absl_ports::StrCat(result, delegate_->ToString()); } } // namespace lib diff --git a/icing/index/iterator/doc-hit-info-iterator-section-restrict.h 
b/icing/index/iterator/doc-hit-info-iterator-section-restrict.h index 58dd120..5d44ed7 100644 --- a/icing/index/iterator/doc-hit-info-iterator-section-restrict.h +++ b/icing/index/iterator/doc-hit-info-iterator-section-restrict.h @@ -19,10 +19,13 @@ #include #include #include +#include #include "icing/text_classifier/lib3/utils/base/status.h" #include "icing/index/iterator/doc-hit-info-iterator.h" #include "icing/schema/schema-store.h" +#include "icing/schema/section.h" +#include "icing/store/document-filter-data.h" #include "icing/store/document-store.h" namespace icing { @@ -44,6 +47,12 @@ class DocHitInfoIteratorSectionRestrict : public DocHitInfoIterator { const DocumentStore* document_store, const SchemaStore* schema_store, std::set target_sections, int64_t current_time_ms); + explicit DocHitInfoIteratorSectionRestrict( + std::unique_ptr delegate, + const DocumentStore* document_store, const SchemaStore* schema_store, + const SearchSpecProto& search_spec, + int64_t current_time_ms); + libtextclassifier3::Status Advance() override; libtextclassifier3::StatusOr TrimRightMostNode() && override; @@ -72,12 +81,51 @@ class DocHitInfoIteratorSectionRestrict : public DocHitInfoIterator { } private: + explicit DocHitInfoIteratorSectionRestrict( + std::unique_ptr delegate, + const DocumentStore* document_store, const SchemaStore* schema_store, + std::unordered_map> + type_property_filters, + std::unordered_map type_property_masks, + int64_t current_time_ms); + // Calculates the section mask of allowed sections(determined by the property + // filters map) for the given schema type and caches the same for any future + // calls. + // + // Returns: + // - If type_property_filters_ has an entry for the given schema type or + // wildcard(*), return a bitwise or of section IDs in the schema type that + // that are also present in the relevant filter list. + // - Otherwise, return kSectionIdMaskAll. 
+ SectionIdMask ComputeAndCacheSchemaTypeAllowedSectionsMask( + const std::string& schema_type); + // Generates a section mask for the given schema type and the target sections. + // + // Returns: + // - A bitwise or of section IDs in the schema_type that that are also + // present in the target_sections list. + // - If none of the sections in the schema_type are present in the + // target_sections list, return kSectionIdMaskNone. + // This is done by doing a bitwise or of the target section ids for the given + // schema type. + SectionIdMask GenerateSectionMask(const std::string& schema_type, + const std::set& + target_sections) const; + std::unique_ptr delegate_; const DocumentStore& document_store_; const SchemaStore& schema_store_; - - std::set target_sections_; int64_t current_time_ms_; + + // Map of property filters per schema type. Supports wildcard(*) for schema + // type that will apply to all schema types that are not specifically + // specified in the mapping otherwise. + std::unordered_map> + type_property_filters_; + // Mapping of schema type to the section mask of allowed sections for that + // schema type. This section mask is lazily calculated based on the specified + // property filters and cached for any future use. 
+ std::unordered_map type_property_masks_; }; } // namespace lib diff --git a/icing/index/iterator/doc-hit-info-iterator-section-restrict_test.cc b/icing/index/iterator/doc-hit-info-iterator-section-restrict_test.cc index 47f55aa..1500571 100644 --- a/icing/index/iterator/doc-hit-info-iterator-section-restrict_test.cc +++ b/icing/index/iterator/doc-hit-info-iterator-section-restrict_test.cc @@ -186,7 +186,7 @@ TEST_F(DocHitInfoIteratorSectionRestrictTest, EmptyOriginalIterator) { DocHitInfoIteratorSectionRestrict filtered_iterator( std::move(original_iterator_empty), document_store_.get(), - schema_store_.get(), /*target_sections=*/{}, + schema_store_.get(), /*target_sections=*/std::set(), fake_clock_.GetSystemTimeMilliseconds()); EXPECT_THAT(GetDocumentIds(&filtered_iterator), IsEmpty()); @@ -391,7 +391,7 @@ TEST_F(DocHitInfoIteratorSectionRestrictTest, // Create a hit that exists in a different section, so it shouldn't match any // section filters std::vector doc_hit_infos = { - DocHitInfo(document_id, kSectionIdMaskNone << not_matching_section_id)}; + DocHitInfo(document_id, UINT64_C(1) << not_matching_section_id)}; std::unique_ptr original_iterator = std::make_unique(doc_hit_infos); diff --git a/icing/query/query-processor.cc b/icing/query/query-processor.cc index 5e0b696..3e43ad9 100644 --- a/icing/query/query-processor.cc +++ b/icing/query/query-processor.cc @@ -176,6 +176,13 @@ libtextclassifier3::StatusOr QueryProcessor::ParseSearch( results.root_iterator = std::make_unique( std::move(results.root_iterator), &document_store_, &schema_store_, options, current_time_ms); + // TODO(b/294114230): Move this SectionRestrict filter from root level to + // lower levels if that would improve performance. 
+ if (!search_spec.type_property_filters().empty()) { + results.root_iterator = std::make_unique( + std::move(results.root_iterator), &document_store_, &schema_store_, + search_spec, current_time_ms); + } return results; } diff --git a/icing/query/query-processor_test.cc b/icing/query/query-processor_test.cc index 5bbf9ba..fa70de1 100644 --- a/icing/query/query-processor_test.cc +++ b/icing/query/query-processor_test.cc @@ -2646,6 +2646,261 @@ TEST_P(QueryProcessorTest, PropertyFilterTermAndUnrestrictedTerm) { EXPECT_THAT(results.query_terms["foo"], UnorderedElementsAre("animal")); } +TEST_P(QueryProcessorTest, TypePropertyFilter) { + // Create the schema and document store + SchemaProto schema = + SchemaBuilder() + .AddType(SchemaTypeConfigBuilder().SetType("email") + .AddProperty( + PropertyConfigBuilder() + .SetName("foo") + .SetDataTypeString(TERM_MATCH_EXACT, TOKENIZER_PLAIN) + .SetCardinality(CARDINALITY_OPTIONAL)) + .AddProperty( + PropertyConfigBuilder() + .SetName("bar") + .SetDataTypeString(TERM_MATCH_EXACT, TOKENIZER_PLAIN) + .SetCardinality(CARDINALITY_OPTIONAL)) + .AddProperty( + PropertyConfigBuilder() + .SetName("baz") + .SetDataTypeString(TERM_MATCH_EXACT, TOKENIZER_PLAIN) + .SetCardinality(CARDINALITY_OPTIONAL))) + .AddType(SchemaTypeConfigBuilder().SetType("message") + .AddProperty( + PropertyConfigBuilder() + .SetName("foo") + .SetDataTypeString(TERM_MATCH_EXACT, TOKENIZER_PLAIN) + .SetCardinality(CARDINALITY_OPTIONAL)) + .AddProperty( + PropertyConfigBuilder() + .SetName("bar") + .SetDataTypeString(TERM_MATCH_EXACT, TOKENIZER_PLAIN) + .SetCardinality(CARDINALITY_OPTIONAL)) + .AddProperty( + PropertyConfigBuilder() + .SetName("baz") + .SetDataTypeString(TERM_MATCH_EXACT, TOKENIZER_PLAIN) + .SetCardinality(CARDINALITY_OPTIONAL))) + .Build(); + // SectionIds are assigned in ascending order per schema type, + // alphabetically. 
+ int email_bar_section_id = 0; + int email_baz_section_id = 1; + int email_foo_section_id = 2; + int message_bar_section_id = 0; + int message_baz_section_id = 1; + int message_foo_section_id = 2; + ASSERT_THAT(schema_store_->SetSchema( + schema, /*ignore_errors_and_delete_documents=*/false, + /*allow_circular_schema_definitions=*/false), + IsOk()); + + // These documents don't actually match to the tokens in the index. We're + // inserting the documents to get the appropriate number of documents and + // schema types populated. + ICING_ASSERT_OK_AND_ASSIGN(DocumentId email_document_id, + document_store_->Put(DocumentBuilder() + .SetKey("namespace", "1") + .SetSchema("email") + .Build())); + ICING_ASSERT_OK_AND_ASSIGN(DocumentId message_document_id, + document_store_->Put(DocumentBuilder() + .SetKey("namespace", "2") + .SetSchema("message") + .Build())); + + // Poplate the index + TermMatchType::Code term_match_type = TermMatchType::EXACT_ONLY; + + // Email document has content "animal" in all sections + ASSERT_THAT(AddTokenToIndex(email_document_id, email_foo_section_id, + term_match_type, "animal"), + IsOk()); + ASSERT_THAT(AddTokenToIndex(email_document_id, email_bar_section_id, + term_match_type, "animal"), + IsOk()); + ASSERT_THAT(AddTokenToIndex(email_document_id, email_baz_section_id, + term_match_type, "animal"), + IsOk()); + + // Message document has content "animal" in all sections + ASSERT_THAT(AddTokenToIndex(message_document_id, message_foo_section_id, + term_match_type, "animal"), + IsOk()); + ASSERT_THAT(AddTokenToIndex(message_document_id, message_bar_section_id, + term_match_type, "animal"), + IsOk()); + ASSERT_THAT(AddTokenToIndex(message_document_id, message_baz_section_id, + term_match_type, "animal"), + IsOk()); + + SearchSpecProto search_spec; + search_spec.set_query("animal"); + search_spec.set_term_match_type(term_match_type); + search_spec.set_search_type(GetParam()); + + // email has property filters for foo and baz properties + 
TypePropertyMask *email_mask = search_spec.add_type_property_filters(); + email_mask->set_schema_type("email"); + email_mask->add_paths("foo"); + email_mask->add_paths("baz"); + + // message has property filters for bar and baz properties + TypePropertyMask *message_mask = search_spec.add_type_property_filters(); + message_mask->set_schema_type("message"); + message_mask->add_paths("bar"); + message_mask->add_paths("baz"); + + ICING_ASSERT_OK_AND_ASSIGN( + QueryResults results, + query_processor_->ParseSearch( + search_spec, ScoringSpecProto::RankingStrategy::RELEVANCE_SCORE, + fake_clock_.GetSystemTimeMilliseconds())); + + // Ordered by descending DocumentId, so message comes first since it was + // inserted last + DocHitInfo expected_doc_hit_info1(message_document_id); + expected_doc_hit_info1.UpdateSection(message_bar_section_id); + expected_doc_hit_info1.UpdateSection(message_baz_section_id); + DocHitInfo expected_doc_hit_info2(email_document_id); + expected_doc_hit_info2.UpdateSection(email_foo_section_id); + expected_doc_hit_info2.UpdateSection(email_baz_section_id); + EXPECT_THAT(GetDocHitInfos(results.root_iterator.get()), + ElementsAre(expected_doc_hit_info1, expected_doc_hit_info2)); + EXPECT_THAT(results.query_term_iterators, SizeIs(1)); + + EXPECT_THAT(results.query_terms, SizeIs(1)); + EXPECT_THAT(results.query_terms[""], UnorderedElementsAre("animal")); +} + +TEST_P(QueryProcessorTest, TypePropertyFilterWithSectionRestrict) { + // Create the schema and document store + SchemaProto schema = + SchemaBuilder() + .AddType(SchemaTypeConfigBuilder().SetType("email") + .AddProperty( + PropertyConfigBuilder() + .SetName("foo") + .SetDataTypeString(TERM_MATCH_EXACT, TOKENIZER_PLAIN) + .SetCardinality(CARDINALITY_OPTIONAL)) + .AddProperty( + PropertyConfigBuilder() + .SetName("bar") + .SetDataTypeString(TERM_MATCH_EXACT, TOKENIZER_PLAIN) + .SetCardinality(CARDINALITY_OPTIONAL)) + .AddProperty( + PropertyConfigBuilder() + .SetName("baz") + 
.SetDataTypeString(TERM_MATCH_EXACT, TOKENIZER_PLAIN) + .SetCardinality(CARDINALITY_OPTIONAL))) + .AddType(SchemaTypeConfigBuilder().SetType("message") + .AddProperty( + PropertyConfigBuilder() + .SetName("foo") + .SetDataTypeString(TERM_MATCH_EXACT, TOKENIZER_PLAIN) + .SetCardinality(CARDINALITY_OPTIONAL)) + .AddProperty( + PropertyConfigBuilder() + .SetName("bar") + .SetDataTypeString(TERM_MATCH_EXACT, TOKENIZER_PLAIN) + .SetCardinality(CARDINALITY_OPTIONAL)) + .AddProperty( + PropertyConfigBuilder() + .SetName("baz") + .SetDataTypeString(TERM_MATCH_EXACT, TOKENIZER_PLAIN) + .SetCardinality(CARDINALITY_OPTIONAL))) + .Build(); + // SectionIds are assigned in ascending order per schema type, + // alphabetically. + int email_bar_section_id = 0; + int email_baz_section_id = 1; + int email_foo_section_id = 2; + int message_bar_section_id = 0; + int message_baz_section_id = 1; + int message_foo_section_id = 2; + ASSERT_THAT(schema_store_->SetSchema( + schema, /*ignore_errors_and_delete_documents=*/false, + /*allow_circular_schema_definitions=*/false), + IsOk()); + + // These documents don't actually match to the tokens in the index. We're + // inserting the documents to get the appropriate number of documents and + // schema types populated. 
+ ICING_ASSERT_OK_AND_ASSIGN(DocumentId email_document_id, + document_store_->Put(DocumentBuilder() + .SetKey("namespace", "1") + .SetSchema("email") + .Build())); + ICING_ASSERT_OK_AND_ASSIGN(DocumentId message_document_id, + document_store_->Put(DocumentBuilder() + .SetKey("namespace", "2") + .SetSchema("message") + .Build())); + + // Poplate the index + TermMatchType::Code term_match_type = TermMatchType::EXACT_ONLY; + + // Email document has content "animal" in all sections + ASSERT_THAT(AddTokenToIndex(email_document_id, email_foo_section_id, + term_match_type, "animal"), + IsOk()); + ASSERT_THAT(AddTokenToIndex(email_document_id, email_bar_section_id, + term_match_type, "animal"), + IsOk()); + ASSERT_THAT(AddTokenToIndex(email_document_id, email_baz_section_id, + term_match_type, "animal"), + IsOk()); + + // Message document has content "animal" in all sections + ASSERT_THAT(AddTokenToIndex(message_document_id, message_foo_section_id, + term_match_type, "animal"), + IsOk()); + ASSERT_THAT(AddTokenToIndex(message_document_id, message_bar_section_id, + term_match_type, "animal"), + IsOk()); + ASSERT_THAT(AddTokenToIndex(message_document_id, message_baz_section_id, + term_match_type, "animal"), + IsOk()); + + SearchSpecProto search_spec; + // Create a section filter '
:' + search_spec.set_query("foo:animal"); + search_spec.set_term_match_type(term_match_type); + search_spec.set_search_type(GetParam()); + + // email has property filters for foo and baz properties + TypePropertyMask *email_mask = search_spec.add_type_property_filters(); + email_mask->set_schema_type("email"); + email_mask->add_paths("foo"); + email_mask->add_paths("baz"); + + // message has property filters for bar and baz properties + TypePropertyMask *message_mask = search_spec.add_type_property_filters(); + message_mask->set_schema_type("message"); + message_mask->add_paths("bar"); + message_mask->add_paths("baz"); + + ICING_ASSERT_OK_AND_ASSIGN( + QueryResults results, + query_processor_->ParseSearch( + search_spec, ScoringSpecProto::RankingStrategy::RELEVANCE_SCORE, + fake_clock_.GetSystemTimeMilliseconds())); + + // Only hits in sections allowed by both the property filters and section + // restricts should be returned. Message document should not be returned since + // section foo specified in the section restrict is not allowed by the + // property filters. 
+ DocHitInfo expected_doc_hit_info(email_document_id); + expected_doc_hit_info.UpdateSection(email_foo_section_id); + EXPECT_THAT(GetDocHitInfos(results.root_iterator.get()), + ElementsAre(expected_doc_hit_info)); + EXPECT_THAT(results.query_term_iterators, SizeIs(1)); + + EXPECT_THAT(results.query_terms, SizeIs(1)); + EXPECT_THAT(results.query_terms["foo"], UnorderedElementsAre("animal")); +} + TEST_P(QueryProcessorTest, DocumentBeforeTtlNotFilteredOut) { // Create the schema and document store SchemaProto schema = SchemaBuilder() diff --git a/icing/schema/schema-store.cc b/icing/schema/schema-store.cc index bcacdce..a389d13 100644 --- a/icing/schema/schema-store.cc +++ b/icing/schema/schema-store.cc @@ -780,6 +780,17 @@ libtextclassifier3::StatusOr SchemaStore::GetSchemaTypeId( return schema_type_mapper_->Get(schema_type); } +libtextclassifier3::StatusOr SchemaStore::GetSchemaType( + SchemaTypeId schema_type_id) const { + ICING_RETURN_IF_ERROR(CheckSchemaSet()); + if (const auto it = reverse_schema_type_mapper_.find(schema_type_id); + it == reverse_schema_type_mapper_.end()) { + return absl_ports::InvalidArgumentError("Invalid schema type id"); + } else { + return &it->second; + } +} + libtextclassifier3::StatusOr*> SchemaStore::GetSchemaTypeIdsWithChildren(std::string_view schema_type) const { ICING_ASSIGN_OR_RETURN(SchemaTypeId schema_type_id, diff --git a/icing/schema/schema-store.h b/icing/schema/schema-store.h index 6075f5b..88968b1 100644 --- a/icing/schema/schema-store.h +++ b/icing/schema/schema-store.h @@ -276,6 +276,15 @@ class SchemaStore { libtextclassifier3::StatusOr GetSchemaTypeConfig(std::string_view schema_type) const; + // Returns the schema type of the passed in SchemaTypeId + // + // Returns: + // schema type on success + // FAILED_PRECONDITION if schema hasn't been set yet + // INVALID_ARGUMENT if schema type id is invalid + libtextclassifier3::StatusOr GetSchemaType( + SchemaTypeId schema_type_id) const; + // Returns the SchemaTypeId of the 
passed in schema type // // Returns: diff --git a/proto/icing/proto/search.proto b/proto/icing/proto/search.proto index 411ad3e..7f4fb3e 100644 --- a/proto/icing/proto/search.proto +++ b/proto/icing/proto/search.proto @@ -27,7 +27,7 @@ option java_multiple_files = true; option objc_class_prefix = "ICNG"; // Client-supplied specifications on what documents to retrieve. -// Next tag: 10 +// Next tag: 11 message SearchSpecProto { // REQUIRED: The "raw" query string that users may type. For example, "cat" // will search for documents with the term cat in it. @@ -102,6 +102,16 @@ message SearchSpecProto { // Finer-grained locks are implemented around code paths that write changes to // Icing during Search. optional bool use_read_only_search = 9 [default = true]; + + // TODO(b/294266822): Handle multiple property filter lists for same schema + // type. + // How to specify a subset of properties to be searched. If no type property + // filter has been specified for a schema type (no TypePropertyMask for the + // given schema type), then *all* properties of that schema type will be + // searched. If an empty property filter is specified for a given schema type + // (TypePropertyMask for the given schema type has empty paths field), no + // properties of that schema type will be searched. + repeated TypePropertyMask type_property_filters = 10; } // Client-supplied specifications on what to include/how to format the search diff --git a/synced_AOSP_CL_number.txt b/synced_AOSP_CL_number.txt index 58918d3..3ec2c2d 100644 --- a/synced_AOSP_CL_number.txt +++ b/synced_AOSP_CL_number.txt @@ -1 +1 @@ -set(synced_AOSP_CL_number=548180296) +set(synced_AOSP_CL_number=553570527) -- cgit v1.2.3 From 8c71e61d02944611249c892236e67c6acace8a2d Mon Sep 17 00:00:00 2001 From: Tim Barron Date: Wed, 23 Aug 2023 15:46:45 -0700 Subject: Update Icing from upstream. 
Descriptions: ======================================================================== Fix term frequency bug ======================================================================== Delete dead JNI functions. ======================================================================== Switch Icing JNI implementation to use RegisterNatives ======================================================================== Avoid unnecessary GetObjectClass and GetFieldID calls. ======================================================================== Bug: 296938196 Change-Id: I773bcc05e768a7ac80f1950985b618047d009641 --- .../index/main/doc-hit-info-iterator-term-main.cc | 86 ++-- icing/index/main/doc-hit-info-iterator-term-main.h | 37 +- icing/index/main/main-index_test.cc | 64 ++- icing/index/numeric/dummy-numeric-index.h | 18 +- icing/jni/icing-search-engine-jni.cc | 465 +++++++-------------- synced_AOSP_CL_number.txt | 2 +- 6 files changed, 292 insertions(+), 380 deletions(-) diff --git a/icing/index/main/doc-hit-info-iterator-term-main.cc b/icing/index/main/doc-hit-info-iterator-term-main.cc index 8f0d3f5..5cf6a4c 100644 --- a/icing/index/main/doc-hit-info-iterator-term-main.cc +++ b/icing/index/main/doc-hit-info-iterator-term-main.cc @@ -14,16 +14,20 @@ #include "icing/index/main/doc-hit-info-iterator-term-main.h" -#include #include +#include +#include +#include +#include #include "icing/text_classifier/lib3/utils/base/status.h" #include "icing/absl_ports/canonical_errors.h" #include "icing/absl_ports/str_cat.h" -#include "icing/file/posting_list/posting-list-identifier.h" #include "icing/index/hit/doc-hit-info.h" +#include "icing/index/hit/hit.h" +#include "icing/index/iterator/doc-hit-info-iterator.h" +#include "icing/index/main/main-index.h" #include "icing/index/main/posting-list-hit-accessor.h" -#include "icing/legacy/core/icing-string-util.h" #include "icing/schema/section.h" #include "icing/store/document-id.h" #include "icing/util/logging.h" @@ -44,6 +48,30 @@ 
std::string SectionIdMaskToString(SectionIdMask section_id_mask) { return mask; } +void MergeNewHitIntoCachedDocHitInfos( + const Hit& hit, bool need_hit_term_frequency, + std::vector& + cached_doc_hit_infos_out) { + if (cached_doc_hit_infos_out.empty() || + hit.document_id() != + cached_doc_hit_infos_out.back().doc_hit_info.document_id()) { + std::optional tf_arr; + if (need_hit_term_frequency) { + tf_arr = std::make_optional(); + } + + cached_doc_hit_infos_out.push_back( + DocHitInfoIteratorTermMain::DocHitInfoAndTermFrequencyArray( + DocHitInfo(hit.document_id()), std::move(tf_arr))); + } + + cached_doc_hit_infos_out.back().doc_hit_info.UpdateSection(hit.section_id()); + if (need_hit_term_frequency) { + (*cached_doc_hit_infos_out.back().term_frequency_array)[hit.section_id()] = + hit.term_frequency(); + } +} + } // namespace libtextclassifier3::Status DocHitInfoIteratorTermMain::Advance() { @@ -76,7 +104,8 @@ libtextclassifier3::Status DocHitInfoIteratorTermMain::Advance() { return absl_ports::ResourceExhaustedError( "No more DocHitInfos in iterator"); } - doc_hit_info_ = cached_doc_hit_infos_.at(cached_doc_hit_infos_idx_); + doc_hit_info_ = + cached_doc_hit_infos_.at(cached_doc_hit_infos_idx_).doc_hit_info; hit_intersect_section_ids_mask_ = doc_hit_info_.hit_section_ids_mask(); return libtextclassifier3::Status::OK; } @@ -90,16 +119,16 @@ DocHitInfoIteratorTermMain::TrimRightMostNode() && { } libtextclassifier3::Status DocHitInfoIteratorTermMainExact::RetrieveMoreHits() { - DocHitInfo last_doc_hit_info; + DocHitInfoAndTermFrequencyArray last_doc_hit_info; if (!cached_doc_hit_infos_.empty()) { - last_doc_hit_info = cached_doc_hit_infos_.back(); + last_doc_hit_info = std::move(cached_doc_hit_infos_.back()); } cached_doc_hit_infos_idx_ = 0; cached_doc_hit_infos_.clear(); - if (last_doc_hit_info.document_id() != kInvalidDocumentId) { + if (last_doc_hit_info.doc_hit_info.document_id() != kInvalidDocumentId) { // Carry over the last hit. 
It might need to be merged with the first hit of // of the next posting list in the chain. - cached_doc_hit_infos_.push_back(last_doc_hit_info); + cached_doc_hit_infos_.push_back(std::move(last_doc_hit_info)); } if (posting_list_accessor_ == nullptr) { ICING_ASSIGN_OR_RETURN(posting_list_accessor_, @@ -112,8 +141,7 @@ libtextclassifier3::Status DocHitInfoIteratorTermMainExact::RetrieveMoreHits() { all_pages_consumed_ = true; } ++num_blocks_inspected_; - cached_doc_hit_infos_.reserve(hits.size() + 1); - cached_hit_term_frequency_.reserve(hits.size() + 1); + cached_doc_hit_infos_.reserve(cached_doc_hit_infos_.size() + hits.size()); for (const Hit& hit : hits) { // Check sections. if (((UINT64_C(1) << hit.section_id()) & section_restrict_mask_) == 0) { @@ -123,13 +151,9 @@ libtextclassifier3::Status DocHitInfoIteratorTermMainExact::RetrieveMoreHits() { if (hit.is_prefix_hit()) { continue; } - if (cached_doc_hit_infos_.empty() || - hit.document_id() != cached_doc_hit_infos_.back().document_id()) { - cached_doc_hit_infos_.push_back(DocHitInfo(hit.document_id())); - cached_hit_term_frequency_.push_back(Hit::TermFrequencyArray()); - } - cached_doc_hit_infos_.back().UpdateSection(hit.section_id()); - cached_hit_term_frequency_.back()[hit.section_id()] = hit.term_frequency(); + + MergeNewHitIntoCachedDocHitInfos(hit, need_hit_term_frequency_, + cached_doc_hit_infos_); } return libtextclassifier3::Status::OK; } @@ -141,16 +165,16 @@ std::string DocHitInfoIteratorTermMainExact::ToString() const { libtextclassifier3::Status DocHitInfoIteratorTermMainPrefix::RetrieveMoreHits() { - DocHitInfo last_doc_hit_info; + DocHitInfoAndTermFrequencyArray last_doc_hit_info; if (!cached_doc_hit_infos_.empty()) { - last_doc_hit_info = cached_doc_hit_infos_.back(); + last_doc_hit_info = std::move(cached_doc_hit_infos_.back()); } cached_doc_hit_infos_idx_ = 0; cached_doc_hit_infos_.clear(); - if (last_doc_hit_info.document_id() != kInvalidDocumentId) { + if 
(last_doc_hit_info.doc_hit_info.document_id() != kInvalidDocumentId) { // Carry over the last hit. It might need to be merged with the first hit of // of the next posting list in the chain. - cached_doc_hit_infos_.push_back(last_doc_hit_info); + cached_doc_hit_infos_.push_back(std::move(last_doc_hit_info)); } ++num_blocks_inspected_; @@ -165,10 +189,7 @@ DocHitInfoIteratorTermMainPrefix::RetrieveMoreHits() { if (hits.empty()) { all_pages_consumed_ = true; } - cached_doc_hit_infos_.reserve(hits.size()); - if (need_hit_term_frequency_) { - cached_hit_term_frequency_.reserve(hits.size()); - } + cached_doc_hit_infos_.reserve(cached_doc_hit_infos_.size() + hits.size()); for (const Hit& hit : hits) { // Check sections. if (((UINT64_C(1) << hit.section_id()) & section_restrict_mask_) == 0) { @@ -178,18 +199,9 @@ DocHitInfoIteratorTermMainPrefix::RetrieveMoreHits() { if (!exact_ && !hit.is_in_prefix_section()) { continue; } - if (cached_doc_hit_infos_.empty() || - hit.document_id() != cached_doc_hit_infos_.back().document_id()) { - cached_doc_hit_infos_.push_back(DocHitInfo(hit.document_id())); - if (need_hit_term_frequency_) { - cached_hit_term_frequency_.push_back(Hit::TermFrequencyArray()); - } - } - cached_doc_hit_infos_.back().UpdateSection(hit.section_id()); - if (need_hit_term_frequency_) { - cached_hit_term_frequency_.back()[hit.section_id()] = - hit.term_frequency(); - } + + MergeNewHitIntoCachedDocHitInfos(hit, need_hit_term_frequency_, + cached_doc_hit_infos_); } return libtextclassifier3::Status::OK; } diff --git a/icing/index/main/doc-hit-info-iterator-term-main.h b/icing/index/main/doc-hit-info-iterator-term-main.h index 08a385c..1987e12 100644 --- a/icing/index/main/doc-hit-info-iterator-term-main.h +++ b/icing/index/main/doc-hit-info-iterator-term-main.h @@ -17,10 +17,14 @@ #include #include +#include +#include +#include #include #include "icing/text_classifier/lib3/utils/base/status.h" #include "icing/index/hit/doc-hit-info.h" +#include 
"icing/index/hit/hit.h" #include "icing/index/iterator/doc-hit-info-iterator.h" #include "icing/index/main/main-index.h" #include "icing/index/main/posting-list-hit-accessor.h" @@ -31,6 +35,19 @@ namespace lib { class DocHitInfoIteratorTermMain : public DocHitInfoIterator { public: + struct DocHitInfoAndTermFrequencyArray { + DocHitInfo doc_hit_info; + std::optional term_frequency_array; + + explicit DocHitInfoAndTermFrequencyArray() = default; + + explicit DocHitInfoAndTermFrequencyArray( + DocHitInfo doc_hit_info_in, + std::optional term_frequency_array_in) + : doc_hit_info(std::move(doc_hit_info_in)), + term_frequency_array(std::move(term_frequency_array_in)) {} + }; + explicit DocHitInfoIteratorTermMain(MainIndex* main_index, const std::string& term, int term_start_index, @@ -74,8 +91,9 @@ class DocHitInfoIteratorTermMain : public DocHitInfoIterator { while (section_mask_copy) { SectionId section_id = __builtin_ctzll(section_mask_copy); if (need_hit_term_frequency_) { - section_term_frequencies.at(section_id) = cached_hit_term_frequency_.at( - cached_doc_hit_infos_idx_)[section_id]; + section_term_frequencies.at(section_id) = + (*cached_doc_hit_infos_.at(cached_doc_hit_infos_idx_) + .term_frequency_array)[section_id]; } section_mask_copy &= ~(UINT64_C(1) << section_id); } @@ -106,12 +124,13 @@ class DocHitInfoIteratorTermMain : public DocHitInfoIterator { std::unique_ptr posting_list_accessor_; MainIndex* main_index_; - // Stores hits retrieved from the index. This may only be a subset of the hits - // that are present in the index. Current value pointed to by the Iterator is - // tracked by cached_doc_hit_infos_idx_. - std::vector cached_doc_hit_infos_; - std::vector cached_hit_term_frequency_; + // Stores hits and optional term frequency arrays retrieved from the index. + // This may only be a subset of the hits that are present in the index. + // Current value pointed to by the Iterator is tracked by + // cached_doc_hit_infos_idx_. 
+ std::vector cached_doc_hit_infos_; int cached_doc_hit_infos_idx_; + int num_advance_calls_; int num_blocks_inspected_; bool all_pages_consumed_; @@ -168,10 +187,6 @@ class DocHitInfoIteratorTermMainPrefix : public DocHitInfoIteratorTermMain { libtextclassifier3::Status RetrieveMoreHits() override; private: - // After retrieving DocHitInfos from the index, a DocHitInfo for docid 1 and - // "foo" and a DocHitInfo for docid 1 and "fool". These DocHitInfos should be - // merged. - void SortAndDedupeDocumentIds(); // Whether or not posting_list_accessor_ holds a posting list chain for // 'term' or for a term for which 'term' is a prefix. This is necessary to // determine whether to return hits that are not from a prefix section (hits diff --git a/icing/index/main/main-index_test.cc b/icing/index/main/main-index_test.cc index ac724b0..c59a3c8 100644 --- a/icing/index/main/main-index_test.cc +++ b/icing/index/main/main-index_test.cc @@ -38,6 +38,7 @@ namespace lib { namespace { using ::testing::ElementsAre; +using ::testing::Eq; using ::testing::IsEmpty; using ::testing::NiceMock; using ::testing::Return; @@ -531,30 +532,35 @@ TEST_F(MainIndexTest, PrefixNotRetrievedInExactSearch) { std::vector{doc1_hit.section_id()}))); } -TEST_F(MainIndexTest, SearchChainedPostingLists) { +TEST_F(MainIndexTest, + SearchChainedPostingListsShouldMergeSectionsAndTermFrequency) { // Index 2048 document with 3 hits in each document. When merged into the main // index, this will 1) lead to a chained posting list and 2) split at least // one document's hits across multiple posting lists. 
+ const std::string term = "foot"; + ICING_ASSERT_OK_AND_ASSIGN( uint32_t tvi, - lite_index_->InsertTerm("foot", TermMatchType::EXACT_ONLY, kNamespace0)); + lite_index_->InsertTerm(term, TermMatchType::EXACT_ONLY, kNamespace0)); ICING_ASSERT_OK_AND_ASSIGN(uint32_t foot_term_id, term_id_codec_->EncodeTvi(tvi, TviType::LITE)); for (DocumentId document_id = 0; document_id < 2048; ++document_id) { - Hit doc_hit0(/*section_id=*/0, /*document_id=*/document_id, - Hit::kDefaultTermFrequency, - /*is_in_prefix_section=*/false); + Hit::TermFrequency term_frequency = static_cast( + document_id % Hit::kMaxTermFrequency + 1); + Hit doc_hit0( + /*section_id=*/0, /*document_id=*/document_id, term_frequency, + /*is_in_prefix_section=*/false); ICING_ASSERT_OK(lite_index_->AddHit(foot_term_id, doc_hit0)); - Hit doc_hit1(/*section_id=*/1, /*document_id=*/document_id, - Hit::kDefaultTermFrequency, - /*is_in_prefix_section=*/false); + Hit doc_hit1( + /*section_id=*/1, /*document_id=*/document_id, term_frequency, + /*is_in_prefix_section=*/false); ICING_ASSERT_OK(lite_index_->AddHit(foot_term_id, doc_hit1)); - Hit doc_hit2(/*section_id=*/2, /*document_id=*/document_id, - Hit::kDefaultTermFrequency, - /*is_in_prefix_section=*/false); + Hit doc_hit2( + /*section_id=*/2, /*document_id=*/document_id, term_frequency, + /*is_in_prefix_section=*/false); ICING_ASSERT_OK(lite_index_->AddHit(foot_term_id, doc_hit2)); } @@ -568,15 +574,35 @@ TEST_F(MainIndexTest, SearchChainedPostingLists) { // 3. Merge the lite index. ICING_ASSERT_OK(Merge(*lite_index_, *term_id_codec_, main_index.get())); // Get hits for all documents containing "foot" - which should be all of them. 
- std::vector hits = - GetExactHits(main_index.get(), /*term_start_index=*/0, - /*unnormalized_term_length=*/0, "foot"); - EXPECT_THAT(hits, SizeIs(2048)); - EXPECT_THAT(hits.front(), - EqualsDocHitInfo(2047, std::vector{0, 1, 2})); - EXPECT_THAT(hits.back(), - EqualsDocHitInfo(0, std::vector{0, 1, 2})); + auto iterator = std::make_unique( + main_index.get(), term, /*term_start_index=*/0, + /*unnormalized_term_length=*/0, kSectionIdMaskAll, + /*need_hit_term_frequency=*/true); + + DocumentId expected_document_id = 2047; + while (iterator->Advance().ok()) { + EXPECT_THAT(iterator->doc_hit_info(), + EqualsDocHitInfo(expected_document_id, + std::vector{0, 1, 2})); + + std::vector matched_terms_stats; + iterator->PopulateMatchedTermsStats(&matched_terms_stats); + + Hit::TermFrequency expected_term_frequency = + static_cast( + expected_document_id % Hit::kMaxTermFrequency + 1); + ASSERT_THAT(matched_terms_stats, SizeIs(1)); + EXPECT_THAT(matched_terms_stats[0].term, Eq(term)); + EXPECT_THAT(matched_terms_stats[0].term_frequencies[0], + Eq(expected_term_frequency)); + EXPECT_THAT(matched_terms_stats[0].term_frequencies[1], + Eq(expected_term_frequency)); + EXPECT_THAT(matched_terms_stats[0].term_frequencies[2], + Eq(expected_term_frequency)); + --expected_document_id; + } + EXPECT_THAT(expected_document_id, Eq(-1)); } TEST_F(MainIndexTest, MergeIndexBackfilling) { diff --git a/icing/index/numeric/dummy-numeric-index.h b/icing/index/numeric/dummy-numeric-index.h index 34b1acb..ce5fa45 100644 --- a/icing/index/numeric/dummy-numeric-index.h +++ b/icing/index/numeric/dummy-numeric-index.h @@ -183,7 +183,11 @@ class DummyNumericIndex : public NumericIndex { std::string&& working_path) : NumericIndex(filesystem, std::move(working_path), PersistentStorage::WorkingPathType::kDummy), - last_added_document_id_(kInvalidDocumentId) {} + dummy_crcs_buffer_( + std::make_unique(sizeof(PersistentStorage::Crcs))), + last_added_document_id_(kInvalidDocumentId) { + 
memset(dummy_crcs_buffer_.get(), 0, sizeof(PersistentStorage::Crcs)); + } libtextclassifier3::Status PersistStoragesToDisk(bool force) override { return libtextclassifier3::Status::OK; @@ -202,11 +206,17 @@ class DummyNumericIndex : public NumericIndex { return Crc32(0); } - PersistentStorage::Crcs& crcs() override { return dummy_crcs_; } - const PersistentStorage::Crcs& crcs() const override { return dummy_crcs_; } + PersistentStorage::Crcs& crcs() override { + return *reinterpret_cast( + dummy_crcs_buffer_.get()); + } + const PersistentStorage::Crcs& crcs() const override { + return *reinterpret_cast( + dummy_crcs_buffer_.get()); + } std::unordered_map>> storage_; - PersistentStorage::Crcs dummy_crcs_; + std::unique_ptr dummy_crcs_buffer_; DocumentId last_added_document_id_; }; diff --git a/icing/jni/icing-search-engine-jni.cc b/icing/jni/icing-search-engine-jni.cc index f2a33e0..a0883fa 100644 --- a/icing/jni/icing-search-engine-jni.cc +++ b/icing/jni/icing-search-engine-jni.cc @@ -36,10 +36,6 @@ namespace { -// JNI string constants -// Matches field name of IcingSearchEngine#nativePointer. 
-const char kNativePointerField[] = "nativePointer"; - bool ParseProtoFromJniByteArray(JNIEnv* env, jbyteArray bytes, google::protobuf::MessageLite* protobuf) { icing::lib::ScopedPrimitiveArrayCritical scoped_array(env, bytes); @@ -61,11 +57,14 @@ jbyteArray SerializeProtoToJniByteArray(JNIEnv* env, return ret; } +struct { + jfieldID native_pointer; +} JavaIcingSearchEngineImpl; + icing::lib::IcingSearchEngine* GetIcingSearchEnginePointer(JNIEnv* env, jobject object) { - jclass cls = env->GetObjectClass(object); - jfieldID field_id = env->GetFieldID(cls, kNativePointerField, "J"); - jlong native_pointer = env->GetLongField(object, field_id); + jlong native_pointer = + env->GetLongField(object, JavaIcingSearchEngineImpl.native_pointer); return reinterpret_cast(native_pointer); } @@ -73,19 +72,8 @@ icing::lib::IcingSearchEngine* GetIcingSearchEnginePointer(JNIEnv* env, extern "C" { -jint JNI_OnLoad(JavaVM* vm, void* reserved) { - JNIEnv* env; - if (vm->GetEnv(reinterpret_cast(&env), JNI_VERSION_1_6) != JNI_OK) { - ICING_LOG(icing::lib::ERROR) << "ERROR: GetEnv failed"; - return JNI_ERR; - } - - return JNI_VERSION_1_6; -} - -JNIEXPORT jlong JNICALL -Java_com_google_android_icing_IcingSearchEngineImpl_nativeCreate( - JNIEnv* env, jclass clazz, jbyteArray icing_search_engine_options_bytes) { +jlong nativeCreate(JNIEnv* env, jclass clazz, + jbyteArray icing_search_engine_options_bytes) { icing::lib::IcingSearchEngineOptions options; if (!ParseProtoFromJniByteArray(env, icing_search_engine_options_bytes, &options)) { @@ -103,17 +91,13 @@ Java_com_google_android_icing_IcingSearchEngineImpl_nativeCreate( return reinterpret_cast(icing); } -JNIEXPORT void JNICALL -Java_com_google_android_icing_IcingSearchEngineImpl_nativeDestroy( - JNIEnv* env, jclass clazz, jobject object) { +void nativeDestroy(JNIEnv* env, jclass clazz, jobject object) { icing::lib::IcingSearchEngine* icing = GetIcingSearchEnginePointer(env, object); delete icing; } -JNIEXPORT jbyteArray JNICALL 
-Java_com_google_android_icing_IcingSearchEngineImpl_nativeInitialize( - JNIEnv* env, jclass clazz, jobject object) { +jbyteArray nativeInitialize(JNIEnv* env, jclass clazz, jobject object) { icing::lib::IcingSearchEngine* icing = GetIcingSearchEnginePointer(env, object); @@ -123,10 +107,9 @@ Java_com_google_android_icing_IcingSearchEngineImpl_nativeInitialize( return SerializeProtoToJniByteArray(env, initialize_result_proto); } -JNIEXPORT jbyteArray JNICALL -Java_com_google_android_icing_IcingSearchEngineImpl_nativeSetSchema( - JNIEnv* env, jclass clazz, jobject object, jbyteArray schema_bytes, - jboolean ignore_errors_and_delete_documents) { +jbyteArray nativeSetSchema(JNIEnv* env, jclass clazz, jobject object, + jbyteArray schema_bytes, + jboolean ignore_errors_and_delete_documents) { icing::lib::IcingSearchEngine* icing = GetIcingSearchEnginePointer(env, object); @@ -143,9 +126,7 @@ Java_com_google_android_icing_IcingSearchEngineImpl_nativeSetSchema( return SerializeProtoToJniByteArray(env, set_schema_result_proto); } -JNIEXPORT jbyteArray JNICALL -Java_com_google_android_icing_IcingSearchEngineImpl_nativeGetSchema( - JNIEnv* env, jclass clazz, jobject object) { +jbyteArray nativeGetSchema(JNIEnv* env, jclass clazz, jobject object) { icing::lib::IcingSearchEngine* icing = GetIcingSearchEnginePointer(env, object); @@ -154,9 +135,8 @@ Java_com_google_android_icing_IcingSearchEngineImpl_nativeGetSchema( return SerializeProtoToJniByteArray(env, get_schema_result_proto); } -JNIEXPORT jbyteArray JNICALL -Java_com_google_android_icing_IcingSearchEngineImpl_nativeGetSchemaType( - JNIEnv* env, jclass clazz, jobject object, jstring schema_type) { +jbyteArray nativeGetSchemaType(JNIEnv* env, jclass clazz, jobject object, + jstring schema_type) { icing::lib::IcingSearchEngine* icing = GetIcingSearchEnginePointer(env, object); @@ -167,9 +147,8 @@ Java_com_google_android_icing_IcingSearchEngineImpl_nativeGetSchemaType( return SerializeProtoToJniByteArray(env, 
get_schema_type_result_proto); } -JNIEXPORT jbyteArray JNICALL -Java_com_google_android_icing_IcingSearchEngineImpl_nativePut( - JNIEnv* env, jclass clazz, jobject object, jbyteArray document_bytes) { +jbyteArray nativePut(JNIEnv* env, jclass clazz, jobject object, + jbyteArray document_bytes) { icing::lib::IcingSearchEngine* icing = GetIcingSearchEnginePointer(env, object); @@ -186,10 +165,9 @@ Java_com_google_android_icing_IcingSearchEngineImpl_nativePut( return SerializeProtoToJniByteArray(env, put_result_proto); } -JNIEXPORT jbyteArray JNICALL -Java_com_google_android_icing_IcingSearchEngineImpl_nativeGet( - JNIEnv* env, jclass clazz, jobject object, jstring name_space, jstring uri, - jbyteArray result_spec_bytes) { +jbyteArray nativeGet(JNIEnv* env, jclass clazz, jobject object, + jstring name_space, jstring uri, + jbyteArray result_spec_bytes) { icing::lib::IcingSearchEngine* icing = GetIcingSearchEnginePointer(env, object); @@ -208,9 +186,8 @@ Java_com_google_android_icing_IcingSearchEngineImpl_nativeGet( return SerializeProtoToJniByteArray(env, get_result_proto); } -JNIEXPORT jbyteArray JNICALL -Java_com_google_android_icing_IcingSearchEngineImpl_nativeReportUsage( - JNIEnv* env, jclass clazz, jobject object, jbyteArray usage_report_bytes) { +jbyteArray nativeReportUsage(JNIEnv* env, jclass clazz, jobject object, + jbyteArray usage_report_bytes) { icing::lib::IcingSearchEngine* icing = GetIcingSearchEnginePointer(env, object); @@ -227,9 +204,7 @@ Java_com_google_android_icing_IcingSearchEngineImpl_nativeReportUsage( return SerializeProtoToJniByteArray(env, report_usage_result_proto); } -JNIEXPORT jbyteArray JNICALL -Java_com_google_android_icing_IcingSearchEngineImpl_nativeGetAllNamespaces( - JNIEnv* env, jclass clazz, jobject object) { +jbyteArray nativeGetAllNamespaces(JNIEnv* env, jclass clazz, jobject object) { icing::lib::IcingSearchEngine* icing = GetIcingSearchEnginePointer(env, object); @@ -239,10 +214,9 @@ 
Java_com_google_android_icing_IcingSearchEngineImpl_nativeGetAllNamespaces( return SerializeProtoToJniByteArray(env, get_all_namespaces_result_proto); } -JNIEXPORT jbyteArray JNICALL -Java_com_google_android_icing_IcingSearchEngineImpl_nativeGetNextPage( - JNIEnv* env, jclass clazz, jobject object, jlong next_page_token, - jlong java_to_native_start_timestamp_ms) { +jbyteArray nativeGetNextPage(JNIEnv* env, jclass clazz, jobject object, + jlong next_page_token, + jlong java_to_native_start_timestamp_ms) { icing::lib::IcingSearchEngine* icing = GetIcingSearchEnginePointer(env, object); @@ -263,9 +237,8 @@ Java_com_google_android_icing_IcingSearchEngineImpl_nativeGetNextPage( return SerializeProtoToJniByteArray(env, next_page_result_proto); } -JNIEXPORT void JNICALL -Java_com_google_android_icing_IcingSearchEngineImpl_nativeInvalidateNextPageToken( - JNIEnv* env, jclass clazz, jobject object, jlong next_page_token) { +void nativeInvalidateNextPageToken(JNIEnv* env, jclass clazz, jobject object, + jlong next_page_token) { icing::lib::IcingSearchEngine* icing = GetIcingSearchEnginePointer(env, object); @@ -274,11 +247,11 @@ Java_com_google_android_icing_IcingSearchEngineImpl_nativeInvalidateNextPageToke return; } -JNIEXPORT jbyteArray JNICALL -Java_com_google_android_icing_IcingSearchEngineImpl_nativeSearch( - JNIEnv* env, jclass clazz, jobject object, jbyteArray search_spec_bytes, - jbyteArray scoring_spec_bytes, jbyteArray result_spec_bytes, - jlong java_to_native_start_timestamp_ms) { +jbyteArray nativeSearch(JNIEnv* env, jclass clazz, jobject object, + jbyteArray search_spec_bytes, + jbyteArray scoring_spec_bytes, + jbyteArray result_spec_bytes, + jlong java_to_native_start_timestamp_ms) { icing::lib::IcingSearchEngine* icing = GetIcingSearchEnginePointer(env, object); @@ -321,10 +294,8 @@ Java_com_google_android_icing_IcingSearchEngineImpl_nativeSearch( return SerializeProtoToJniByteArray(env, search_result_proto); } -JNIEXPORT jbyteArray JNICALL 
-Java_com_google_android_icing_IcingSearchEngineImpl_nativeDelete( - JNIEnv* env, jclass clazz, jobject object, jstring name_space, - jstring uri) { +jbyteArray nativeDelete(JNIEnv* env, jclass clazz, jobject object, + jstring name_space, jstring uri) { icing::lib::IcingSearchEngine* icing = GetIcingSearchEnginePointer(env, object); @@ -336,9 +307,8 @@ Java_com_google_android_icing_IcingSearchEngineImpl_nativeDelete( return SerializeProtoToJniByteArray(env, delete_result_proto); } -JNIEXPORT jbyteArray JNICALL -Java_com_google_android_icing_IcingSearchEngineImpl_nativeDeleteByNamespace( - JNIEnv* env, jclass clazz, jobject object, jstring name_space) { +jbyteArray nativeDeleteByNamespace(JNIEnv* env, jclass clazz, jobject object, + jstring name_space) { icing::lib::IcingSearchEngine* icing = GetIcingSearchEnginePointer(env, object); @@ -349,9 +319,8 @@ Java_com_google_android_icing_IcingSearchEngineImpl_nativeDeleteByNamespace( return SerializeProtoToJniByteArray(env, delete_by_namespace_result_proto); } -JNIEXPORT jbyteArray JNICALL -Java_com_google_android_icing_IcingSearchEngineImpl_nativeDeleteBySchemaType( - JNIEnv* env, jclass clazz, jobject object, jstring schema_type) { +jbyteArray nativeDeleteBySchemaType(JNIEnv* env, jclass clazz, jobject object, + jstring schema_type) { icing::lib::IcingSearchEngine* icing = GetIcingSearchEnginePointer(env, object); @@ -362,10 +331,9 @@ Java_com_google_android_icing_IcingSearchEngineImpl_nativeDeleteBySchemaType( return SerializeProtoToJniByteArray(env, delete_by_schema_type_result_proto); } -JNIEXPORT jbyteArray JNICALL -Java_com_google_android_icing_IcingSearchEngineImpl_nativeDeleteByQuery( - JNIEnv* env, jclass clazz, jobject object, jbyteArray search_spec_bytes, - jboolean return_deleted_document_info) { +jbyteArray nativeDeleteByQuery(JNIEnv* env, jclass clazz, jobject object, + jbyteArray search_spec_bytes, + jboolean return_deleted_document_info) { icing::lib::IcingSearchEngine* icing = 
GetIcingSearchEnginePointer(env, object); @@ -381,9 +349,8 @@ Java_com_google_android_icing_IcingSearchEngineImpl_nativeDeleteByQuery( return SerializeProtoToJniByteArray(env, delete_result_proto); } -JNIEXPORT jbyteArray JNICALL -Java_com_google_android_icing_IcingSearchEngineImpl_nativePersistToDisk( - JNIEnv* env, jclass clazz, jobject object, jint persist_type_code) { +jbyteArray nativePersistToDisk(JNIEnv* env, jclass clazz, jobject object, + jint persist_type_code) { icing::lib::IcingSearchEngine* icing = GetIcingSearchEnginePointer(env, object); @@ -400,9 +367,7 @@ Java_com_google_android_icing_IcingSearchEngineImpl_nativePersistToDisk( return SerializeProtoToJniByteArray(env, persist_to_disk_result_proto); } -JNIEXPORT jbyteArray JNICALL -Java_com_google_android_icing_IcingSearchEngineImpl_nativeOptimize( - JNIEnv* env, jclass clazz, jobject object) { +jbyteArray nativeOptimize(JNIEnv* env, jclass clazz, jobject object) { icing::lib::IcingSearchEngine* icing = GetIcingSearchEnginePointer(env, object); @@ -411,9 +376,7 @@ Java_com_google_android_icing_IcingSearchEngineImpl_nativeOptimize( return SerializeProtoToJniByteArray(env, optimize_result_proto); } -JNIEXPORT jbyteArray JNICALL -Java_com_google_android_icing_IcingSearchEngineImpl_nativeGetOptimizeInfo( - JNIEnv* env, jclass clazz, jobject object) { +jbyteArray nativeGetOptimizeInfo(JNIEnv* env, jclass clazz, jobject object) { icing::lib::IcingSearchEngine* icing = GetIcingSearchEnginePointer(env, object); @@ -423,9 +386,7 @@ Java_com_google_android_icing_IcingSearchEngineImpl_nativeGetOptimizeInfo( return SerializeProtoToJniByteArray(env, get_optimize_info_result_proto); } -JNIEXPORT jbyteArray JNICALL -Java_com_google_android_icing_IcingSearchEngineImpl_nativeGetStorageInfo( - JNIEnv* env, jclass clazz, jobject object) { +jbyteArray nativeGetStorageInfo(JNIEnv* env, jclass clazz, jobject object) { icing::lib::IcingSearchEngine* icing = GetIcingSearchEnginePointer(env, object); @@ -435,9 +396,7 @@ 
Java_com_google_android_icing_IcingSearchEngineImpl_nativeGetStorageInfo( return SerializeProtoToJniByteArray(env, storage_info_result_proto); } -JNIEXPORT jbyteArray JNICALL -Java_com_google_android_icing_IcingSearchEngineImpl_nativeReset( - JNIEnv* env, jclass clazz, jobject object) { +jbyteArray nativeReset(JNIEnv* env, jclass clazz, jobject object) { icing::lib::IcingSearchEngine* icing = GetIcingSearchEnginePointer(env, object); @@ -446,10 +405,8 @@ Java_com_google_android_icing_IcingSearchEngineImpl_nativeReset( return SerializeProtoToJniByteArray(env, reset_result_proto); } -JNIEXPORT jbyteArray JNICALL -Java_com_google_android_icing_IcingSearchEngineImpl_nativeSearchSuggestions( - JNIEnv* env, jclass clazz, jobject object, - jbyteArray suggestion_spec_bytes) { +jbyteArray nativeSearchSuggestions(JNIEnv* env, jclass clazz, jobject object, + jbyteArray suggestion_spec_bytes) { icing::lib::IcingSearchEngine* icing = GetIcingSearchEnginePointer(env, object); @@ -466,9 +423,8 @@ Java_com_google_android_icing_IcingSearchEngineImpl_nativeSearchSuggestions( return SerializeProtoToJniByteArray(env, suggestionResponse); } -JNIEXPORT jbyteArray JNICALL -Java_com_google_android_icing_IcingSearchEngineImpl_nativeGetDebugInfo( - JNIEnv* env, jclass clazz, jobject object, jint verbosity) { +jbyteArray nativeGetDebugInfo(JNIEnv* env, jclass clazz, jobject object, + jint verbosity) { icing::lib::IcingSearchEngine* icing = GetIcingSearchEnginePointer(env, object); @@ -485,9 +441,8 @@ Java_com_google_android_icing_IcingSearchEngineImpl_nativeGetDebugInfo( return SerializeProtoToJniByteArray(env, debug_info_result_proto); } -JNIEXPORT jboolean JNICALL -Java_com_google_android_icing_IcingSearchEngineImpl_nativeShouldLog( - JNIEnv* env, jclass clazz, jshort severity, jshort verbosity) { +jboolean nativeShouldLog(JNIEnv* env, jclass clazz, jshort severity, + jshort verbosity) { if (!icing::lib::LogSeverity::Code_IsValid(severity)) { ICING_LOG(icing::lib::ERROR) << "Invalid value 
for logging severity: " << severity; @@ -497,9 +452,8 @@ Java_com_google_android_icing_IcingSearchEngineImpl_nativeShouldLog( static_cast(severity), verbosity); } -JNIEXPORT jboolean JNICALL -Java_com_google_android_icing_IcingSearchEngineImpl_nativeSetLoggingLevel( - JNIEnv* env, jclass clazz, jshort severity, jshort verbosity) { +jboolean nativeSetLoggingLevel(JNIEnv* env, jclass clazz, jshort severity, + jshort verbosity) { if (!icing::lib::LogSeverity::Code_IsValid(severity)) { ICING_LOG(icing::lib::ERROR) << "Invalid value for logging severity: " << severity; @@ -509,216 +463,111 @@ Java_com_google_android_icing_IcingSearchEngineImpl_nativeSetLoggingLevel( static_cast(severity), verbosity); } -JNIEXPORT jstring JNICALL -Java_com_google_android_icing_IcingSearchEngineImpl_nativeGetLoggingTag( - JNIEnv* env, jclass clazz) { +jstring nativeGetLoggingTag(JNIEnv* env, jclass clazz) { return env->NewStringUTF(icing::lib::kIcingLoggingTag); } -// TODO(b/240333360) Remove the methods below for IcingSearchEngine once we have -// a sync from Jetpack to g3 to contain the refactored IcingSearchEngine(with -// IcingSearchEngineImpl). 
-JNIEXPORT jlong JNICALL -Java_com_google_android_icing_IcingSearchEngine_nativeCreate( - JNIEnv* env, jclass clazz, jbyteArray icing_search_engine_options_bytes) { - return Java_com_google_android_icing_IcingSearchEngineImpl_nativeCreate( - env, clazz, icing_search_engine_options_bytes); -} - -JNIEXPORT void JNICALL -Java_com_google_android_icing_IcingSearchEngine_nativeDestroy(JNIEnv* env, - jclass clazz, - jobject object) { - Java_com_google_android_icing_IcingSearchEngineImpl_nativeDestroy(env, clazz, - object); -} - -JNIEXPORT jbyteArray JNICALL -Java_com_google_android_icing_IcingSearchEngine_nativeInitialize( - JNIEnv* env, jclass clazz, jobject object) { - return Java_com_google_android_icing_IcingSearchEngineImpl_nativeInitialize( - env, clazz, object); -} - -JNIEXPORT jbyteArray JNICALL -Java_com_google_android_icing_IcingSearchEngine_nativeSetSchema( - JNIEnv* env, jclass clazz, jobject object, jbyteArray schema_bytes, - jboolean ignore_errors_and_delete_documents) { - return Java_com_google_android_icing_IcingSearchEngineImpl_nativeSetSchema( - env, clazz, object, schema_bytes, ignore_errors_and_delete_documents); -} - -JNIEXPORT jbyteArray JNICALL -Java_com_google_android_icing_IcingSearchEngine_nativeGetSchema( - JNIEnv* env, jclass clazz, jobject object) { - return Java_com_google_android_icing_IcingSearchEngineImpl_nativeGetSchema( - env, clazz, object); -} - -JNIEXPORT jbyteArray JNICALL -Java_com_google_android_icing_IcingSearchEngine_nativeGetSchemaType( - JNIEnv* env, jclass clazz, jobject object, jstring schema_type) { - return Java_com_google_android_icing_IcingSearchEngineImpl_nativeGetSchemaType( - env, clazz, object, schema_type); -} - -JNIEXPORT jbyteArray JNICALL -Java_com_google_android_icing_IcingSearchEngine_nativePut( - JNIEnv* env, jclass clazz, jobject object, jbyteArray document_bytes) { - return Java_com_google_android_icing_IcingSearchEngineImpl_nativePut( - env, clazz, object, document_bytes); -} - -JNIEXPORT jbyteArray JNICALL 
-Java_com_google_android_icing_IcingSearchEngine_nativeGet( - JNIEnv* env, jclass clazz, jobject object, jstring name_space, jstring uri, - jbyteArray result_spec_bytes) { - return Java_com_google_android_icing_IcingSearchEngineImpl_nativeGet( - env, clazz, object, name_space, uri, result_spec_bytes); -} - -JNIEXPORT jbyteArray JNICALL -Java_com_google_android_icing_IcingSearchEngine_nativeReportUsage( - JNIEnv* env, jclass clazz, jobject object, jbyteArray usage_report_bytes) { - return Java_com_google_android_icing_IcingSearchEngineImpl_nativeReportUsage( - env, clazz, object, usage_report_bytes); -} - -JNIEXPORT jbyteArray JNICALL -Java_com_google_android_icing_IcingSearchEngine_nativeGetAllNamespaces( - JNIEnv* env, jclass clazz, jobject object) { - return Java_com_google_android_icing_IcingSearchEngineImpl_nativeGetAllNamespaces( - env, clazz, object); -} - -JNIEXPORT jbyteArray JNICALL -Java_com_google_android_icing_IcingSearchEngine_nativeGetNextPage( - JNIEnv* env, jclass clazz, jobject object, jlong next_page_token, - jlong java_to_native_start_timestamp_ms) { - return Java_com_google_android_icing_IcingSearchEngineImpl_nativeGetNextPage( - env, clazz, object, next_page_token, java_to_native_start_timestamp_ms); -} - -JNIEXPORT void JNICALL -Java_com_google_android_icing_IcingSearchEngine_nativeInvalidateNextPageToken( - JNIEnv* env, jclass clazz, jobject object, jlong next_page_token) { - Java_com_google_android_icing_IcingSearchEngineImpl_nativeInvalidateNextPageToken( - env, clazz, object, next_page_token); -} - -JNIEXPORT jbyteArray JNICALL -Java_com_google_android_icing_IcingSearchEngine_nativeSearch( - JNIEnv* env, jclass clazz, jobject object, jbyteArray search_spec_bytes, - jbyteArray scoring_spec_bytes, jbyteArray result_spec_bytes, - jlong java_to_native_start_timestamp_ms) { - return Java_com_google_android_icing_IcingSearchEngineImpl_nativeSearch( - env, clazz, object, search_spec_bytes, scoring_spec_bytes, - result_spec_bytes, 
java_to_native_start_timestamp_ms); -} - -JNIEXPORT jbyteArray JNICALL -Java_com_google_android_icing_IcingSearchEngine_nativeDelete(JNIEnv* env, - jclass clazz, - jobject object, - jstring name_space, - jstring uri) { - return Java_com_google_android_icing_IcingSearchEngineImpl_nativeDelete( - env, clazz, object, name_space, uri); -} - -JNIEXPORT jbyteArray JNICALL -Java_com_google_android_icing_IcingSearchEngine_nativeDeleteByNamespace( - JNIEnv* env, jclass clazz, jobject object, jstring name_space) { - return Java_com_google_android_icing_IcingSearchEngineImpl_nativeDeleteByNamespace( - env, clazz, object, name_space); -} - -JNIEXPORT jbyteArray JNICALL -Java_com_google_android_icing_IcingSearchEngine_nativeDeleteBySchemaType( - JNIEnv* env, jclass clazz, jobject object, jstring schema_type) { - return Java_com_google_android_icing_IcingSearchEngineImpl_nativeDeleteBySchemaType( - env, clazz, object, schema_type); -} - -JNIEXPORT jbyteArray JNICALL -Java_com_google_android_icing_IcingSearchEngine_nativeDeleteByQuery( - JNIEnv* env, jclass clazz, jobject object, jbyteArray search_spec_bytes, - jboolean return_deleted_document_info) { - return Java_com_google_android_icing_IcingSearchEngineImpl_nativeDeleteByQuery( - env, clazz, object, search_spec_bytes, return_deleted_document_info); -} - -JNIEXPORT jbyteArray JNICALL -Java_com_google_android_icing_IcingSearchEngine_nativePersistToDisk( - JNIEnv* env, jclass clazz, jobject object, jint persist_type_code) { - return Java_com_google_android_icing_IcingSearchEngineImpl_nativePersistToDisk( - env, clazz, object, persist_type_code); -} - -JNIEXPORT jbyteArray JNICALL -Java_com_google_android_icing_IcingSearchEngine_nativeOptimize(JNIEnv* env, - jclass clazz, - jobject object) { - return Java_com_google_android_icing_IcingSearchEngineImpl_nativeOptimize( - env, clazz, object); -} - -JNIEXPORT jbyteArray JNICALL -Java_com_google_android_icing_IcingSearchEngine_nativeGetOptimizeInfo( - JNIEnv* env, jclass clazz, 
jobject object) { - return Java_com_google_android_icing_IcingSearchEngineImpl_nativeGetOptimizeInfo( - env, clazz, object); -} - -JNIEXPORT jbyteArray JNICALL -Java_com_google_android_icing_IcingSearchEngine_nativeGetStorageInfo( - JNIEnv* env, jclass clazz, jobject object) { - return Java_com_google_android_icing_IcingSearchEngineImpl_nativeGetStorageInfo( - env, clazz, object); -} - -JNIEXPORT jbyteArray JNICALL -Java_com_google_android_icing_IcingSearchEngine_nativeReset(JNIEnv* env, - jclass clazz, - jobject object) { - return Java_com_google_android_icing_IcingSearchEngineImpl_nativeReset( - env, clazz, object); -} - -JNIEXPORT jbyteArray JNICALL -Java_com_google_android_icing_IcingSearchEngine_nativeSearchSuggestions( - JNIEnv* env, jclass clazz, jobject object, - jbyteArray suggestion_spec_bytes) { - return Java_com_google_android_icing_IcingSearchEngineImpl_nativeSearchSuggestions( - env, clazz, object, suggestion_spec_bytes); -} - -JNIEXPORT jbyteArray JNICALL -Java_com_google_android_icing_IcingSearchEngine_nativeGetDebugInfo( - JNIEnv* env, jclass clazz, jobject object, jint verbosity) { - return Java_com_google_android_icing_IcingSearchEngineImpl_nativeGetDebugInfo( - env, clazz, object, verbosity); -} - -JNIEXPORT jboolean JNICALL -Java_com_google_android_icing_IcingSearchEngine_nativeShouldLog( - JNIEnv* env, jclass clazz, jshort severity, jshort verbosity) { - return Java_com_google_android_icing_IcingSearchEngineImpl_nativeShouldLog( - env, clazz, severity, verbosity); -} +#pragma clang diagnostic ignored "-Wwrite-strings" +jint JNI_OnLoad(JavaVM* vm, void* reserved) { + JNIEnv* env; + if (vm->GetEnv(reinterpret_cast(&env), JNI_VERSION_1_6) != JNI_OK) { + ICING_LOG(icing::lib::ERROR) << "ERROR: GetEnv failed"; + return JNI_ERR; + } -JNIEXPORT jboolean JNICALL -Java_com_google_android_icing_IcingSearchEngine_nativeSetLoggingLevel( - JNIEnv* env, jclass clazz, jshort severity, jshort verbosity) { - return 
Java_com_google_android_icing_IcingSearchEngineImpl_nativeSetLoggingLevel( - env, clazz, severity, verbosity); -} + // Find your class. JNI_OnLoad is called from the correct class loader context + // for this to work. + jclass java_class = + env->FindClass("com/google/android/icing/IcingSearchEngineImpl"); + if (java_class == nullptr) { + return JNI_ERR; + } + JavaIcingSearchEngineImpl.native_pointer = + env->GetFieldID(java_class, "nativePointer", "J"); + + // Register your class' native methods. + static const JNINativeMethod methods[] = { + {"nativeCreate", "([B)J", reinterpret_cast(nativeCreate)}, + {"nativeDestroy", "(Lcom/google/android/icing/IcingSearchEngineImpl;)V", + reinterpret_cast(nativeDestroy)}, + {"nativeInitialize", + "(Lcom/google/android/icing/IcingSearchEngineImpl;)[B", + reinterpret_cast(nativeInitialize)}, + {"nativeSetSchema", + "(Lcom/google/android/icing/IcingSearchEngineImpl;[BZ)[B", + reinterpret_cast(nativeSetSchema)}, + {"nativeGetSchema", + "(Lcom/google/android/icing/IcingSearchEngineImpl;)[B", + reinterpret_cast(nativeGetSchema)}, + {"nativeGetSchemaType", + "(Lcom/google/android/icing/IcingSearchEngineImpl;Ljava/lang/String;)[B", + reinterpret_cast(nativeGetSchemaType)}, + {"nativePut", "(Lcom/google/android/icing/IcingSearchEngineImpl;[B)[B", + reinterpret_cast(nativePut)}, + {"nativeGet", + "(Lcom/google/android/icing/IcingSearchEngineImpl;Ljava/lang/" + "String;Ljava/lang/String;[B)[B", + reinterpret_cast(nativeGet)}, + {"nativeReportUsage", + "(Lcom/google/android/icing/IcingSearchEngineImpl;[B)[B", + reinterpret_cast(nativeReportUsage)}, + {"nativeGetAllNamespaces", + "(Lcom/google/android/icing/IcingSearchEngineImpl;)[B", + reinterpret_cast(nativeGetAllNamespaces)}, + {"nativeGetNextPage", + "(Lcom/google/android/icing/IcingSearchEngineImpl;JJ)[B", + reinterpret_cast(nativeGetNextPage)}, + {"nativeInvalidateNextPageToken", + "(Lcom/google/android/icing/IcingSearchEngineImpl;J)V", + 
reinterpret_cast(nativeInvalidateNextPageToken)}, + {"nativeSearch", + "(Lcom/google/android/icing/IcingSearchEngineImpl;[B[B[BJ)[B", + reinterpret_cast(nativeSearch)}, + {"nativeDelete", + "(Lcom/google/android/icing/IcingSearchEngineImpl;Ljava/lang/" + "String;Ljava/lang/String;)[B", + reinterpret_cast(nativeDelete)}, + {"nativeDeleteByNamespace", + "(Lcom/google/android/icing/IcingSearchEngineImpl;Ljava/lang/String;)[B", + reinterpret_cast(nativeDeleteByNamespace)}, + {"nativeDeleteBySchemaType", + "(Lcom/google/android/icing/IcingSearchEngineImpl;Ljava/lang/String;)[B", + reinterpret_cast(nativeDeleteBySchemaType)}, + {"nativeDeleteByQuery", + "(Lcom/google/android/icing/IcingSearchEngineImpl;[BZ)[B", + reinterpret_cast(nativeDeleteByQuery)}, + {"nativePersistToDisk", + "(Lcom/google/android/icing/IcingSearchEngineImpl;I)[B", + reinterpret_cast(nativePersistToDisk)}, + {"nativeOptimize", "(Lcom/google/android/icing/IcingSearchEngineImpl;)[B", + reinterpret_cast(nativeOptimize)}, + {"nativeGetOptimizeInfo", + "(Lcom/google/android/icing/IcingSearchEngineImpl;)[B", + reinterpret_cast(nativeGetOptimizeInfo)}, + {"nativeGetStorageInfo", + "(Lcom/google/android/icing/IcingSearchEngineImpl;)[B", + reinterpret_cast(nativeGetStorageInfo)}, + {"nativeReset", "(Lcom/google/android/icing/IcingSearchEngineImpl;)[B", + reinterpret_cast(nativeReset)}, + {"nativeSearchSuggestions", + "(Lcom/google/android/icing/IcingSearchEngineImpl;[B)[B", + reinterpret_cast(nativeSearchSuggestions)}, + {"nativeGetDebugInfo", + "(Lcom/google/android/icing/IcingSearchEngineImpl;I)[B", + reinterpret_cast(nativeGetDebugInfo)}, + {"nativeShouldLog", "(SS)Z", reinterpret_cast(nativeShouldLog)}, + {"nativeSetLoggingLevel", "(SS)Z", + reinterpret_cast(nativeSetLoggingLevel)}, + {"nativeGetLoggingTag", "()Ljava/lang/String;", + reinterpret_cast(nativeGetLoggingTag)}, + }; + int register_natives_success = env->RegisterNatives( + java_class, methods, sizeof(methods) / sizeof(JNINativeMethod)); + if 
(register_natives_success != JNI_OK) { + return register_natives_success; + } -JNIEXPORT jstring JNICALL -Java_com_google_android_icing_IcingSearchEngine_nativeGetLoggingTag( - JNIEnv* env, jclass clazz) { - return Java_com_google_android_icing_IcingSearchEngineImpl_nativeGetLoggingTag( - env, clazz); + return JNI_VERSION_1_6; } } // extern "C" diff --git a/synced_AOSP_CL_number.txt b/synced_AOSP_CL_number.txt index 3ec2c2d..e7bee73 100644 --- a/synced_AOSP_CL_number.txt +++ b/synced_AOSP_CL_number.txt @@ -1 +1 @@ -set(synced_AOSP_CL_number=553570527) +set(synced_AOSP_CL_number=559533396) -- cgit v1.2.3 From ec9c4f473d9b5b6d316405f5057eeeddbaa27ff5 Mon Sep 17 00:00:00 2001 From: Aurimas Liutikas Date: Wed, 23 Aug 2023 15:32:13 -0700 Subject: Move icing to be a simple java project Since icing is only bundled inside of appsearch:appsearch-local-storage there is no need to use an android project as at the end of the day we end up only including the jar of icing anyway. Moving to a java project simplifies is as now we no longer need to add special export configurations. We are also moving the dropping of *.proto files to the projects that bundle the libraries. This is a step towards making it easier to resolve what artifact coordinates were actually bundled inside. Test: ./gradlew appsearch:appsearch-local-storage:publish comparing out/androidx/build/support_repo/androidx/appsearch/appsearch-local-storage/1.1.0-alpha04/appsearch-local-storage-1.1.0-alpha04.aar before and after the change, libs/repackaged.jar is identical Bug: 296864329 Change-Id: Ibba769f71ecdb58cb20df0ea0c37364cf579eddf --- build.gradle | 79 ++++++---------------- .../google/android/icing/IcingSearchEngine.java | 1 + .../android/icing/IcingSearchEngineImpl.java | 1 + 3 files changed, 22 insertions(+), 59 deletions(-) diff --git a/build.gradle b/build.gradle index dde0637..d0d1a39 100644 --- a/build.gradle +++ b/build.gradle @@ -14,57 +14,42 @@ * limitations under the License. 
*/ +import androidx.build.SdkHelperKt + plugins { - id('AndroidXPlugin') - id('com.android.library') - id('com.google.protobuf') + id("AndroidXPlugin") + id("java-library") + id("com.google.protobuf") } -android { - compileOptions { - sourceCompatibility = JavaVersion.VERSION_1_8 - targetCompatibility = JavaVersion.VERSION_1_8 - } - sourceSets { - main { - java.srcDir 'java/src/' - proto.srcDir 'proto/' - } - // TODO(b/161205849): Re-enable this test once icing nativeLib is no longer being built - // inside appsearch:appsearch. - //androidTest.java.srcDir 'java/tests/instrumentation/' +sourceSets { + main { + java.srcDir 'java/src/' + proto.srcDir 'proto/' } - namespace "com.google.android.icing" -} - -// This project has no device tests, skip building it -androidComponents { - beforeVariants(selector().withName("debug"), { variantBuilder -> - variantBuilder.enableAndroidTest = false - }) } dependencies { - api('androidx.annotation:annotation:1.1.0') - - implementation('com.google.protobuf:protobuf-javalite:3.10.0') + compileOnly("androidx.annotation:annotation:1.1.0") + compileOnly(SdkHelperKt.getSdkDependency(project)) + implementation(libs.protobufLite) +} - androidTestImplementation(libs.testCore) - androidTestImplementation(libs.testRules) - androidTestImplementation(libs.truth) - androidTestImplementation(libs.kotlinBom) +afterEvaluate { + lint { + lintOptions { + // protobuf generates unannotated methods + disable("UnknownNullness") + } + } } protobuf { protoc { artifact = libs.protobufCompiler.get() } - generateProtoTasks { all().each { task -> - project.tasks.named("extractReleaseAnnotations").configure { - it.dependsOn(task) - } task.builtins { java { option 'lite' @@ -74,30 +59,6 @@ protobuf { } } -// Create export artifact for all variants (debug/release) for JarJaring -android.libraryVariants.all { variant -> - def variantName = variant.name - def suffix = variantName.capitalize() - def exportJarTask = tasks.register("exportJar${suffix}", Jar) { - 
archiveBaseName.set("icing-${variantName}") - - // The proto-lite dependency includes .proto files, which are not used by icing. When apps - // depend on appsearch as well as proto-lite directly, these files conflict since jarjar - // only renames the java classes. Remove them here since they are unused. - // Expand the jar and remove any .proto files. - from(zipTree(configurations.detachedConfiguration( - dependencies.create(libs.protobufLite.get())).getSingleFile())) { - exclude("**/*.proto") - } - - from files(variant.javaCompileProvider.get().destinationDir) - dependsOn variant.javaCompileProvider.get() - } - - def exportConfiguration = configurations.register("export${suffix}") - artifacts.add(exportConfiguration.name, exportJarTask.flatMap { it.archiveFile }) -} - androidx { mavenVersion = LibraryVersions.APPSEARCH } diff --git a/java/src/com/google/android/icing/IcingSearchEngine.java b/java/src/com/google/android/icing/IcingSearchEngine.java index 47b94a5..79fcdb8 100644 --- a/java/src/com/google/android/icing/IcingSearchEngine.java +++ b/java/src/com/google/android/icing/IcingSearchEngine.java @@ -77,6 +77,7 @@ public class IcingSearchEngine implements IcingSearchEngineInterface { icingSearchEngineImpl.close(); } + @SuppressWarnings("deprecation") @Override protected void finalize() throws Throwable { icingSearchEngineImpl.close(); diff --git a/java/src/com/google/android/icing/IcingSearchEngineImpl.java b/java/src/com/google/android/icing/IcingSearchEngineImpl.java index 8e79a88..57744c4 100644 --- a/java/src/com/google/android/icing/IcingSearchEngineImpl.java +++ b/java/src/com/google/android/icing/IcingSearchEngineImpl.java @@ -71,6 +71,7 @@ public class IcingSearchEngineImpl implements Closeable { closed = true; } + @SuppressWarnings("deprecation") @Override protected void finalize() throws Throwable { close(); -- cgit v1.2.3 From 6293cd0a843b1e5b09086d9010ec863556d0c1ba Mon Sep 17 00:00:00 2001 From: Grace Zhao Date: Wed, 30 Aug 2023 23:54:01 -0700 
Subject: Update Icing from upstream. Descriptions: ======================================================================== [fixit] Remove incorrect reserve usage for STL data structure ======================================================================== Move LiteIndex::SortHits() from the query path to the indexing path ======================================================================== Bug: 286418010 Bug: 297549761 Bug: 248295995 Change-Id: I46168178e0227451b0f7dbf394bb6cc5db94e1e8 --- icing/icing-search-engine.cc | 8 +- icing/icing-search-engine_initialization_test.cc | 44 +- icing/index/index-processor_benchmark.cc | 4 +- icing/index/index-processor_test.cc | 12 +- icing/index/index.cc | 21 +- icing/index/index.h | 30 +- icing/index/index_test.cc | 66 ++- icing/index/lite/lite-index-options.cc | 10 +- icing/index/lite/lite-index-options.h | 6 +- icing/index/lite/lite-index.cc | 269 +++++---- icing/index/lite/lite-index.h | 83 ++- icing/index/lite/lite-index_test.cc | 641 ++++++++++++++++++++- icing/index/lite/lite-index_thread-safety_test.cc | 7 +- icing/index/lite/term-id-hit-pair.h | 2 + icing/index/main/main-index-merger_test.cc | 4 +- icing/index/main/main-index_test.cc | 12 +- icing/index/string-section-indexing-handler.cc | 11 + .../index/string-section-indexing-handler_test.cc | 587 +++++++++++++++++++ .../advanced_query_parser/query-visitor_test.cc | 4 +- icing/query/query-processor_benchmark.cc | 4 +- icing/query/query-processor_test.cc | 4 +- icing/query/suggestion-processor_test.cc | 4 +- icing/schema/schema-util.cc | 5 - proto/icing/proto/initialize.proto | 22 +- proto/icing/proto/logging.proto | 7 +- synced_AOSP_CL_number.txt | 2 +- 26 files changed, 1661 insertions(+), 208 deletions(-) create mode 100644 icing/index/string-section-indexing-handler_test.cc diff --git a/icing/icing-search-engine.cc b/icing/icing-search-engine.cc index 467c943..6680dae 100644 --- a/icing/icing-search-engine.cc +++ b/icing/icing-search-engine.cc @@ -654,7 
+654,9 @@ libtextclassifier3::Status IcingSearchEngine::InitializeMembers( // We're going to need to build the index from scratch. So just delete its // directory now. // Discard index directory and instantiate a new one. - Index::Options index_options(index_dir, options_.index_merge_size()); + Index::Options index_options(index_dir, options_.index_merge_size(), + options_.lite_index_sort_at_indexing(), + options_.lite_index_sort_size()); if (!filesystem_->DeleteDirectoryRecursively(index_dir.c_str()) || !filesystem_->CreateDirectoryRecursively(index_dir.c_str())) { return absl_ports::InternalError( @@ -798,7 +800,9 @@ libtextclassifier3::Status IcingSearchEngine::InitializeIndex( return absl_ports::InternalError( absl_ports::StrCat("Could not create directory: ", index_dir)); } - Index::Options index_options(index_dir, options_.index_merge_size()); + Index::Options index_options(index_dir, options_.index_merge_size(), + options_.lite_index_sort_at_indexing(), + options_.lite_index_sort_size()); // Term index InitializeStatsProto::RecoveryCause index_recovery_cause; diff --git a/icing/icing-search-engine_initialization_test.cc b/icing/icing-search-engine_initialization_test.cc index d6c8de8..b4853b4 100644 --- a/icing/icing-search-engine_initialization_test.cc +++ b/icing/icing-search-engine_initialization_test.cc @@ -1853,7 +1853,9 @@ TEST_F(IcingSearchEngineInitializationTest, RestoreIndexLoseTermIndex) { ICING_ASSERT_OK_AND_ASSIGN( std::unique_ptr index, Index::Create(Index::Options(GetIndexDir(), - /*index_merge_size=*/100), + /*index_merge_size=*/100, + /*lite_index_sort_at_indexing=*/true, + /*lite_index_sort_size=*/50), filesystem(), icing_filesystem())); ICING_ASSERT_OK(index->PersistToDisk()); } @@ -2463,7 +2465,9 @@ TEST_F(IcingSearchEngineInitializationTest, std::unique_ptr index, Index::Create( Index::Options(GetIndexDir(), - /*index_merge_size=*/message.ByteSizeLong()), + /*index_merge_size=*/message.ByteSizeLong(), + 
/*lite_index_sort_at_indexing=*/true, + /*lite_index_sort_size=*/8), filesystem(), icing_filesystem())); DocumentId original_last_added_doc_id = index->last_added_document_id(); index->set_last_added_document_id(original_last_added_doc_id + 1); @@ -2595,7 +2599,9 @@ TEST_F(IcingSearchEngineInitializationTest, std::unique_ptr index, Index::Create( Index::Options(GetIndexDir(), - /*index_merge_size=*/message.ByteSizeLong()), + /*index_merge_size=*/message.ByteSizeLong(), + /*lite_index_sort_at_indexing=*/true, + /*lite_index_sort_size=*/8), filesystem(), icing_filesystem())); ICING_ASSERT_OK_AND_ASSIGN( std::unique_ptr doc_hit_info_iter, @@ -2707,7 +2713,9 @@ TEST_F(IcingSearchEngineInitializationTest, std::unique_ptr index, Index::Create( Index::Options(GetIndexDir(), - /*index_merge_size=*/message.ByteSizeLong()), + /*index_merge_size=*/message.ByteSizeLong(), + /*lite_index_sort_at_indexing=*/true, + /*lite_index_sort_size=*/8), filesystem(), icing_filesystem())); DocumentId original_last_added_doc_id = index->last_added_document_id(); index->set_last_added_document_id(original_last_added_doc_id + 1); @@ -2844,7 +2852,9 @@ TEST_F(IcingSearchEngineInitializationTest, std::unique_ptr index, Index::Create( Index::Options(GetIndexDir(), - /*index_merge_size=*/message.ByteSizeLong()), + /*index_merge_size=*/message.ByteSizeLong(), + /*lite_index_sort_at_indexing=*/true, + /*lite_index_sort_size=*/8), filesystem(), icing_filesystem())); ICING_ASSERT_OK_AND_ASSIGN( std::unique_ptr doc_hit_info_iter, @@ -2904,7 +2914,9 @@ TEST_F(IcingSearchEngineInitializationTest, Index::Create( // index merge size is not important here because we will manually // invoke merge below. - Index::Options(GetIndexDir(), /*index_merge_size=*/100), + Index::Options(GetIndexDir(), /*index_merge_size=*/100, + /*lite_index_sort_at_indexing=*/true, + /*lite_index_sort_size=*/50), filesystem(), icing_filesystem())); // Add hits for document 0 and merge. 
ASSERT_THAT(index->last_added_document_id(), kInvalidDocumentId); @@ -2980,7 +2992,9 @@ TEST_F(IcingSearchEngineInitializationTest, { ICING_ASSERT_OK_AND_ASSIGN( std::unique_ptr index, - Index::Create(Index::Options(GetIndexDir(), /*index_merge_size=*/100), + Index::Create(Index::Options(GetIndexDir(), /*index_merge_size=*/100, + /*lite_index_sort_at_indexing=*/true, + /*lite_index_sort_size=*/50), filesystem(), icing_filesystem())); ICING_ASSERT_OK_AND_ASSIGN( std::unique_ptr doc_hit_info_iter, @@ -3096,7 +3110,9 @@ TEST_F(IcingSearchEngineInitializationTest, std::unique_ptr index, Index::Create( Index::Options(GetIndexDir(), - /*index_merge_size=*/message.ByteSizeLong()), + /*index_merge_size=*/message.ByteSizeLong(), + /*lite_index_sort_at_indexing=*/true, + /*lite_index_sort_size=*/8), filesystem(), icing_filesystem())); // Add hits for document 4 and merge. DocumentId original_last_added_doc_id = index->last_added_document_id(); @@ -3239,7 +3255,9 @@ TEST_F(IcingSearchEngineInitializationTest, { ICING_ASSERT_OK_AND_ASSIGN( std::unique_ptr index, - Index::Create(Index::Options(GetIndexDir(), /*index_merge_size=*/100), + Index::Create(Index::Options(GetIndexDir(), /*index_merge_size=*/100, + /*lite_index_sort_at_indexing=*/true, + /*lite_index_sort_size=*/50), filesystem(), icing_filesystem())); ICING_ASSERT_OK_AND_ASSIGN( std::unique_ptr doc_hit_info_iter, @@ -4412,7 +4430,9 @@ TEST_F(IcingSearchEngineInitializationTest, ICING_ASSERT_OK_AND_ASSIGN( std::unique_ptr index, Index::Create(Index::Options(GetIndexDir(), - /*index_merge_size=*/100), + /*index_merge_size=*/100, + /*lite_index_sort_at_indexing=*/true, + /*lite_index_sort_size=*/50), filesystem(), icing_filesystem())); ICING_ASSERT_OK(index->PersistToDisk()); } @@ -5241,7 +5261,9 @@ TEST_P(IcingSearchEngineInitializationVersionChangeTest, ICING_ASSERT_OK_AND_ASSIGN(DocumentId doc_id, document_store->Put(message)); // Index doc_id with incorrect data - Index::Options options(GetIndexDir(), 
/*index_merge_size=*/1024 * 1024); + Index::Options options(GetIndexDir(), /*index_merge_size=*/1024 * 1024, + /*lite_index_sort_at_indexing=*/true, + /*lite_index_sort_size=*/1024 * 8); ICING_ASSERT_OK_AND_ASSIGN( std::unique_ptr index, Index::Create(options, filesystem(), icing_filesystem())); diff --git a/icing/index/index-processor_benchmark.cc b/icing/index/index-processor_benchmark.cc index 9fee925..8766f0b 100644 --- a/icing/index/index-processor_benchmark.cc +++ b/icing/index/index-processor_benchmark.cc @@ -150,7 +150,9 @@ DocumentProto CreateDocumentWithHiragana(int content_length) { std::unique_ptr CreateIndex(const IcingFilesystem& icing_filesystem, const Filesystem& filesystem, const std::string& index_dir) { - Index::Options options(index_dir, /*index_merge_size=*/1024 * 1024 * 10); + Index::Options options(index_dir, /*index_merge_size=*/1024 * 1024 * 10, + /*lite_index_sort_at_indexing=*/true, + /*lite_index_sort_size=*/1024 * 8); return Index::Create(options, &filesystem, &icing_filesystem).ValueOrDie(); } diff --git a/icing/index/index-processor_test.cc b/icing/index/index-processor_test.cc index c2eaf62..ba4ece3 100644 --- a/icing/index/index-processor_test.cc +++ b/icing/index/index-processor_test.cc @@ -167,7 +167,9 @@ class IndexProcessorTest : public Test { schema_store_dir_ = base_dir_ + "/schema_store"; doc_store_dir_ = base_dir_ + "/doc_store"; - Index::Options options(index_dir_, /*index_merge_size=*/1024 * 1024); + Index::Options options(index_dir_, /*index_merge_size=*/1024 * 1024, + /*lite_index_sort_at_indexing=*/true, + /*lite_index_sort_size=*/1024 * 8); ICING_ASSERT_OK_AND_ASSIGN( index_, Index::Create(options, &filesystem_, &icing_filesystem_)); @@ -970,7 +972,9 @@ TEST_F(IndexProcessorTest, IndexingDocAutomaticMerge) { TokenizedDocument::Create(schema_store_.get(), lang_segmenter_.get(), document)); Index::Options options(index_dir_, - /*index_merge_size=*/document.ByteSizeLong() * 100); + 
/*index_merge_size=*/document.ByteSizeLong() * 100, + /*lite_index_sort_at_indexing=*/true, + /*lite_index_sort_size=*/64); ICING_ASSERT_OK_AND_ASSIGN( index_, Index::Create(options, &filesystem_, &icing_filesystem_)); @@ -1033,7 +1037,9 @@ TEST_F(IndexProcessorTest, IndexingDocMergeFailureResets) { // 2. Recreate the index with the mock filesystem and a merge size that will // only allow one document to be added before requiring a merge. Index::Options options(index_dir_, - /*index_merge_size=*/document.ByteSizeLong()); + /*index_merge_size=*/document.ByteSizeLong(), + /*lite_index_sort_at_indexing=*/true, + /*lite_index_sort_size=*/16); ICING_ASSERT_OK_AND_ASSIGN( index_, Index::Create(options, &filesystem_, mock_icing_filesystem_.get())); diff --git a/icing/index/index.cc b/icing/index/index.cc index 19edbb6..31dcc7e 100644 --- a/icing/index/index.cc +++ b/icing/index/index.cc @@ -14,31 +14,38 @@ #include "icing/index/index.h" +#include +#include #include #include #include #include +#include #include "icing/text_classifier/lib3/utils/base/status.h" #include "icing/text_classifier/lib3/utils/base/statusor.h" #include "icing/absl_ports/canonical_errors.h" #include "icing/absl_ports/str_cat.h" +#include "icing/file/filesystem.h" #include "icing/index/hit/hit.h" #include "icing/index/iterator/doc-hit-info-iterator-or.h" #include "icing/index/iterator/doc-hit-info-iterator.h" #include "icing/index/lite/doc-hit-info-iterator-term-lite.h" #include "icing/index/lite/lite-index.h" #include "icing/index/main/doc-hit-info-iterator-term-main.h" +#include "icing/index/main/main-index.h" #include "icing/index/term-id-codec.h" -#include "icing/index/term-property-id.h" +#include "icing/index/term-metadata.h" #include "icing/legacy/core/icing-string-util.h" #include "icing/legacy/index/icing-dynamic-trie.h" #include "icing/legacy/index/icing-filesystem.h" +#include "icing/proto/scoring.pb.h" #include "icing/proto/storage.pb.h" #include "icing/proto/term.pb.h" #include 
"icing/schema/section.h" #include "icing/scoring/ranker.h" #include "icing/store/document-id.h" +#include "icing/store/suggestion-result-checker.h" #include "icing/util/logging.h" #include "icing/util/status-macros.h" @@ -59,7 +66,9 @@ libtextclassifier3::StatusOr CreateLiteIndexOptions( options.index_merge_size)); } return LiteIndex::Options(options.base_dir + "/idx/lite.", - options.index_merge_size); + options.index_merge_size, + options.lite_index_sort_at_indexing, + options.lite_index_sort_size); } std::string MakeMainIndexFilepath(const std::string& base_dir) { @@ -151,9 +160,17 @@ libtextclassifier3::StatusOr> Index::Create( IcingDynamicTrie::max_value_index(GetMainLexiconOptions()), IcingDynamicTrie::max_value_index( lite_index_options.lexicon_options))); + ICING_ASSIGN_OR_RETURN( std::unique_ptr lite_index, LiteIndex::Create(lite_index_options, icing_filesystem)); + // Sort the lite index if we've enabled sorting the HitBuffer at indexing + // time, and there's an unsorted tail exceeding the threshold. 
+ if (options.lite_index_sort_at_indexing && + lite_index->HasUnsortedHitsExceedingSortThreshold()) { + lite_index->SortHits(); + } + ICING_ASSIGN_OR_RETURN( std::unique_ptr main_index, MainIndex::Create(MakeMainIndexFilepath(options.base_dir), filesystem, diff --git a/icing/index/index.h b/icing/index/index.h index c170278..32ea97b 100644 --- a/icing/index/index.h +++ b/icing/index/index.h @@ -18,8 +18,9 @@ #include #include #include -#include +#include #include +#include #include "icing/text_classifier/lib3/utils/base/status.h" #include "icing/text_classifier/lib3/utils/base/statusor.h" @@ -27,6 +28,7 @@ #include "icing/index/hit/hit.h" #include "icing/index/iterator/doc-hit-info-iterator.h" #include "icing/index/lite/lite-index.h" +#include "icing/index/lite/term-id-hit-pair.h" #include "icing/index/main/main-index-merger.h" #include "icing/index/main/main-index.h" #include "icing/index/term-id-codec.h" @@ -40,7 +42,7 @@ #include "icing/store/document-id.h" #include "icing/store/namespace-id.h" #include "icing/store/suggestion-result-checker.h" -#include "icing/util/crc32.h" +#include "icing/util/status-macros.h" namespace icing { namespace lib { @@ -68,11 +70,18 @@ namespace lib { class Index { public: struct Options { - explicit Options(const std::string& base_dir, uint32_t index_merge_size) - : base_dir(base_dir), index_merge_size(index_merge_size) {} + explicit Options(const std::string& base_dir, uint32_t index_merge_size, + bool lite_index_sort_at_indexing, + uint32_t lite_index_sort_size) + : base_dir(base_dir), + index_merge_size(index_merge_size), + lite_index_sort_at_indexing(lite_index_sort_at_indexing), + lite_index_sort_size(lite_index_sort_size) {} std::string base_dir; int32_t index_merge_size; + bool lite_index_sort_at_indexing; + int32_t lite_index_sort_size; }; // Creates an instance of Index in the directory pointed by file_dir. 
@@ -279,6 +288,19 @@ class Index { return lite_index_->Reset(); } + // Whether the LiteIndex HitBuffer requires sorting. This is only true if + // Icing has enabled sorting during indexing time, and the HitBuffer's + // unsorted tail has exceeded the lite_index_sort_size. + bool LiteIndexNeedSort() const { + return options_.lite_index_sort_at_indexing && + lite_index_->HasUnsortedHitsExceedingSortThreshold(); + } + + // Sorts the LiteIndex HitBuffer. + void SortLiteIndex() { + lite_index_->SortHits(); + } + // Reduces internal file sizes by reclaiming space of deleted documents. // new_last_added_document_id will be used to update the last added document // id in the lite index. diff --git a/icing/index/index_test.cc b/icing/index/index_test.cc index d563bcb..b823535 100644 --- a/icing/index/index_test.cc +++ b/icing/index/index_test.cc @@ -58,6 +58,7 @@ using ::testing::Eq; using ::testing::Ge; using ::testing::Gt; using ::testing::IsEmpty; +using ::testing::IsFalse; using ::testing::IsTrue; using ::testing::Ne; using ::testing::NiceMock; @@ -75,7 +76,9 @@ class IndexTest : public Test { protected: void SetUp() override { index_dir_ = GetTestTempDir() + "/index_test/"; - Index::Options options(index_dir_, /*index_merge_size=*/1024 * 1024); + Index::Options options(index_dir_, /*index_merge_size=*/1024 * 1024, + /*lite_index_sort_at_indexing=*/true, + /*lite_index_sort_size=*/1024 * 8); ICING_ASSERT_OK_AND_ASSIGN( index_, Index::Create(options, &filesystem_, &icing_filesystem_)); } @@ -146,7 +149,9 @@ MATCHER_P2(EqualsTermMetadata, content, hit_count, "") { } TEST_F(IndexTest, CreationWithNullPointerShouldFail) { - Index::Options options(index_dir_, /*index_merge_size=*/1024 * 1024); + Index::Options options(index_dir_, /*index_merge_size=*/1024 * 1024, + /*lite_index_sort_at_indexing=*/true, + /*lite_index_sort_size=*/1024 * 8); EXPECT_THAT( Index::Create(options, &filesystem_, /*icing_filesystem=*/nullptr), 
StatusIs(libtextclassifier3::StatusCode::FAILED_PRECONDITION)); @@ -192,6 +197,36 @@ TEST_F(IndexTest, EmptyIndexAfterMerge) { StatusIs(libtextclassifier3::StatusCode::RESOURCE_EXHAUSTED)); } +TEST_F(IndexTest, CreationWithLiteIndexSortAtIndexingEnabledShouldSort) { + // Make the index with lite_index_sort_at_indexing=false and a very small sort + // threshold. + Index::Options options(index_dir_, /*index_merge_size=*/1024, + /*lite_index_sort_at_indexing=*/false, + /*lite_index_sort_size=*/16); + ICING_ASSERT_OK_AND_ASSIGN( + index_, Index::Create(options, &filesystem_, &icing_filesystem_)); + + Index::Editor edit = index_->Edit( + kDocumentId0, kSectionId2, TermMatchType::EXACT_ONLY, /*namespace_id=*/0); + ASSERT_THAT(edit.BufferTerm("foo"), IsOk()); + ASSERT_THAT(edit.BufferTerm("bar"), IsOk()); + ASSERT_THAT(edit.BufferTerm("baz"), IsOk()); + ASSERT_THAT(edit.IndexAllBufferedTerms(), IsOk()); + + // Persist and recreate the index with lite_index_sort_at_indexing=true + ASSERT_THAT(index_->PersistToDisk(), IsOk()); + options = Index::Options(index_dir_, /*index_merge_size=*/1024, + /*lite_index_sort_at_indexing=*/true, + /*lite_index_sort_size=*/16); + ICING_ASSERT_OK_AND_ASSIGN( + index_, Index::Create(options, &filesystem_, &icing_filesystem_)); + + // Check that the index is sorted after recreating with + // lite_index_sort_at_indexing, with the unsorted HitBuffer exceeding the sort + // threshold. + EXPECT_THAT(index_->LiteIndexNeedSort(), IsFalse()); +} + TEST_F(IndexTest, AdvancePastEnd) { Index::Editor edit = index_->Edit( kDocumentId0, kSectionId2, TermMatchType::EXACT_ONLY, /*namespace_id=*/0); @@ -967,7 +1002,9 @@ TEST_F(IndexTest, NonAsciiTermsAfterMerge) { TEST_F(IndexTest, FullIndex) { // Make a smaller index so that it's easier to fill up. 
- Index::Options options(index_dir_, /*index_merge_size=*/1024); + Index::Options options(index_dir_, /*index_merge_size=*/1024, + /*lite_index_sort_at_indexing=*/true, + /*lite_index_sort_size=*/64); ICING_ASSERT_OK_AND_ASSIGN( index_, Index::Create(options, &filesystem_, &icing_filesystem_)); @@ -1035,7 +1072,9 @@ TEST_F(IndexTest, FullIndex) { TEST_F(IndexTest, FullIndexMerge) { // Make a smaller index so that it's easier to fill up. - Index::Options options(index_dir_, /*index_merge_size=*/1024); + Index::Options options(index_dir_, /*index_merge_size=*/1024, + /*lite_index_sort_at_indexing=*/true, + /*lite_index_sort_size=*/64); ICING_ASSERT_OK_AND_ASSIGN( index_, Index::Create(options, &filesystem_, &icing_filesystem_)); @@ -1368,7 +1407,9 @@ TEST_F(IndexTest, IndexCreateIOFailure) { NiceMock mock_icing_filesystem; ON_CALL(mock_icing_filesystem, CreateDirectoryRecursively) .WillByDefault(Return(false)); - Index::Options options(index_dir_, /*index_merge_size=*/1024 * 1024); + Index::Options options(index_dir_, /*index_merge_size=*/1024 * 1024, + /*lite_index_sort_at_indexing=*/true, + /*lite_index_sort_size=*/1024 * 8); EXPECT_THAT(Index::Create(options, &filesystem_, &mock_icing_filesystem), StatusIs(libtextclassifier3::StatusCode::INTERNAL)); } @@ -1399,7 +1440,9 @@ TEST_F(IndexTest, IndexCreateCorruptionFailure) { IsTrue()); // Recreate the index. - Index::Options options(index_dir_, /*index_merge_size=*/1024 * 1024); + Index::Options options(index_dir_, /*index_merge_size=*/1024 * 1024, + /*lite_index_sort_at_indexing=*/true, + /*lite_index_sort_size=*/1024 * 8); EXPECT_THAT(Index::Create(options, &filesystem_, &icing_filesystem_), StatusIs(libtextclassifier3::StatusCode::DATA_LOSS)); } @@ -1417,7 +1460,9 @@ TEST_F(IndexTest, IndexPersistence) { index_.reset(); // Recreate the index. 
- Index::Options options(index_dir_, /*index_merge_size=*/1024 * 1024); + Index::Options options(index_dir_, /*index_merge_size=*/1024 * 1024, + /*lite_index_sort_at_indexing=*/true, + /*lite_index_sort_size=*/1024 * 8); ICING_ASSERT_OK_AND_ASSIGN( index_, Index::Create(options, &filesystem_, &icing_filesystem_)); @@ -1446,7 +1491,9 @@ TEST_F(IndexTest, IndexPersistenceAfterMerge) { index_.reset(); // Recreate the index. - Index::Options options(index_dir_, /*index_merge_size=*/1024 * 1024); + Index::Options options(index_dir_, /*index_merge_size=*/1024 * 1024, + /*lite_index_sort_at_indexing=*/true, + /*lite_index_sort_size=*/1024 * 8); ICING_ASSERT_OK_AND_ASSIGN( index_, Index::Create(options, &filesystem_, &icing_filesystem_)); @@ -1463,7 +1510,8 @@ TEST_F(IndexTest, IndexPersistenceAfterMerge) { TEST_F(IndexTest, InvalidHitBufferSize) { Index::Options options( - index_dir_, /*index_merge_size=*/std::numeric_limits::max()); + index_dir_, /*index_merge_size=*/std::numeric_limits::max(), + /*lite_index_sort_at_indexing=*/true, /*lite_index_sort_size=*/1024 * 8); EXPECT_THAT(Index::Create(options, &filesystem_, &icing_filesystem_), StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT)); } diff --git a/icing/index/lite/lite-index-options.cc b/icing/index/lite/lite-index-options.cc index 29075f8..8780d45 100644 --- a/icing/index/lite/lite-index-options.cc +++ b/icing/index/lite/lite-index-options.cc @@ -14,6 +14,8 @@ #include "icing/index/lite/lite-index-options.h" +#include + #include "icing/index/lite/term-id-hit-pair.h" namespace icing { @@ -64,9 +66,13 @@ IcingDynamicTrie::Options CalculateTrieOptions(uint32_t hit_buffer_size) { } // namespace LiteIndexOptions::LiteIndexOptions(const std::string& filename_base, - uint32_t hit_buffer_want_merge_bytes) + uint32_t hit_buffer_want_merge_bytes, + bool hit_buffer_sort_at_indexing, + uint32_t hit_buffer_sort_threshold_bytes) : filename_base(filename_base), - hit_buffer_want_merge_bytes(hit_buffer_want_merge_bytes) 
{ + hit_buffer_want_merge_bytes(hit_buffer_want_merge_bytes), + hit_buffer_sort_at_indexing(hit_buffer_sort_at_indexing), + hit_buffer_sort_threshold_bytes(hit_buffer_sort_threshold_bytes) { hit_buffer_size = CalculateHitBufferSize(hit_buffer_want_merge_bytes); lexicon_options = CalculateTrieOptions(hit_buffer_size); display_mappings_options = CalculateTrieOptions(hit_buffer_size); diff --git a/icing/index/lite/lite-index-options.h b/icing/index/lite/lite-index-options.h index ae58802..9f8452c 100644 --- a/icing/index/lite/lite-index-options.h +++ b/icing/index/lite/lite-index-options.h @@ -27,7 +27,9 @@ struct LiteIndexOptions { // hit_buffer_want_merge_bytes and the logic in CalculateHitBufferSize and // CalculateTrieOptions. LiteIndexOptions(const std::string& filename_base, - uint32_t hit_buffer_want_merge_bytes); + uint32_t hit_buffer_want_merge_bytes, + bool hit_buffer_sort_at_indexing, + uint32_t hit_buffer_sort_threshold_bytes); IcingDynamicTrie::Options lexicon_options; IcingDynamicTrie::Options display_mappings_options; @@ -35,6 +37,8 @@ struct LiteIndexOptions { std::string filename_base; uint32_t hit_buffer_want_merge_bytes = 0; uint32_t hit_buffer_size = 0; + bool hit_buffer_sort_at_indexing = false; + uint32_t hit_buffer_sort_threshold_bytes = 0; }; } // namespace lib diff --git a/icing/index/lite/lite-index.cc b/icing/index/lite/lite-index.cc index bf54dec..ec7141a 100644 --- a/icing/index/lite/lite-index.cc +++ b/icing/index/lite/lite-index.cc @@ -36,6 +36,8 @@ #include "icing/index/hit/doc-hit-info.h" #include "icing/index/hit/hit.h" #include "icing/index/lite/lite-index-header.h" +#include "icing/index/lite/term-id-hit-pair.h" +#include "icing/index/term-id-codec.h" #include "icing/index/term-property-id.h" #include "icing/legacy/core/icing-string-util.h" #include "icing/legacy/core/icing-timer.h" @@ -44,10 +46,13 @@ #include "icing/legacy/index/icing-filesystem.h" #include "icing/legacy/index/icing-mmapper.h" #include "icing/proto/debug.pb.h" 
+#include "icing/proto/scoring.pb.h" #include "icing/proto/storage.pb.h" #include "icing/proto/term.pb.h" #include "icing/schema/section.h" #include "icing/store/document-id.h" +#include "icing/store/namespace-id.h" +#include "icing/store/suggestion-result-checker.h" #include "icing/util/crc32.h" #include "icing/util/logging.h" #include "icing/util/status-macros.h" @@ -160,7 +165,7 @@ libtextclassifier3::Status LiteIndex::Initialize() { } // Set up header. - header_mmap_.Remap(hit_buffer_fd_.get(), 0, header_size()); + header_mmap_.Remap(hit_buffer_fd_.get(), kHeaderFileOffset, header_size()); header_ = std::make_unique( reinterpret_cast( header_mmap_.address())); @@ -175,7 +180,7 @@ libtextclassifier3::Status LiteIndex::Initialize() { UpdateChecksum(); } else { - header_mmap_.Remap(hit_buffer_fd_.get(), 0, header_size()); + header_mmap_.Remap(hit_buffer_fd_.get(), kHeaderFileOffset, header_size()); header_ = std::make_unique( reinterpret_cast( header_mmap_.address())); @@ -352,6 +357,73 @@ libtextclassifier3::StatusOr LiteIndex::GetTermId( return tvi; } +void LiteIndex::ScoreAndAppendFetchedHit( + const Hit& hit, SectionIdMask section_id_mask, + bool only_from_prefix_sections, + SuggestionScoringSpecProto::SuggestionRankingStrategy::Code score_by, + const SuggestionResultChecker* suggestion_result_checker, + DocumentId& last_document_id, bool& is_last_document_desired, + int& total_score_out, std::vector* hits_out, + std::vector* term_frequency_out) const { + // Check sections. + if (((UINT64_C(1) << hit.section_id()) & section_id_mask) == 0) { + return; + } + // Check prefix section only. + if (only_from_prefix_sections && !hit.is_in_prefix_section()) { + return; + } + // Check whether this Hit is desired. + // TODO(b/230553264) Move common logic into helper function once we support + // score term by prefix_hit in lite_index. 
+ DocumentId document_id = hit.document_id(); + bool is_new_document = document_id != last_document_id; + if (is_new_document) { + last_document_id = document_id; + is_last_document_desired = + suggestion_result_checker == nullptr || + suggestion_result_checker->BelongsToTargetResults(document_id, + hit.section_id()); + } + if (!is_last_document_desired) { + // The document is removed or expired or not desired. + return; + } + + // Score the hit by the strategy + switch (score_by) { + case SuggestionScoringSpecProto::SuggestionRankingStrategy::NONE: + total_score_out = 1; + break; + case SuggestionScoringSpecProto::SuggestionRankingStrategy::DOCUMENT_COUNT: + if (is_new_document) { + ++total_score_out; + } + break; + case SuggestionScoringSpecProto::SuggestionRankingStrategy::TERM_FREQUENCY: + if (hit.has_term_frequency()) { + total_score_out += hit.term_frequency(); + } else { + ++total_score_out; + } + break; + } + + // Append the Hit or update hit section to the output vector. + if (is_new_document && hits_out != nullptr) { + hits_out->push_back(DocHitInfo(document_id)); + if (term_frequency_out != nullptr) { + term_frequency_out->push_back(Hit::TermFrequencyArray()); + } + } + if (hits_out != nullptr) { + hits_out->back().UpdateSection(hit.section_id()); + if (term_frequency_out != nullptr) { + term_frequency_out->back()[hit.section_id()] = hit.term_frequency(); + } + } +} + int LiteIndex::FetchHits( uint32_t term_id, SectionIdMask section_id_mask, bool only_from_prefix_sections, @@ -359,19 +431,38 @@ int LiteIndex::FetchHits( const SuggestionResultChecker* suggestion_result_checker, std::vector* hits_out, std::vector* term_frequency_out) { - int score = 0; - DocumentId last_document_id = kInvalidDocumentId; - // Record whether the last document belongs to the given namespaces. 
- bool is_last_document_desired = false; - - if (NeedSort()) { - // Transition from shared_lock in NeedSort to unique_lock here is safe - // because it doesn't hurt to sort again if sorting was done already by - // another thread after NeedSort is evaluated. NeedSort is called before - // sorting to improve concurrency as threads can avoid acquiring the unique - // lock if no sorting is needed. + bool need_sort_at_querying = false; + { + absl_ports::shared_lock l(&mutex_); + + // We sort here when: + // 1. We don't enable sorting at indexing time (i.e. we sort at querying + // time), and there is an unsorted tail portion. OR + // 2. The unsorted tail size exceeds the hit_buffer_sort_threshold, + // regardless of whether or not hit_buffer_sort_at_indexing is enabled. + // This is more of a sanity check. We should not really be encountering + // this case. + need_sort_at_querying = NeedSortAtQuerying(); + } + if (need_sort_at_querying) { absl_ports::unique_lock l(&mutex_); - SortHits(); + IcingTimer timer; + + // Transition from shared_lock to unique_lock is safe here because it + // doesn't hurt to sort again if sorting was done already by another thread + // after need_sort_at_querying is evaluated. + // We check need_sort_at_querying to improve query concurrency as threads + // can avoid acquiring the unique lock if no sorting is needed. + SortHitsImpl(); + + if (options_.hit_buffer_sort_at_indexing) { + // This is the second case for sort. Log as this should be a very rare + // occasion. + ICING_LOG(WARNING) << "Sorting HitBuffer at querying time when " + "hit_buffer_sort_at_indexing is enabled. Sort and " + "merge HitBuffer in " + << timer.Elapsed() * 1000 << " ms."; + } } // This downgrade from an unique_lock to a shared_lock is safe because we're @@ -379,75 +470,72 @@ int LiteIndex::FetchHits( // only in Seek(). 
// Any operations that might execute in between the transition of downgrading // the lock here are guaranteed not to alter the searchable section (or the - // LiteIndex due to a global lock in IcingSearchEngine). + // LiteIndex) due to a global lock in IcingSearchEngine. absl_ports::shared_lock l(&mutex_); - for (uint32_t idx = Seek(term_id); idx < header_->searchable_end(); idx++) { - TermIdHitPair term_id_hit_pair = - hit_buffer_.array_cast()[idx]; - if (term_id_hit_pair.term_id() != term_id) break; - - const Hit& hit = term_id_hit_pair.hit(); - // Check sections. - if (((UINT64_C(1) << hit.section_id()) & section_id_mask) == 0) { - continue; - } - // Check prefix section only. - if (only_from_prefix_sections && !hit.is_in_prefix_section()) { - continue; - } - // TODO(b/230553264) Move common logic into helper function once we support - // score term by prefix_hit in lite_index. - // Check whether this Hit is desired. - DocumentId document_id = hit.document_id(); - bool is_new_document = document_id != last_document_id; - if (is_new_document) { - last_document_id = document_id; - is_last_document_desired = - suggestion_result_checker == nullptr || - suggestion_result_checker->BelongsToTargetResults(document_id, - hit.section_id()); - } - if (!is_last_document_desired) { - // The document is removed or expired or not desired. - continue; - } - // Score the hit by the strategy - switch (score_by) { - case SuggestionScoringSpecProto::SuggestionRankingStrategy::NONE: - score = 1; - break; - case SuggestionScoringSpecProto::SuggestionRankingStrategy:: - DOCUMENT_COUNT: - if (is_new_document) { - ++score; - } - break; - case SuggestionScoringSpecProto::SuggestionRankingStrategy:: - TERM_FREQUENCY: - if (hit.has_term_frequency()) { - score += hit.term_frequency(); - } else { - ++score; - } - break; - } + // Search in the HitBuffer array for Hits with the corresponding term_id. 
+ // Hits are added in increasing order of doc ids, so hits that get appended + // later have larger docIds. This means that: + // 1. Hits in the unsorted tail will have larger docIds than hits in the + // sorted portion. + // 2. Hits at the end of the unsorted tail will have larger docIds than hits + // in the front of the tail. + // We want to retrieve hits in descending order of docIds. Therefore we should + // search by doing: + // 1. Linear search first in reverse iteration order over the unsorted tail + // portion. + // 2. Followed by binary search on the sorted portion. + const TermIdHitPair* array = hit_buffer_.array_cast(); - // Append the Hit or update hit section to the output vector. - if (is_new_document && hits_out != nullptr) { - hits_out->push_back(DocHitInfo(document_id)); - if (term_frequency_out != nullptr) { - term_frequency_out->push_back(Hit::TermFrequencyArray()); + DocumentId last_document_id = kInvalidDocumentId; + // Record whether the last document belongs to the given namespaces. + bool is_last_document_desired = false; + int total_score = 0; + + // Linear search over unsorted tail in reverse iteration order. + // This should only be performed when hit_buffer_sort_at_indexing is enabled. + // When disabled, the entire HitBuffer should be sorted already and only + // binary search is needed. + if (options_.hit_buffer_sort_at_indexing) { + uint32_t unsorted_length = header_->cur_size() - header_->searchable_end(); + for (uint32_t i = 1; i <= unsorted_length; ++i) { + TermIdHitPair term_id_hit_pair = array[header_->cur_size() - i]; + if (term_id_hit_pair.term_id() == term_id) { + // We've found a matched hit. + const Hit& matched_hit = term_id_hit_pair.hit(); + // Score the hit and add to total_score. Also add the hits and its term + // frequency info to hits_out and term_frequency_out if the two vectors + // are non-null. 
+ ScoreAndAppendFetchedHit(matched_hit, section_id_mask, + only_from_prefix_sections, score_by, + suggestion_result_checker, last_document_id, + is_last_document_desired, total_score, + hits_out, term_frequency_out); } } - if (hits_out != nullptr) { - hits_out->back().UpdateSection(hit.section_id()); - if (term_frequency_out != nullptr) { - term_frequency_out->back()[hit.section_id()] = hit.term_frequency(); - } + } + + // Do binary search over the sorted section and repeat the above steps. + TermIdHitPair target_term_id_hit_pair( + term_id, Hit(Hit::kMaxDocumentIdSortValue, Hit::kDefaultTermFrequency)); + for (const TermIdHitPair* ptr = std::lower_bound( + array, array + header_->searchable_end(), target_term_id_hit_pair); + ptr < array + header_->searchable_end(); ++ptr) { + if (ptr->term_id() != term_id) { + // We've processed all matches. Stop iterating further. + break; } + + const Hit& matched_hit = ptr->hit(); + // Score the hit and add to total_score. Also add the hits and its term + // frequency info to hits_out and term_frequency_out if the two vectors are + // non-null. 
+ ScoreAndAppendFetchedHit( + matched_hit, section_id_mask, only_from_prefix_sections, score_by, + suggestion_result_checker, last_document_id, is_last_document_desired, + total_score, hits_out, term_frequency_out); } - return score; + return total_score; } libtextclassifier3::StatusOr LiteIndex::ScoreHits( @@ -455,9 +543,9 @@ libtextclassifier3::StatusOr LiteIndex::ScoreHits( SuggestionScoringSpecProto::SuggestionRankingStrategy::Code score_by, const SuggestionResultChecker* suggestion_result_checker) { return FetchHits(term_id, kSectionIdMaskAll, - /*only_from_prefix_sections=*/false, score_by, - suggestion_result_checker, - /*hits_out=*/nullptr); + /*only_from_prefix_sections=*/false, score_by, + suggestion_result_checker, + /*hits_out=*/nullptr); } bool LiteIndex::is_full() const { @@ -515,7 +603,7 @@ IndexStorageInfoProto LiteIndex::GetStorageInfo( return storage_info; } -void LiteIndex::SortHits() { +void LiteIndex::SortHitsImpl() { // Make searchable by sorting by hit buffer. uint32_t sort_len = header_->cur_size() - header_->searchable_end(); if (sort_len <= 0) { @@ -546,25 +634,6 @@ void LiteIndex::SortHits() { UpdateChecksum(); } -uint32_t LiteIndex::Seek(uint32_t term_id) const { - // Binary search for our term_id. Make sure we get the first - // element. Using kBeginSortValue ensures this for the hit value. 
- TermIdHitPair term_id_hit_pair( - term_id, Hit(Hit::kMaxDocumentIdSortValue, Hit::kDefaultTermFrequency)); - - const TermIdHitPair::Value* array = - hit_buffer_.array_cast(); - if (header_->searchable_end() != header_->cur_size()) { - ICING_LOG(WARNING) << "Lite index: hit buffer searchable end != current " - << "size during Seek(): " - << header_->searchable_end() << " vs " - << header_->cur_size(); - } - const TermIdHitPair::Value* ptr = std::lower_bound( - array, array + header_->searchable_end(), term_id_hit_pair.value()); - return ptr - array; -} - libtextclassifier3::Status LiteIndex::Optimize( const std::vector& document_id_old_to_new, const TermIdCodec* term_id_codec, DocumentId new_last_added_document_id) { @@ -575,7 +644,7 @@ libtextclassifier3::Status LiteIndex::Optimize( } // Sort the hits so that hits with the same term id will be grouped together, // which helps later to determine which terms will be unused after compaction. - SortHits(); + SortHitsImpl(); uint32_t new_size = 0; uint32_t curr_term_id = 0; uint32_t curr_tvi = 0; diff --git a/icing/index/lite/lite-index.h b/icing/index/lite/lite-index.h index 916a14b..288602a 100644 --- a/icing/index/lite/lite-index.h +++ b/icing/index/lite/lite-index.h @@ -20,6 +20,7 @@ #define ICING_INDEX_LITE_INDEX_H_ #include +#include #include #include #include @@ -48,7 +49,6 @@ #include "icing/store/document-id.h" #include "icing/store/namespace-id.h" #include "icing/store/suggestion-result-checker.h" -#include "icing/util/bit-util.h" #include "icing/util/crc32.h" namespace icing { @@ -63,6 +63,9 @@ class LiteIndex { // An entry in the hit buffer. using Options = LiteIndexOptions; + // Offset for the LiteIndex_Header in the hit buffer mmap. + static constexpr uint32_t kHeaderFileOffset = 0; + // Updates checksum of subcomponents. 
~LiteIndex(); @@ -152,8 +155,8 @@ class LiteIndex { // Add all hits with term_id from the sections specified in section_id_mask, // skipping hits in non-prefix sections if only_from_prefix_sections is true, // to hits_out. If hits_out is nullptr, no hits will be added. The - // corresponding hit term frequencies will also be added if term_frequency_out - // is nullptr. + // corresponding hit term frequencies will also not be added if + // term_frequency_out is nullptr. // // Only those hits which belongs to the given namespaces will be counted and // fetched. A nullptr namespace checker will disable this check. @@ -181,15 +184,29 @@ class LiteIndex { uint32_t size() const ICING_LOCKS_EXCLUDED(mutex_) { absl_ports::shared_lock l(&mutex_); - return sizeLocked(); + return size_impl(); } bool WantsMerge() const ICING_LOCKS_EXCLUDED(mutex_) { absl_ports::shared_lock l(&mutex_); - return is_full() || sizeLocked() >= (options_.hit_buffer_want_merge_bytes / - sizeof(TermIdHitPair::Value)); + return is_full() || size_impl() >= (options_.hit_buffer_want_merge_bytes / + sizeof(TermIdHitPair::Value)); + } + + // Whether or not the HitBuffer's unsorted tail size exceeds the sort + // threshold. + bool HasUnsortedHitsExceedingSortThreshold() const + ICING_LOCKS_EXCLUDED(mutex_) { + absl_ports::shared_lock l(&mutex_); + return HasUnsortedHitsExceedingSortThresholdImpl(); } + // Sort hits stored in the index. + void SortHits() ICING_LOCKS_EXCLUDED(mutex_) { + absl_ports::unique_lock l(&mutex_); + SortHitsImpl(); + }; + class const_iterator { friend class LiteIndex; @@ -326,17 +343,13 @@ class LiteIndex { // Check if the hit buffer has reached its capacity. bool is_full() const ICING_SHARED_LOCKS_REQUIRED(mutex_); - uint32_t sizeLocked() const ICING_SHARED_LOCKS_REQUIRED(mutex_) { - return header_->cur_size(); - } - // Non-locking implementation for empty(). 
bool empty_impl() const ICING_SHARED_LOCKS_REQUIRED(mutex_) { return size_impl() == 0; } // Non-locking implementation for size(). - bool size_impl() const ICING_SHARED_LOCKS_REQUIRED(mutex_) { + uint32_t size_impl() const ICING_SHARED_LOCKS_REQUIRED(mutex_) { return header_->cur_size(); } @@ -352,18 +365,48 @@ class LiteIndex { NamespaceId namespace_id) ICING_EXCLUSIVE_LOCKS_REQUIRED(mutex_); - // Whether or not the HitBuffer requires sorting. - bool NeedSort() ICING_LOCKS_EXCLUDED(mutex_) { - absl_ports::shared_lock l(&mutex_); - return header_->cur_size() - header_->searchable_end() > 0; + // We need to sort during querying time when: + // 1. Sorting at indexing time is not enabled and there is an unsorted tail + // section in the HitBuffer. + // 2. The unsorted tail size exceeds the hit_buffer_sort_threshold, regardless + // of whether or not hit_buffer_sort_at_indexing is enabled. This is to + // prevent performing sequential search on a large unsorted tail section, + // which would result in bad query performance. + // This is more of a sanity check. We should not really be encountering + // this case. + bool NeedSortAtQuerying() const ICING_SHARED_LOCKS_REQUIRED(mutex_) { + return HasUnsortedHitsExceedingSortThresholdImpl() || + (!options_.hit_buffer_sort_at_indexing && + header_->cur_size() - header_->searchable_end() > 0); } - // Sort hits stored in the index. - void SortHits() ICING_EXCLUSIVE_LOCKS_REQUIRED(mutex_); + // Non-locking implementation for HasUnsortedHitsExceedingSortThresholdImpl(). + bool HasUnsortedHitsExceedingSortThresholdImpl() const + ICING_SHARED_LOCKS_REQUIRED(mutex_) { + return header_->cur_size() - header_->searchable_end() >= + (options_.hit_buffer_sort_threshold_bytes / + sizeof(TermIdHitPair::Value)); + } - // Returns the position of the first element with term_id, or the searchable - // end of the hit buffer if term_id is not present. 
- uint32_t Seek(uint32_t term_id) const ICING_SHARED_LOCKS_REQUIRED(mutex_); + // Non-locking implementation for SortHits(). + void SortHitsImpl() ICING_EXCLUSIVE_LOCKS_REQUIRED(mutex_); + + // Calculates and adds the score for a fetched hit to total_score_out, while + // updating last_document_id (which keeps track of the last added docId so + // far), and is_last_document_desired (which keeps track of whether that last + // added docId belongs to the query's desired namespace.) + // + // Also appends the hit to hits_out and term_frequency_out if the vectors are + // not null. + void ScoreAndAppendFetchedHit( + const Hit& hit, SectionIdMask section_id_mask, + bool only_from_prefix_sections, + SuggestionScoringSpecProto::SuggestionRankingStrategy::Code score_by, + const SuggestionResultChecker* suggestion_result_checker, + DocumentId& last_document_id, bool& is_last_document_desired, + int& total_score_out, std::vector* hits_out, + std::vector* term_frequency_out) const + ICING_SHARED_LOCKS_REQUIRED(mutex_); // File descriptor that points to where the header and hit buffer are written // to. 
diff --git a/icing/index/lite/lite-index_test.cc b/icing/index/lite/lite-index_test.cc index 5f141ed..9811fa2 100644 --- a/icing/index/lite/lite-index_test.cc +++ b/icing/index/lite/lite-index_test.cc @@ -14,14 +14,27 @@ #include "icing/index/lite/lite-index.h" +#include +#include +#include +#include #include #include "gmock/gmock.h" #include "gtest/gtest.h" +#include "icing/file/filesystem.h" +#include "icing/index/hit/doc-hit-info.h" +#include "icing/index/hit/hit.h" +#include "icing/index/iterator/doc-hit-info-iterator.h" #include "icing/index/lite/doc-hit-info-iterator-term-lite.h" +#include "icing/index/lite/lite-index-header.h" #include "icing/index/term-id-codec.h" +#include "icing/legacy/index/icing-dynamic-trie.h" +#include "icing/legacy/index/icing-filesystem.h" +#include "icing/proto/scoring.pb.h" +#include "icing/proto/term.pb.h" #include "icing/schema/section.h" -#include "icing/store/suggestion-result-checker.h" +#include "icing/store/namespace-id.h" #include "icing/testing/always-false-suggestion-result-checker-impl.h" #include "icing/testing/common-matchers.h" #include "icing/testing/tmp-directory.h" @@ -34,6 +47,8 @@ namespace { using ::testing::ElementsAre; using ::testing::Eq; using ::testing::IsEmpty; +using ::testing::IsFalse; +using ::testing::IsTrue; using ::testing::SizeIs; class LiteIndexTest : public testing::Test { @@ -41,62 +56,518 @@ class LiteIndexTest : public testing::Test { void SetUp() override { index_dir_ = GetTestTempDir() + "/test_dir"; ASSERT_TRUE(filesystem_.CreateDirectoryRecursively(index_dir_.c_str())); - - std::string lite_index_file_name = index_dir_ + "/test_file.lite-idx.index"; - LiteIndex::Options options(lite_index_file_name, - /*hit_buffer_want_merge_bytes=*/1024 * 1024); - ICING_ASSERT_OK_AND_ASSIGN(lite_index_, - LiteIndex::Create(options, &icing_filesystem_)); - - ICING_ASSERT_OK_AND_ASSIGN( - term_id_codec_, - TermIdCodec::Create( - IcingDynamicTrie::max_value_index(IcingDynamicTrie::Options()), - 
IcingDynamicTrie::max_value_index(options.lexicon_options))); } void TearDown() override { term_id_codec_.reset(); - lite_index_.reset(); ASSERT_TRUE(filesystem_.DeleteDirectoryRecursively(index_dir_.c_str())); } std::string index_dir_; Filesystem filesystem_; IcingFilesystem icing_filesystem_; - std::unique_ptr lite_index_; std::unique_ptr term_id_codec_; }; constexpr NamespaceId kNamespace0 = 0; -TEST_F(LiteIndexTest, LiteIndexAppendHits) { +TEST_F(LiteIndexTest, + LiteIndexFetchHits_sortAtQuerying_unsortedHitsBelowSortThreshold) { + // Set up LiteIndex and TermIdCodec + std::string lite_index_file_name = index_dir_ + "/test_file.lite-idx.index"; + // At 64 bytes the unsorted tail can contain a max of 8 TermHitPairs. + LiteIndex::Options options(lite_index_file_name, + /*hit_buffer_want_merge_bytes=*/1024 * 1024, + /*hit_buffer_sort_at_indexing=*/false, + /*hit_buffer_sort_threshold_bytes=*/64); + ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr lite_index, + LiteIndex::Create(options, &icing_filesystem_)); ICING_ASSERT_OK_AND_ASSIGN( - uint32_t tvi, - lite_index_->InsertTerm("foo", TermMatchType::PREFIX, kNamespace0)); + term_id_codec_, + TermIdCodec::Create( + IcingDynamicTrie::max_value_index(IcingDynamicTrie::Options()), + IcingDynamicTrie::max_value_index(options.lexicon_options))); + + // Add some hits + ICING_ASSERT_OK_AND_ASSIGN( + uint32_t foo_tvi, + lite_index->InsertTerm("foo", TermMatchType::PREFIX, kNamespace0)); ICING_ASSERT_OK_AND_ASSIGN(uint32_t foo_term_id, - term_id_codec_->EncodeTvi(tvi, TviType::LITE)); - Hit doc_hit0(/*section_id=*/0, /*document_id=*/0, Hit::kDefaultTermFrequency, + term_id_codec_->EncodeTvi(foo_tvi, TviType::LITE)); + Hit foo_hit0(/*section_id=*/0, /*document_id=*/1, Hit::kDefaultTermFrequency, /*is_in_prefix_section=*/false); - Hit doc_hit1(/*section_id=*/1, /*document_id=*/0, Hit::kDefaultTermFrequency, + Hit foo_hit1(/*section_id=*/1, /*document_id=*/1, Hit::kDefaultTermFrequency, /*is_in_prefix_section=*/false); - 
ICING_ASSERT_OK(lite_index_->AddHit(foo_term_id, doc_hit0)); - ICING_ASSERT_OK(lite_index_->AddHit(foo_term_id, doc_hit1)); + ICING_ASSERT_OK(lite_index->AddHit(foo_term_id, foo_hit0)); + ICING_ASSERT_OK(lite_index->AddHit(foo_term_id, foo_hit1)); + ICING_ASSERT_OK_AND_ASSIGN( + uint32_t bar_tvi, + lite_index->InsertTerm("bar", TermMatchType::PREFIX, kNamespace0)); + ICING_ASSERT_OK_AND_ASSIGN(uint32_t bar_term_id, + term_id_codec_->EncodeTvi(bar_tvi, TviType::LITE)); + Hit bar_hit0(/*section_id=*/0, /*document_id=*/0, Hit::kDefaultTermFrequency, + /*is_in_prefix_section=*/false); + Hit bar_hit1(/*section_id=*/1, /*document_id=*/0, Hit::kDefaultTermFrequency, + /*is_in_prefix_section=*/false); + ICING_ASSERT_OK(lite_index->AddHit(bar_term_id, bar_hit0)); + ICING_ASSERT_OK(lite_index->AddHit(bar_term_id, bar_hit1)); + + // Check that unsorted hits does not exceed the sort threshold. + EXPECT_THAT(lite_index->HasUnsortedHitsExceedingSortThreshold(), IsFalse()); + + // Check that hits are unsorted. Persist the data and pread from + // LiteIndexHeader. + ASSERT_THAT(lite_index->PersistToDisk(), IsOk()); + LiteIndex_HeaderImpl::HeaderData header_data; + ASSERT_TRUE(filesystem_.PRead((lite_index_file_name + "hb").c_str(), + &header_data, sizeof(header_data), + LiteIndex::kHeaderFileOffset)); + EXPECT_THAT(header_data.cur_size - header_data.searchable_end, Eq(4)); + + // Query the LiteIndex std::vector hits1; - lite_index_->FetchHits( + lite_index->FetchHits( foo_term_id, kSectionIdMaskAll, /*only_from_prefix_sections=*/false, SuggestionScoringSpecProto::SuggestionRankingStrategy::DOCUMENT_COUNT, /*namespace_checker=*/nullptr, &hits1); EXPECT_THAT(hits1, SizeIs(1)); - EXPECT_THAT(hits1.back().document_id(), Eq(0)); + EXPECT_THAT(hits1.back().document_id(), Eq(1)); // Check that the hits are coming from section 0 and section 1. 
EXPECT_THAT(hits1.back().hit_section_ids_mask(), Eq(0b11)); std::vector hits2; AlwaysFalseSuggestionResultCheckerImpl always_false_suggestion_result_checker; - lite_index_->FetchHits( + lite_index->FetchHits( + foo_term_id, kSectionIdMaskAll, + /*only_from_prefix_sections=*/false, + SuggestionScoringSpecProto::SuggestionRankingStrategy::DOCUMENT_COUNT, + &always_false_suggestion_result_checker, &hits2); + // Check that no hits are returned because they get skipped by the namespace + // checker. + EXPECT_THAT(hits2, IsEmpty()); + + // Check that hits are sorted after querying LiteIndex. Persist the data and + // pread from LiteIndexHeader. + ASSERT_THAT(lite_index->PersistToDisk(), IsOk()); + ASSERT_TRUE(filesystem_.PRead((lite_index_file_name + "hb").c_str(), + &header_data, sizeof(header_data), + LiteIndex::kHeaderFileOffset)); + EXPECT_THAT(header_data.cur_size - header_data.searchable_end, Eq(0)); +} + +TEST_F(LiteIndexTest, + LiteIndexFetchHits_sortAtIndexing_unsortedHitsBelowSortThreshold) { + // Set up LiteIndex and TermIdCodec + std::string lite_index_file_name = index_dir_ + "/test_file.lite-idx.index"; + // At 64 bytes the unsorted tail can contain a max of 8 TermHitPairs. + // However note that in these tests we're unable to sort hits after + // indexing, as sorting performed by the string-section-indexing-handler + // after indexing all hits in an entire document, rather than after each + // AddHits() operation. 
+ LiteIndex::Options options(lite_index_file_name, + /*hit_buffer_want_merge_bytes=*/1024 * 1024, + /*hit_buffer_sort_at_indexing=*/true, + /*hit_buffer_sort_threshold_bytes=*/64); + ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr lite_index, + LiteIndex::Create(options, &icing_filesystem_)); + ICING_ASSERT_OK_AND_ASSIGN( + term_id_codec_, + TermIdCodec::Create( + IcingDynamicTrie::max_value_index(IcingDynamicTrie::Options()), + IcingDynamicTrie::max_value_index(options.lexicon_options))); + + // Add some hits + ICING_ASSERT_OK_AND_ASSIGN( + uint32_t foo_tvi, + lite_index->InsertTerm("foo", TermMatchType::PREFIX, kNamespace0)); + ICING_ASSERT_OK_AND_ASSIGN(uint32_t foo_term_id, + term_id_codec_->EncodeTvi(foo_tvi, TviType::LITE)); + Hit foo_hit0(/*section_id=*/0, /*document_id=*/1, Hit::kDefaultTermFrequency, + /*is_in_prefix_section=*/false); + Hit foo_hit1(/*section_id=*/1, /*document_id=*/1, Hit::kDefaultTermFrequency, + /*is_in_prefix_section=*/false); + ICING_ASSERT_OK(lite_index->AddHit(foo_term_id, foo_hit0)); + ICING_ASSERT_OK(lite_index->AddHit(foo_term_id, foo_hit1)); + + ICING_ASSERT_OK_AND_ASSIGN( + uint32_t bar_tvi, + lite_index->InsertTerm("bar", TermMatchType::PREFIX, kNamespace0)); + ICING_ASSERT_OK_AND_ASSIGN(uint32_t bar_term_id, + term_id_codec_->EncodeTvi(bar_tvi, TviType::LITE)); + Hit bar_hit0(/*section_id=*/0, /*document_id=*/0, Hit::kDefaultTermFrequency, + /*is_in_prefix_section=*/false); + Hit bar_hit1(/*section_id=*/1, /*document_id=*/0, Hit::kDefaultTermFrequency, + /*is_in_prefix_section=*/false); + ICING_ASSERT_OK(lite_index->AddHit(bar_term_id, bar_hit0)); + ICING_ASSERT_OK(lite_index->AddHit(bar_term_id, bar_hit1)); + + // Check that unsorted hits does not exceed the sort threshold. + EXPECT_THAT(lite_index->HasUnsortedHitsExceedingSortThreshold(), IsFalse()); + + // Check that hits are unsorted. Persist the data and pread from + // LiteIndexHeader. 
+ ASSERT_THAT(lite_index->PersistToDisk(), IsOk()); + LiteIndex_HeaderImpl::HeaderData header_data; + ASSERT_TRUE(filesystem_.PRead((lite_index_file_name + "hb").c_str(), + &header_data, sizeof(header_data), + LiteIndex::kHeaderFileOffset)); + EXPECT_THAT(header_data.cur_size - header_data.searchable_end, Eq(4)); + + // Query the LiteIndex + std::vector hits1; + lite_index->FetchHits( + foo_term_id, kSectionIdMaskAll, + /*only_from_prefix_sections=*/false, + SuggestionScoringSpecProto::SuggestionRankingStrategy::DOCUMENT_COUNT, + /*namespace_checker=*/nullptr, &hits1); + EXPECT_THAT(hits1, SizeIs(1)); + EXPECT_THAT(hits1.back().document_id(), Eq(1)); + // Check that the hits are coming from section 0 and section 1. + EXPECT_THAT(hits1.back().hit_section_ids_mask(), Eq(0b11)); + + std::vector hits2; + AlwaysFalseSuggestionResultCheckerImpl always_false_suggestion_result_checker; + lite_index->FetchHits( + foo_term_id, kSectionIdMaskAll, + /*only_from_prefix_sections=*/false, + SuggestionScoringSpecProto::SuggestionRankingStrategy::DOCUMENT_COUNT, + &always_false_suggestion_result_checker, &hits2); + // Check that no hits are returned because they get skipped by the namespace + // checker. + EXPECT_THAT(hits2, IsEmpty()); + + // Check that hits are still unsorted after querying LiteIndex because the + // HitBuffer unsorted size is still below the sort threshold, and we've + // enabled sort_at_indexing. + // Persist the data and performing a pread on LiteIndexHeader. 
+ ASSERT_THAT(lite_index->PersistToDisk(), IsOk()); + ASSERT_TRUE(filesystem_.PRead((lite_index_file_name + "hb").c_str(), + &header_data, sizeof(header_data), + LiteIndex::kHeaderFileOffset)); + EXPECT_THAT(header_data.cur_size - header_data.searchable_end, Eq(4)); +} + +TEST_F( + LiteIndexTest, + LiteIndexFetchHits_sortAtQuerying_unsortedHitsExceedingSortAtIndexThreshold) { + // Set up LiteIndex and TermIdCodec + std::string lite_index_file_name = index_dir_ + "/test_file.lite-idx.index"; + // At 64 bytes the unsorted tail can contain a max of 8 TermHitPairs. + // However note that in these tests we're unable to sort hits after + // indexing, as sorting performed by the string-section-indexing-handler + // after indexing all hits in an entire document, rather than after each + // AddHits() operation. + LiteIndex::Options options(lite_index_file_name, + /*hit_buffer_want_merge_bytes=*/1024 * 1024, + /*hit_buffer_sort_at_indexing=*/false, + /*hit_buffer_sort_threshold_bytes=*/64); + ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr lite_index, + LiteIndex::Create(options, &icing_filesystem_)); + ICING_ASSERT_OK_AND_ASSIGN( + term_id_codec_, + TermIdCodec::Create( + IcingDynamicTrie::max_value_index(IcingDynamicTrie::Options()), + IcingDynamicTrie::max_value_index(options.lexicon_options))); + + // Create 4 hits for docs 0-2, and 2 hits for doc 3 -- 14 in total + // Doc 0 + Hit doc0_hit0(/*section_id=*/0, /*document_id=*/0, Hit::kDefaultTermFrequency, + /*is_in_prefix_section=*/false); + Hit doc0_hit1(/*section_id=*/0, /*document_id=*/0, Hit::kDefaultTermFrequency, + /*is_in_prefix_section=*/false); + Hit doc0_hit2(/*section_id=*/1, /*document_id=*/0, Hit::kDefaultTermFrequency, + /*is_in_prefix_section=*/false); + Hit doc0_hit3(/*section_id=*/2, /*document_id=*/0, Hit::kDefaultTermFrequency, + /*is_in_prefix_section=*/false); + // Doc 1 + Hit doc1_hit0(/*section_id=*/0, /*document_id=*/1, Hit::kDefaultTermFrequency, + /*is_in_prefix_section=*/false); + Hit 
doc1_hit1(/*section_id=*/0, /*document_id=*/1, Hit::kDefaultTermFrequency, + /*is_in_prefix_section=*/false); + Hit doc1_hit2(/*section_id=*/1, /*document_id=*/1, Hit::kDefaultTermFrequency, + /*is_in_prefix_section=*/false); + Hit doc1_hit3(/*section_id=*/2, /*document_id=*/1, Hit::kDefaultTermFrequency, + /*is_in_prefix_section=*/false); + // Doc 2 + Hit doc2_hit0(/*section_id=*/0, /*document_id=*/2, Hit::kDefaultTermFrequency, + /*is_in_prefix_section=*/false); + Hit doc2_hit1(/*section_id=*/0, /*document_id=*/2, Hit::kDefaultTermFrequency, + /*is_in_prefix_section=*/false); + Hit doc2_hit2(/*section_id=*/1, /*document_id=*/2, Hit::kDefaultTermFrequency, + /*is_in_prefix_section=*/false); + Hit doc2_hit3(/*section_id=*/2, /*document_id=*/2, Hit::kDefaultTermFrequency, + /*is_in_prefix_section=*/false); + // Doc 3 + Hit doc3_hit0(/*section_id=*/0, /*document_id=*/3, Hit::kDefaultTermFrequency, + /*is_in_prefix_section=*/false); + Hit doc3_hit1(/*section_id=*/0, /*document_id=*/3, Hit::kDefaultTermFrequency, + /*is_in_prefix_section=*/false); + + // Create terms + // Foo + ICING_ASSERT_OK_AND_ASSIGN( + uint32_t foo_tvi, + lite_index->InsertTerm("foo", TermMatchType::EXACT_ONLY, kNamespace0)); + ICING_ASSERT_OK_AND_ASSIGN(uint32_t foo_term_id, + term_id_codec_->EncodeTvi(foo_tvi, TviType::LITE)); + // Bar + ICING_ASSERT_OK_AND_ASSIGN( + uint32_t bar_tvi, + lite_index->InsertTerm("bar", TermMatchType::PREFIX, kNamespace0)); + ICING_ASSERT_OK_AND_ASSIGN(uint32_t bar_term_id, + term_id_codec_->EncodeTvi(bar_tvi, TviType::LITE)); + // Baz + ICING_ASSERT_OK_AND_ASSIGN( + uint32_t baz_tvi, + lite_index->InsertTerm("baz", TermMatchType::PREFIX, kNamespace0)); + ICING_ASSERT_OK_AND_ASSIGN(uint32_t baz_term_id, + term_id_codec_->EncodeTvi(baz_tvi, TviType::LITE)); + // Qux + ICING_ASSERT_OK_AND_ASSIGN( + uint32_t qux_tvi, + lite_index->InsertTerm("qux", TermMatchType::PREFIX, kNamespace0)); + ICING_ASSERT_OK_AND_ASSIGN(uint32_t qux_term_id, + 
term_id_codec_->EncodeTvi(qux_tvi, TviType::LITE)); + + // Add 14 hits and make sure that termIds are added in unsorted order. + // Documents should be inserted in order as new incoming hits should have + // larger document ids. + ICING_ASSERT_OK(lite_index->AddHit(foo_term_id, doc0_hit0)); + ICING_ASSERT_OK(lite_index->AddHit(bar_term_id, doc0_hit1)); + ICING_ASSERT_OK(lite_index->AddHit(baz_term_id, doc0_hit2)); + ICING_ASSERT_OK(lite_index->AddHit(qux_term_id, doc0_hit3)); + ICING_ASSERT_OK(lite_index->AddHit(foo_term_id, doc1_hit0)); + ICING_ASSERT_OK(lite_index->AddHit(bar_term_id, doc1_hit1)); + ICING_ASSERT_OK(lite_index->AddHit(foo_term_id, doc1_hit2)); + ICING_ASSERT_OK(lite_index->AddHit(bar_term_id, doc1_hit3)); + ICING_ASSERT_OK(lite_index->AddHit(foo_term_id, doc2_hit0)); + ICING_ASSERT_OK(lite_index->AddHit(baz_term_id, doc2_hit1)); + ICING_ASSERT_OK(lite_index->AddHit(qux_term_id, doc2_hit2)); + ICING_ASSERT_OK(lite_index->AddHit(foo_term_id, doc2_hit3)); + ICING_ASSERT_OK(lite_index->AddHit(foo_term_id, doc3_hit0)); + ICING_ASSERT_OK(lite_index->AddHit(baz_term_id, doc3_hit1)); + // Verify that the HitBuffer has not been sorted. + EXPECT_THAT(lite_index->HasUnsortedHitsExceedingSortThreshold(), IsTrue()); + + // We now have the following in the hit buffer: + // : {(docId, sectionId)...} + // foo: {(0, 0); (1, 0); (1, 1); (2, 0); (2, 2); (3, 0)} + // bar: {(0, 0); (1, 0); (1, 2)} + // baz: {(0, 1); (2, 0); (3, 0)} + // quz: {(0, 2); (2, 1)} + + // Search over the HitBuffer. + std::vector hits1; + lite_index->FetchHits( + foo_term_id, kSectionIdMaskAll, + /*only_from_prefix_sections=*/false, + SuggestionScoringSpecProto::SuggestionRankingStrategy::DOCUMENT_COUNT, + /*namespace_checker=*/nullptr, &hits1); + EXPECT_THAT(hits1, SizeIs(4)); + // Check that hits are retrieved in descending order of docIds. 
+ EXPECT_THAT(hits1[0].document_id(), Eq(3)); + EXPECT_THAT(hits1[0].hit_section_ids_mask(), Eq(0b1)); + EXPECT_THAT(hits1[1].document_id(), Eq(2)); + EXPECT_THAT(hits1[1].hit_section_ids_mask(), Eq(0b101)); + EXPECT_THAT(hits1[2].document_id(), Eq(1)); + EXPECT_THAT(hits1[2].hit_section_ids_mask(), Eq(0b11)); + EXPECT_THAT(hits1[3].document_id(), Eq(0)); + EXPECT_THAT(hits1[3].hit_section_ids_mask(), Eq(0b1)); + + std::vector hits2; + AlwaysFalseSuggestionResultCheckerImpl always_false_suggestion_result_checker; + lite_index->FetchHits( + foo_term_id, kSectionIdMaskAll, + /*only_from_prefix_sections=*/false, + SuggestionScoringSpecProto::SuggestionRankingStrategy::DOCUMENT_COUNT, + &always_false_suggestion_result_checker, &hits2); + // Check that no hits are returned because they get skipped by the namespace + // checker. + EXPECT_THAT(hits2, IsEmpty()); + + std::vector hits3; + lite_index->FetchHits( + bar_term_id, 0b1, + /*only_from_prefix_sections=*/false, + SuggestionScoringSpecProto::SuggestionRankingStrategy::DOCUMENT_COUNT, + /*namespace_checker=*/nullptr, &hits3); + EXPECT_THAT(hits3, SizeIs(2)); + // Check fetching hits with SectionIdMask. + EXPECT_THAT(hits3[0].document_id(), Eq(1)); + EXPECT_THAT(hits3[1].hit_section_ids_mask(), Eq(0b1)); + EXPECT_THAT(hits3[1].document_id(), Eq(0)); + EXPECT_THAT(hits3[1].hit_section_ids_mask(), Eq(0b1)); + + // Check that the HitBuffer is sorted after the query call. + EXPECT_THAT(lite_index->HasUnsortedHitsExceedingSortThreshold(), IsFalse()); +} + +TEST_F( + LiteIndexTest, + LiteIndexFetchHits_sortAtIndexing_unsortedHitsExceedingSortAtIndexThreshold) { + // Set up LiteIndex and TermIdCodec + std::string lite_index_file_name = index_dir_ + "/test_file.lite-idx.index"; + // At 64 bytes the unsorted tail can contain a max of 8 TermHitPairs. 
+ LiteIndex::Options options(lite_index_file_name, + /*hit_buffer_want_merge_bytes=*/1024 * 1024, + /*hit_buffer_sort_at_indexing=*/true, + /*hit_buffer_sort_threshold_bytes=*/64); + ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr lite_index, + LiteIndex::Create(options, &icing_filesystem_)); + ICING_ASSERT_OK_AND_ASSIGN( + term_id_codec_, + TermIdCodec::Create( + IcingDynamicTrie::max_value_index(IcingDynamicTrie::Options()), + IcingDynamicTrie::max_value_index(options.lexicon_options))); + + // Create 4 hits for docs 0-2, and 2 hits for doc 3 -- 14 in total + // Doc 0 + Hit doc0_hit0(/*section_id=*/0, /*document_id=*/0, Hit::kDefaultTermFrequency, + /*is_in_prefix_section=*/false); + Hit doc0_hit1(/*section_id=*/0, /*document_id=*/0, Hit::kDefaultTermFrequency, + /*is_in_prefix_section=*/false); + Hit doc0_hit2(/*section_id=*/1, /*document_id=*/0, Hit::kDefaultTermFrequency, + /*is_in_prefix_section=*/false); + Hit doc0_hit3(/*section_id=*/2, /*document_id=*/0, Hit::kDefaultTermFrequency, + /*is_in_prefix_section=*/false); + // Doc 1 + Hit doc1_hit0(/*section_id=*/0, /*document_id=*/1, Hit::kDefaultTermFrequency, + /*is_in_prefix_section=*/false); + Hit doc1_hit1(/*section_id=*/0, /*document_id=*/1, Hit::kDefaultTermFrequency, + /*is_in_prefix_section=*/false); + Hit doc1_hit2(/*section_id=*/1, /*document_id=*/1, Hit::kDefaultTermFrequency, + /*is_in_prefix_section=*/false); + Hit doc1_hit3(/*section_id=*/2, /*document_id=*/1, Hit::kDefaultTermFrequency, + /*is_in_prefix_section=*/false); + // Doc 2 + Hit doc2_hit0(/*section_id=*/0, /*document_id=*/2, Hit::kDefaultTermFrequency, + /*is_in_prefix_section=*/false); + Hit doc2_hit1(/*section_id=*/0, /*document_id=*/2, Hit::kDefaultTermFrequency, + /*is_in_prefix_section=*/false); + Hit doc2_hit2(/*section_id=*/1, /*document_id=*/2, Hit::kDefaultTermFrequency, + /*is_in_prefix_section=*/false); + Hit doc2_hit3(/*section_id=*/2, /*document_id=*/2, Hit::kDefaultTermFrequency, + /*is_in_prefix_section=*/false); + // Doc 3 + 
Hit doc3_hit0(/*section_id=*/0, /*document_id=*/3, Hit::kDefaultTermFrequency, + /*is_in_prefix_section=*/false); + Hit doc3_hit1(/*section_id=*/0, /*document_id=*/3, Hit::kDefaultTermFrequency, + /*is_in_prefix_section=*/false); + Hit doc3_hit2(/*section_id=*/1, /*document_id=*/3, Hit::kDefaultTermFrequency, + /*is_in_prefix_section=*/false); + Hit doc3_hit3(/*section_id=*/2, /*document_id=*/3, Hit::kDefaultTermFrequency, + /*is_in_prefix_section=*/false); + // Doc 4 + Hit doc4_hit0(/*section_id=*/0, /*document_id=*/4, Hit::kDefaultTermFrequency, + /*is_in_prefix_section=*/false); + Hit doc4_hit1(/*section_id=*/0, /*document_id=*/4, Hit::kDefaultTermFrequency, + /*is_in_prefix_section=*/false); + Hit doc4_hit2(/*section_id=*/1, /*document_id=*/4, Hit::kDefaultTermFrequency, + /*is_in_prefix_section=*/false); + Hit doc4_hit3(/*section_id=*/2, /*document_id=*/4, Hit::kDefaultTermFrequency, + /*is_in_prefix_section=*/false); + + // Create terms + // Foo + ICING_ASSERT_OK_AND_ASSIGN( + uint32_t foo_tvi, + lite_index->InsertTerm("foo", TermMatchType::EXACT_ONLY, kNamespace0)); + ICING_ASSERT_OK_AND_ASSIGN(uint32_t foo_term_id, + term_id_codec_->EncodeTvi(foo_tvi, TviType::LITE)); + // Bar + ICING_ASSERT_OK_AND_ASSIGN( + uint32_t bar_tvi, + lite_index->InsertTerm("bar", TermMatchType::PREFIX, kNamespace0)); + ICING_ASSERT_OK_AND_ASSIGN(uint32_t bar_term_id, + term_id_codec_->EncodeTvi(bar_tvi, TviType::LITE)); + // Baz + ICING_ASSERT_OK_AND_ASSIGN( + uint32_t baz_tvi, + lite_index->InsertTerm("baz", TermMatchType::PREFIX, kNamespace0)); + ICING_ASSERT_OK_AND_ASSIGN(uint32_t baz_term_id, + term_id_codec_->EncodeTvi(baz_tvi, TviType::LITE)); + // Qux + ICING_ASSERT_OK_AND_ASSIGN( + uint32_t qux_tvi, + lite_index->InsertTerm("qux", TermMatchType::PREFIX, kNamespace0)); + ICING_ASSERT_OK_AND_ASSIGN(uint32_t qux_term_id, + term_id_codec_->EncodeTvi(qux_tvi, TviType::LITE)); + + // Add hits and make sure that termIds are added in unsorted order. 
+ // Documents should be inserted in order as new incoming hits should have + // larger document ids. + ICING_ASSERT_OK(lite_index->AddHit(foo_term_id, doc0_hit0)); + ICING_ASSERT_OK(lite_index->AddHit(bar_term_id, doc0_hit1)); + ICING_ASSERT_OK(lite_index->AddHit(baz_term_id, doc0_hit2)); + ICING_ASSERT_OK(lite_index->AddHit(qux_term_id, doc0_hit3)); + ICING_ASSERT_OK(lite_index->AddHit(foo_term_id, doc1_hit0)); + ICING_ASSERT_OK(lite_index->AddHit(bar_term_id, doc1_hit1)); + ICING_ASSERT_OK(lite_index->AddHit(foo_term_id, doc1_hit2)); + ICING_ASSERT_OK(lite_index->AddHit(bar_term_id, doc1_hit3)); + // Adding 8 hits exceeds the sort threshold. However when sort_at_indexing is + // enabled, sorting is done in the string-section-indexing-handler rather than + // AddHit() itself, we need to invoke SortHits() manually. + EXPECT_THAT(lite_index->HasUnsortedHitsExceedingSortThreshold(), IsTrue()); + lite_index->SortHits(); + // Check that the HitBuffer is sorted. + ASSERT_THAT(lite_index->PersistToDisk(), IsOk()); + LiteIndex_HeaderImpl::HeaderData header_data; + ASSERT_TRUE(filesystem_.PRead((lite_index_file_name + "hb").c_str(), + &header_data, sizeof(header_data), + LiteIndex::kHeaderFileOffset)); + EXPECT_THAT(header_data.cur_size - header_data.searchable_end, Eq(0)); + + // Add 12 more hits so that sort threshold is exceeded again. 
+ ICING_ASSERT_OK(lite_index->AddHit(foo_term_id, doc2_hit0)); + ICING_ASSERT_OK(lite_index->AddHit(baz_term_id, doc2_hit1)); + ICING_ASSERT_OK(lite_index->AddHit(qux_term_id, doc2_hit2)); + ICING_ASSERT_OK(lite_index->AddHit(foo_term_id, doc2_hit3)); + ICING_ASSERT_OK(lite_index->AddHit(foo_term_id, doc3_hit0)); + ICING_ASSERT_OK(lite_index->AddHit(baz_term_id, doc3_hit1)); + ICING_ASSERT_OK(lite_index->AddHit(foo_term_id, doc3_hit2)); + ICING_ASSERT_OK(lite_index->AddHit(bar_term_id, doc3_hit3)); + ICING_ASSERT_OK(lite_index->AddHit(baz_term_id, doc4_hit0)); + ICING_ASSERT_OK(lite_index->AddHit(qux_term_id, doc4_hit1)); + ICING_ASSERT_OK(lite_index->AddHit(foo_term_id, doc4_hit2)); + ICING_ASSERT_OK(lite_index->AddHit(bar_term_id, doc4_hit3)); + + // Adding these hits exceeds the sort threshold. However when sort_at_indexing + // is enabled, sorting is done in the string-section-indexing-handler rather + // than AddHit() itself. + EXPECT_THAT(lite_index->HasUnsortedHitsExceedingSortThreshold(), IsTrue()); + + // We now have the following in the hit buffer: + // : {(docId, sectionId)...} + // foo: {(0, 0); (1, 0); (1, 1); (2, 0); (2, 2); (3, 0); (3, 1); (4, 1)} + // bar: {(0, 0); (1, 0); (1, 2); (3, 2); (4, 2)} + // baz: {(0, 1); (2, 0); (3, 0); (4, 0)} + // quz: {(0, 2); (2, 1); (4, 0)} + + // Search over the HitBuffer. + std::vector hits1; + lite_index->FetchHits( + foo_term_id, kSectionIdMaskAll, + /*only_from_prefix_sections=*/false, + SuggestionScoringSpecProto::SuggestionRankingStrategy::DOCUMENT_COUNT, + /*namespace_checker=*/nullptr, &hits1); + EXPECT_THAT(hits1, SizeIs(5)); + // Check that hits are retrieved in descending order of docIds. 
+ EXPECT_THAT(hits1[0].document_id(), Eq(4)); + EXPECT_THAT(hits1[0].hit_section_ids_mask(), Eq(0b10)); + EXPECT_THAT(hits1[1].document_id(), Eq(3)); + EXPECT_THAT(hits1[1].hit_section_ids_mask(), Eq(0b11)); + EXPECT_THAT(hits1[2].document_id(), Eq(2)); + EXPECT_THAT(hits1[2].hit_section_ids_mask(), Eq(0b101)); + EXPECT_THAT(hits1[3].document_id(), Eq(1)); + EXPECT_THAT(hits1[3].hit_section_ids_mask(), Eq(0b11)); + EXPECT_THAT(hits1[4].document_id(), Eq(0)); + EXPECT_THAT(hits1[4].hit_section_ids_mask(), Eq(0b1)); + + std::vector hits2; + AlwaysFalseSuggestionResultCheckerImpl always_false_suggestion_result_checker; + lite_index->FetchHits( foo_term_id, kSectionIdMaskAll, /*only_from_prefix_sections=*/false, SuggestionScoringSpecProto::SuggestionRankingStrategy::DOCUMENT_COUNT, @@ -104,13 +575,119 @@ TEST_F(LiteIndexTest, LiteIndexAppendHits) { // Check that no hits are returned because they get skipped by the namespace // checker. EXPECT_THAT(hits2, IsEmpty()); + + std::vector hits3; + lite_index->FetchHits( + bar_term_id, 0b1, + /*only_from_prefix_sections=*/false, + SuggestionScoringSpecProto::SuggestionRankingStrategy::DOCUMENT_COUNT, + /*namespace_checker=*/nullptr, &hits3); + EXPECT_THAT(hits3, SizeIs(2)); + // Check fetching hits with SectionIdMask. + EXPECT_THAT(hits3[0].document_id(), Eq(1)); + EXPECT_THAT(hits3[1].hit_section_ids_mask(), Eq(0b1)); + EXPECT_THAT(hits3[1].document_id(), Eq(0)); + EXPECT_THAT(hits3[1].hit_section_ids_mask(), Eq(0b1)); + + // Check that the HitBuffer is sorted after the query call. FetchHits should + // sort before performing binary search if the HitBuffer unsorted size exceeds + // the sort threshold. Regardless of the sort_at_indexing config. 
+ EXPECT_THAT(lite_index->HasUnsortedHitsExceedingSortThreshold(), IsFalse()); + ASSERT_THAT(lite_index->PersistToDisk(), IsOk()); + ASSERT_TRUE(filesystem_.PRead((lite_index_file_name + "hb").c_str(), + &header_data, sizeof(header_data), + LiteIndex::kHeaderFileOffset)); + EXPECT_THAT(header_data.cur_size - header_data.searchable_end, Eq(0)); } TEST_F(LiteIndexTest, LiteIndexIterator) { + // Set up LiteIndex and TermIdCodec + std::string lite_index_file_name = index_dir_ + "/test_file.lite-idx.index"; + // At 64 bytes the unsorted tail can contain a max of 8 TermHitPairs. + LiteIndex::Options options(lite_index_file_name, + /*hit_buffer_want_merge_bytes=*/1024 * 1024, + /*hit_buffer_sort_at_indexing=*/true, + /*hit_buffer_sort_threshold_bytes=*/64); + ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr lite_index, + LiteIndex::Create(options, &icing_filesystem_)); + ICING_ASSERT_OK_AND_ASSIGN( + term_id_codec_, + TermIdCodec::Create( + IcingDynamicTrie::max_value_index(IcingDynamicTrie::Options()), + IcingDynamicTrie::max_value_index(options.lexicon_options))); + + const std::string term = "foo"; + ICING_ASSERT_OK_AND_ASSIGN( + uint32_t tvi, + lite_index->InsertTerm(term, TermMatchType::PREFIX, kNamespace0)); + ICING_ASSERT_OK_AND_ASSIGN(uint32_t foo_term_id, + term_id_codec_->EncodeTvi(tvi, TviType::LITE)); + Hit doc0_hit0(/*section_id=*/0, /*document_id=*/0, /*term_frequency=*/3, + /*is_in_prefix_section=*/false); + Hit doc0_hit1(/*section_id=*/1, /*document_id=*/0, /*term_frequency=*/5, + /*is_in_prefix_section=*/false); + SectionIdMask doc0_section_id_mask = 0b11; + std::unordered_map + expected_section_ids_tf_map0 = {{0, 3}, {1, 5}}; + ICING_ASSERT_OK(lite_index->AddHit(foo_term_id, doc0_hit0)); + ICING_ASSERT_OK(lite_index->AddHit(foo_term_id, doc0_hit1)); + + Hit doc1_hit1(/*section_id=*/1, /*document_id=*/1, /*term_frequency=*/7, + /*is_in_prefix_section=*/false); + Hit doc1_hit2(/*section_id=*/2, /*document_id=*/1, /*term_frequency=*/11, + 
/*is_in_prefix_section=*/false); + SectionIdMask doc1_section_id_mask = 0b110; + std::unordered_map + expected_section_ids_tf_map1 = {{1, 7}, {2, 11}}; + ICING_ASSERT_OK(lite_index->AddHit(foo_term_id, doc1_hit1)); + ICING_ASSERT_OK(lite_index->AddHit(foo_term_id, doc1_hit2)); + + std::unique_ptr iter = + std::make_unique( + term_id_codec_.get(), lite_index.get(), term, /*term_start_index=*/0, + /*unnormalized_term_length=*/0, kSectionIdMaskAll, + /*need_hit_term_frequency=*/true); + + ASSERT_THAT(iter->Advance(), IsOk()); + EXPECT_THAT(iter->doc_hit_info().document_id(), Eq(1)); + EXPECT_THAT(iter->doc_hit_info().hit_section_ids_mask(), + Eq(doc1_section_id_mask)); + + std::vector matched_terms_stats; + iter->PopulateMatchedTermsStats(&matched_terms_stats); + EXPECT_THAT(matched_terms_stats, ElementsAre(EqualsTermMatchInfo( + term, expected_section_ids_tf_map1))); + + ASSERT_THAT(iter->Advance(), IsOk()); + EXPECT_THAT(iter->doc_hit_info().document_id(), Eq(0)); + EXPECT_THAT(iter->doc_hit_info().hit_section_ids_mask(), + Eq(doc0_section_id_mask)); + matched_terms_stats.clear(); + iter->PopulateMatchedTermsStats(&matched_terms_stats); + EXPECT_THAT(matched_terms_stats, ElementsAre(EqualsTermMatchInfo( + term, expected_section_ids_tf_map0))); +} + +TEST_F(LiteIndexTest, LiteIndexIterator_sortAtIndexingDisabled) { + // Set up LiteIndex and TermIdCodec + std::string lite_index_file_name = index_dir_ + "/test_file.lite-idx.index"; + // At 64 bytes the unsorted tail can contain a max of 8 TermHitPairs. 
+ LiteIndex::Options options(lite_index_file_name, + /*hit_buffer_want_merge_bytes=*/1024 * 1024, + /*hit_buffer_sort_at_indexing=*/false, + /*hit_buffer_sort_threshold_bytes=*/64); + ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr lite_index, + LiteIndex::Create(options, &icing_filesystem_)); + ICING_ASSERT_OK_AND_ASSIGN( + term_id_codec_, + TermIdCodec::Create( + IcingDynamicTrie::max_value_index(IcingDynamicTrie::Options()), + IcingDynamicTrie::max_value_index(options.lexicon_options))); + const std::string term = "foo"; ICING_ASSERT_OK_AND_ASSIGN( uint32_t tvi, - lite_index_->InsertTerm(term, TermMatchType::PREFIX, kNamespace0)); + lite_index->InsertTerm(term, TermMatchType::PREFIX, kNamespace0)); ICING_ASSERT_OK_AND_ASSIGN(uint32_t foo_term_id, term_id_codec_->EncodeTvi(tvi, TviType::LITE)); Hit doc0_hit0(/*section_id=*/0, /*document_id=*/0, /*term_frequency=*/3, @@ -120,8 +697,8 @@ TEST_F(LiteIndexTest, LiteIndexIterator) { SectionIdMask doc0_section_id_mask = 0b11; std::unordered_map expected_section_ids_tf_map0 = {{0, 3}, {1, 5}}; - ICING_ASSERT_OK(lite_index_->AddHit(foo_term_id, doc0_hit0)); - ICING_ASSERT_OK(lite_index_->AddHit(foo_term_id, doc0_hit1)); + ICING_ASSERT_OK(lite_index->AddHit(foo_term_id, doc0_hit0)); + ICING_ASSERT_OK(lite_index->AddHit(foo_term_id, doc0_hit1)); Hit doc1_hit1(/*section_id=*/1, /*document_id=*/1, /*term_frequency=*/7, /*is_in_prefix_section=*/false); @@ -130,12 +707,12 @@ TEST_F(LiteIndexTest, LiteIndexIterator) { SectionIdMask doc1_section_id_mask = 0b110; std::unordered_map expected_section_ids_tf_map1 = {{1, 7}, {2, 11}}; - ICING_ASSERT_OK(lite_index_->AddHit(foo_term_id, doc1_hit1)); - ICING_ASSERT_OK(lite_index_->AddHit(foo_term_id, doc1_hit2)); + ICING_ASSERT_OK(lite_index->AddHit(foo_term_id, doc1_hit1)); + ICING_ASSERT_OK(lite_index->AddHit(foo_term_id, doc1_hit2)); std::unique_ptr iter = std::make_unique( - term_id_codec_.get(), lite_index_.get(), term, /*term_start_index=*/0, + term_id_codec_.get(), lite_index.get(), 
term, /*term_start_index=*/0, /*unnormalized_term_length=*/0, kSectionIdMaskAll, /*need_hit_term_frequency=*/true); diff --git a/icing/index/lite/lite-index_thread-safety_test.cc b/icing/index/lite/lite-index_thread-safety_test.cc index 7711f92..53aa6cd 100644 --- a/icing/index/lite/lite-index_thread-safety_test.cc +++ b/icing/index/lite/lite-index_thread-safety_test.cc @@ -19,12 +19,9 @@ #include "gmock/gmock.h" #include "gtest/gtest.h" -#include "icing/index/lite/doc-hit-info-iterator-term-lite.h" #include "icing/index/lite/lite-index.h" #include "icing/index/term-id-codec.h" #include "icing/schema/section.h" -#include "icing/store/suggestion-result-checker.h" -#include "icing/testing/always-false-suggestion-result-checker-impl.h" #include "icing/testing/common-matchers.h" #include "icing/testing/tmp-directory.h" @@ -52,7 +49,9 @@ class LiteIndexThreadSafetyTest : public testing::Test { std::string lite_index_file_name = index_dir_ + "/test_file.lite-idx-thread-safety.index"; LiteIndex::Options options(lite_index_file_name, - /*hit_buffer_want_merge_bytes=*/1024 * 1024); + /*hit_buffer_want_merge_bytes=*/1024 * 1024, + /*hit_buffer_sort_at_indexing=*/true, + /*hit_buffer_sort_threshold_bytes=*/64); ICING_ASSERT_OK_AND_ASSIGN(lite_index_, LiteIndex::Create(options, &icing_filesystem_)); diff --git a/icing/index/lite/term-id-hit-pair.h b/icing/index/lite/term-id-hit-pair.h index 61ec502..82bd010 100644 --- a/icing/index/lite/term-id-hit-pair.h +++ b/icing/index/lite/term-id-hit-pair.h @@ -73,6 +73,8 @@ class TermIdHitPair { return value_ == rhs.value_; } + bool operator<(const TermIdHitPair& rhs) const { return value_ < rhs.value_; } + private: Value value_; }; diff --git a/icing/index/main/main-index-merger_test.cc b/icing/index/main/main-index-merger_test.cc index 8a2f691..37e14fc 100644 --- a/icing/index/main/main-index-merger_test.cc +++ b/icing/index/main/main-index-merger_test.cc @@ -45,7 +45,9 @@ class MainIndexMergerTest : public testing::Test { std::string 
lite_index_file_name = index_dir_ + "/test_file.lite-idx.index"; LiteIndex::Options options(lite_index_file_name, - /*hit_buffer_want_merge_bytes=*/1024 * 1024); + /*hit_buffer_want_merge_bytes=*/1024 * 1024, + /*hit_buffer_sort_at_indexing=*/true, + /*hit_buffer_sort_threshold_bytes=*/1024 * 8); ICING_ASSERT_OK_AND_ASSIGN(lite_index_, LiteIndex::Create(options, &icing_filesystem_)); diff --git a/icing/index/main/main-index_test.cc b/icing/index/main/main-index_test.cc index c59a3c8..fa96e6c 100644 --- a/icing/index/main/main-index_test.cc +++ b/icing/index/main/main-index_test.cc @@ -91,7 +91,9 @@ class MainIndexTest : public testing::Test { std::string lite_index_file_name = index_dir_ + "/test_file.lite-idx.index"; LiteIndex::Options options(lite_index_file_name, - /*hit_buffer_want_merge_bytes=*/1024 * 1024); + /*hit_buffer_want_merge_bytes=*/1024 * 1024, + /*hit_buffer_sort_at_indexing=*/true, + /*hit_buffer_sort_threshold_bytes=*/1024 * 8); ICING_ASSERT_OK_AND_ASSIGN(lite_index_, LiteIndex::Create(options, &icing_filesystem_)); @@ -362,7 +364,9 @@ TEST_F(MainIndexTest, MergeIndexToPreexisting) { // - Doc4 {"four", "foul" is_in_prefix_section=true} std::string lite_index_file_name2 = index_dir_ + "/test_file.lite-idx.index2"; LiteIndex::Options options(lite_index_file_name2, - /*hit_buffer_want_merge_bytes=*/1024 * 1024); + /*hit_buffer_want_merge_bytes=*/1024 * 1024, + /*hit_buffer_sort_at_indexing=*/true, + /*hit_buffer_sort_threshold_bytes=*/1024 * 8); ICING_ASSERT_OK_AND_ASSIGN(lite_index_, LiteIndex::Create(options, &icing_filesystem_)); ICING_ASSERT_OK_AND_ASSIGN( @@ -632,7 +636,9 @@ TEST_F(MainIndexTest, MergeIndexBackfilling) { // - Doc1 {"foot" is_in_prefix_section=false} std::string lite_index_file_name2 = index_dir_ + "/test_file.lite-idx.index2"; LiteIndex::Options options(lite_index_file_name2, - /*hit_buffer_want_merge_bytes=*/1024 * 1024); + /*hit_buffer_want_merge_bytes=*/1024 * 1024, + /*hit_buffer_sort_at_indexing=*/true, + 
/*hit_buffer_sort_threshold_bytes=*/1024 * 8); ICING_ASSERT_OK_AND_ASSIGN(lite_index_, LiteIndex::Create(options, &icing_filesystem_)); ICING_ASSERT_OK_AND_ASSIGN( diff --git a/icing/index/string-section-indexing-handler.cc b/icing/index/string-section-indexing-handler.cc index 69b8889..f5e06ad 100644 --- a/icing/index/string-section-indexing-handler.cc +++ b/icing/index/string-section-indexing-handler.cc @@ -122,6 +122,17 @@ libtextclassifier3::Status StringSectionIndexingHandler::Handle( } } + // Check and sort the LiteIndex HitBuffer if we're successful. + if (status.ok() && index_.LiteIndexNeedSort()) { + std::unique_ptr sort_timer = clock_.GetNewTimer(); + index_.SortLiteIndex(); + + if (put_document_stats != nullptr) { + put_document_stats->set_lite_index_sort_latency_ms( + sort_timer->GetElapsedMilliseconds()); + } + } + if (put_document_stats != nullptr) { put_document_stats->set_term_index_latency_ms( index_timer->GetElapsedMilliseconds()); diff --git a/icing/index/string-section-indexing-handler_test.cc b/icing/index/string-section-indexing-handler_test.cc new file mode 100644 index 0000000..2c7f5e3 --- /dev/null +++ b/icing/index/string-section-indexing-handler_test.cc @@ -0,0 +1,587 @@ +// Copyright (C) 2023 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "icing/index/string-section-indexing-handler.h" + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "icing/text_classifier/lib3/utils/base/status.h" +#include "gmock/gmock.h" +#include "gtest/gtest.h" +#include "icing/document-builder.h" +#include "icing/file/filesystem.h" +#include "icing/file/portable-file-backed-proto-log.h" +#include "icing/index/hit/doc-hit-info.h" +#include "icing/index/hit/hit.h" +#include "icing/index/index.h" +#include "icing/index/iterator/doc-hit-info-iterator-test-util.h" +#include "icing/index/iterator/doc-hit-info-iterator.h" +#include "icing/legacy/index/icing-filesystem.h" +#include "icing/portable/platform.h" +#include "icing/proto/document.pb.h" +#include "icing/proto/document_wrapper.pb.h" +#include "icing/proto/schema.pb.h" +#include "icing/proto/term.pb.h" +#include "icing/schema-builder.h" +#include "icing/schema/schema-store.h" +#include "icing/schema/section.h" +#include "icing/store/document-id.h" +#include "icing/store/document-store.h" +#include "icing/testing/common-matchers.h" +#include "icing/testing/fake-clock.h" +#include "icing/testing/icu-data-file-helper.h" +#include "icing/testing/test-data.h" +#include "icing/testing/tmp-directory.h" +#include "icing/tokenization/language-segmenter-factory.h" +#include "icing/tokenization/language-segmenter.h" +#include "icing/transform/normalizer-factory.h" +#include "icing/transform/normalizer.h" +#include "icing/util/tokenized-document.h" +#include "unicode/uloc.h" + +namespace icing { +namespace lib { + +namespace { + +using ::testing::ElementsAre; +using ::testing::Eq; +using ::testing::IsEmpty; +using ::testing::IsFalse; +using ::testing::IsTrue; +using ::testing::Test; + +// Schema type with indexable properties and section Id. +// Section Id is determined by the lexicographical order of indexable property +// path. 
+// Section id = 0: body +// Section id = 1: title +constexpr std::string_view kFakeType = "FakeType"; +constexpr std::string_view kPropertyBody = "body"; +constexpr std::string_view kPropertyTitle = "title"; + +constexpr SectionId kSectionIdBody = 0; +constexpr SectionId kSectionIdTitle = 1; + +// Schema type with nested indexable properties and section Id. +// Section id = 0: "name" +// Section id = 1: "nested.body" +// Section id = 3: "nested.title" +// Section id = 4: "subject" +constexpr std::string_view kNestedType = "NestedType"; +constexpr std::string_view kPropertyName = "name"; +constexpr std::string_view kPropertyNestedDoc = "nested"; +constexpr std::string_view kPropertySubject = "subject"; + +constexpr SectionId kSectionIdNestedBody = 1; + +class StringSectionIndexingHandlerTest : public Test { + protected: + void SetUp() override { + if (!IsCfStringTokenization() && !IsReverseJniTokenization()) { + ICING_ASSERT_OK( + // File generated via icu_data_file rule in //icing/BUILD. 
+ icu_data_file_helper::SetUpICUDataFile( + GetTestFilePath("icing/icu.dat"))); + } + + base_dir_ = GetTestTempDir() + "/icing_test"; + ASSERT_THAT(filesystem_.CreateDirectoryRecursively(base_dir_.c_str()), + IsTrue()); + + index_dir_ = base_dir_ + "/index"; + schema_store_dir_ = base_dir_ + "/schema_store"; + document_store_dir_ = base_dir_ + "/document_store"; + + language_segmenter_factory::SegmenterOptions segmenter_options(ULOC_US); + ICING_ASSERT_OK_AND_ASSIGN( + lang_segmenter_, + language_segmenter_factory::Create(std::move(segmenter_options))); + + ICING_ASSERT_OK_AND_ASSIGN( + normalizer_, + normalizer_factory::Create( + /*max_term_byte_size=*/std::numeric_limits::max())); + + ASSERT_THAT( + filesystem_.CreateDirectoryRecursively(schema_store_dir_.c_str()), + IsTrue()); + ICING_ASSERT_OK_AND_ASSIGN( + schema_store_, + SchemaStore::Create(&filesystem_, schema_store_dir_, &fake_clock_)); + SchemaProto schema = + SchemaBuilder() + .AddType( + SchemaTypeConfigBuilder() + .SetType(kFakeType) + .AddProperty(PropertyConfigBuilder() + .SetName(kPropertyTitle) + .SetDataTypeString(TERM_MATCH_PREFIX, + TOKENIZER_PLAIN) + .SetCardinality(CARDINALITY_OPTIONAL)) + .AddProperty(PropertyConfigBuilder() + .SetName(kPropertyBody) + .SetDataTypeString(TERM_MATCH_EXACT, + TOKENIZER_PLAIN) + .SetCardinality(CARDINALITY_OPTIONAL))) + .AddType( + SchemaTypeConfigBuilder() + .SetType(kNestedType) + .AddProperty( + PropertyConfigBuilder() + .SetName(kPropertyNestedDoc) + .SetDataTypeDocument( + kFakeType, /*index_nested_properties=*/true) + .SetCardinality(CARDINALITY_OPTIONAL)) + .AddProperty(PropertyConfigBuilder() + .SetName(kPropertySubject) + .SetDataTypeString(TERM_MATCH_EXACT, + TOKENIZER_PLAIN) + .SetCardinality(CARDINALITY_OPTIONAL)) + .AddProperty(PropertyConfigBuilder() + .SetName(kPropertyName) + .SetDataTypeString(TERM_MATCH_EXACT, + TOKENIZER_PLAIN) + .SetCardinality(CARDINALITY_OPTIONAL))) + .Build(); + ICING_ASSERT_OK(schema_store_->SetSchema( + schema, 
/*ignore_errors_and_delete_documents=*/false, + /*allow_circular_schema_definitions=*/false)); + + ASSERT_TRUE( + filesystem_.CreateDirectoryRecursively(document_store_dir_.c_str())); + ICING_ASSERT_OK_AND_ASSIGN( + DocumentStore::CreateResult doc_store_create_result, + DocumentStore::Create(&filesystem_, document_store_dir_, &fake_clock_, + schema_store_.get(), + /*force_recovery_and_revalidate_documents=*/false, + /*namespace_id_fingerprint=*/false, + /*pre_mapping_fbv=*/false, + /*use_persistent_hash_map=*/false, + PortableFileBackedProtoLog< + DocumentWrapper>::kDeflateCompressionLevel, + /*initialize_stats=*/nullptr)); + document_store_ = std::move(doc_store_create_result.document_store); + } + + void TearDown() override { + document_store_.reset(); + schema_store_.reset(); + normalizer_.reset(); + lang_segmenter_.reset(); + + filesystem_.DeleteDirectoryRecursively(base_dir_.c_str()); + } + + Filesystem filesystem_; + IcingFilesystem icing_filesystem_; + FakeClock fake_clock_; + std::string base_dir_; + std::string index_dir_; + std::string schema_store_dir_; + std::string document_store_dir_; + + std::unique_ptr lang_segmenter_; + std::unique_ptr normalizer_; + std::unique_ptr schema_store_; + std::unique_ptr document_store_; +}; + +std::vector GetHits(std::unique_ptr iterator) { + std::vector infos; + while (iterator->Advance().ok()) { + infos.push_back(iterator->doc_hit_info()); + } + return infos; +} + +std::vector GetHitsWithTermFrequency( + std::unique_ptr iterator) { + std::vector infos; + while (iterator->Advance().ok()) { + std::vector matched_terms_stats; + iterator->PopulateMatchedTermsStats(&matched_terms_stats); + for (const TermMatchInfo& term_match_info : matched_terms_stats) { + infos.push_back(DocHitInfoTermFrequencyPair( + iterator->doc_hit_info(), term_match_info.term_frequencies)); + } + } + return infos; +} + +TEST_F(StringSectionIndexingHandlerTest, + HandleIntoLiteIndex_sortInIndexingNotTriggered) { + Index::Options options(index_dir_, 
/*index_merge_size=*/1024 * 1024, + /*lite_index_sort_at_indexing=*/true, + /*lite_index_sort_size=*/1024 * 8); + ICING_ASSERT_OK_AND_ASSIGN( + std::unique_ptr index, + Index::Create(options, &filesystem_, &icing_filesystem_)); + + DocumentProto document = + DocumentBuilder() + .SetKey("icing", "fake_type/1") + .SetSchema(std::string(kFakeType)) + .AddStringProperty(std::string(kPropertyTitle), "foo") + .AddStringProperty(std::string(kPropertyBody), "foo bar baz") + .Build(); + + ICING_ASSERT_OK_AND_ASSIGN( + TokenizedDocument tokenized_document, + TokenizedDocument::Create(schema_store_.get(), lang_segmenter_.get(), + std::move(document))); + + ICING_ASSERT_OK_AND_ASSIGN( + DocumentId document_id, + document_store_->Put(tokenized_document.document())); + + EXPECT_THAT(index->last_added_document_id(), Eq(kInvalidDocumentId)); + + ICING_ASSERT_OK_AND_ASSIGN( + std::unique_ptr handler, + StringSectionIndexingHandler::Create(&fake_clock_, normalizer_.get(), + index.get())); + EXPECT_THAT( + handler->Handle(tokenized_document, document_id, /*recovery_mode=*/false, + /*put_document_stats=*/nullptr), + IsOk()); + + EXPECT_THAT(index->last_added_document_id(), Eq(document_id)); + + // Query 'foo' + ICING_ASSERT_OK_AND_ASSIGN( + std::unique_ptr itr, + index->GetIterator("foo", /*term_start_index=*/0, + /*unnormalized_term_length=*/0, kSectionIdMaskAll, + TermMatchType::EXACT_ONLY)); + std::vector hits = + GetHitsWithTermFrequency(std::move(itr)); + std::unordered_map expected_map{ + {kSectionIdTitle, 1}, {kSectionIdBody, 1}}; + EXPECT_THAT(hits, ElementsAre(EqualsDocHitInfoWithTermFrequency( + document_id, expected_map))); + + // Query 'foo' with sectionId mask that masks all results + ICING_ASSERT_OK_AND_ASSIGN( + itr, index->GetIterator("foo", /*term_start_index=*/0, + /*unnormalized_term_length=*/0, 1U << 2, + TermMatchType::EXACT_ONLY)); + EXPECT_THAT(GetHits(std::move(itr)), IsEmpty()); +} + +TEST_F(StringSectionIndexingHandlerTest, + 
HandleIntoLiteIndex_sortInIndexingTriggered) { + // Create the LiteIndex with a smaller sort threshold. At 64 bytes we sort the + // HitBuffer after inserting 8 hits + Index::Options options(index_dir_, + /*index_merge_size=*/1024 * 1024, + /*lite_index_sort_at_indexing=*/true, + /*lite_index_sort_size=*/64); + ICING_ASSERT_OK_AND_ASSIGN( + std::unique_ptr index, + Index::Create(options, &filesystem_, &icing_filesystem_)); + + DocumentProto document0 = + DocumentBuilder() + .SetKey("icing", "fake_type/0") + .SetSchema(std::string(kFakeType)) + .AddStringProperty(std::string(kPropertyTitle), "foo foo foo") + .AddStringProperty(std::string(kPropertyBody), "foo bar baz") + .Build(); + DocumentProto document1 = + DocumentBuilder() + .SetKey("icing", "fake_type/1") + .SetSchema(std::string(kFakeType)) + .AddStringProperty(std::string(kPropertyTitle), "bar baz baz") + .AddStringProperty(std::string(kPropertyBody), "foo foo baz") + .Build(); + DocumentProto document2 = + DocumentBuilder() + .SetKey("icing", "nested_type/0") + .SetSchema(std::string(kNestedType)) + .AddDocumentProperty(std::string(kPropertyNestedDoc), document1) + .AddStringProperty(std::string(kPropertyName), "qux") + .AddStringProperty(std::string(kPropertySubject), "bar bar") + .Build(); + + ICING_ASSERT_OK_AND_ASSIGN( + TokenizedDocument tokenized_document0, + TokenizedDocument::Create(schema_store_.get(), lang_segmenter_.get(), + std::move(document0))); + ICING_ASSERT_OK_AND_ASSIGN( + DocumentId document_id0, + document_store_->Put(tokenized_document0.document())); + + ICING_ASSERT_OK_AND_ASSIGN( + TokenizedDocument tokenized_document1, + TokenizedDocument::Create(schema_store_.get(), lang_segmenter_.get(), + std::move(document1))); + ICING_ASSERT_OK_AND_ASSIGN( + DocumentId document_id1, + document_store_->Put(tokenized_document1.document())); + + ICING_ASSERT_OK_AND_ASSIGN( + TokenizedDocument tokenized_document2, + TokenizedDocument::Create(schema_store_.get(), lang_segmenter_.get(), + 
std::move(document2))); + ICING_ASSERT_OK_AND_ASSIGN( + DocumentId document_id2, + document_store_->Put(tokenized_document2.document())); + EXPECT_THAT(index->last_added_document_id(), Eq(kInvalidDocumentId)); + + ICING_ASSERT_OK_AND_ASSIGN( + std::unique_ptr handler, + StringSectionIndexingHandler::Create(&fake_clock_, normalizer_.get(), + index.get())); + + // Handle doc0 and doc1. The LiteIndex should sort and merge after adding + // these + EXPECT_THAT(handler->Handle(tokenized_document0, document_id0, + /*recovery_mode=*/false, + /*put_document_stats=*/nullptr), + IsOk()); + EXPECT_THAT(handler->Handle(tokenized_document1, document_id1, + /*recovery_mode=*/false, + /*put_document_stats=*/nullptr), + IsOk()); + EXPECT_THAT(index->last_added_document_id(), Eq(document_id1)); + EXPECT_THAT(index->LiteIndexNeedSort(), IsFalse()); + + // Handle doc2. The LiteIndex should have an unsorted portion after adding + EXPECT_THAT(handler->Handle(tokenized_document2, document_id2, + /*recovery_mode=*/false, + /*put_document_stats=*/nullptr), + IsOk()); + EXPECT_THAT(index->last_added_document_id(), Eq(document_id2)); + + // Hits in the hit buffer: + // : {(docId, sectionId, term_freq)...} + // foo: {(0, kSectionIdTitle, 3); (0, kSectionIdBody, 1); + // (1, kSectionIdBody, 2); + // (2, kSectionIdNestedBody, 2)} + // bar: {(0, kSectionIdBody, 1); + // (1, kSectionIdTitle, 1); + // (2, kSectionIdNestedTitle, 1); (2, kSectionIdSubject, 2)} + // baz: {(0, kSectionIdBody, 1); + // (1, kSectionIdTitle, 2); (1, kSectionIdBody, 1), + // (2, kSectionIdNestedTitle, 2); (2, kSectionIdNestedBody, 1)} + // qux: {(2, kSectionIdName, 1)} + + // Query 'foo' + ICING_ASSERT_OK_AND_ASSIGN( + std::unique_ptr itr, + index->GetIterator("foo", /*term_start_index=*/0, + /*unnormalized_term_length=*/0, kSectionIdMaskAll, + TermMatchType::EXACT_ONLY)); + + // Advance the iterator and verify that we're returning hits in the correct + // order (i.e. 
in descending order of DocId) + ASSERT_THAT(itr->Advance(), IsOk()); + EXPECT_THAT(itr->doc_hit_info().document_id(), Eq(2)); + EXPECT_THAT(itr->doc_hit_info().hit_section_ids_mask(), + Eq(1U << kSectionIdNestedBody)); + std::vector matched_terms_stats; + std::unordered_map + expected_section_ids_tf_map2 = {{kSectionIdNestedBody, 2}}; + itr->PopulateMatchedTermsStats(&matched_terms_stats); + EXPECT_THAT(matched_terms_stats, ElementsAre(EqualsTermMatchInfo( + "foo", expected_section_ids_tf_map2))); + + ASSERT_THAT(itr->Advance(), IsOk()); + EXPECT_THAT(itr->doc_hit_info().document_id(), Eq(1)); + EXPECT_THAT(itr->doc_hit_info().hit_section_ids_mask(), + Eq(1U << kSectionIdBody)); + std::unordered_map + expected_section_ids_tf_map1 = {{kSectionIdBody, 2}}; + matched_terms_stats.clear(); + itr->PopulateMatchedTermsStats(&matched_terms_stats); + EXPECT_THAT(matched_terms_stats, ElementsAre(EqualsTermMatchInfo( + "foo", expected_section_ids_tf_map1))); + + ASSERT_THAT(itr->Advance(), IsOk()); + EXPECT_THAT(itr->doc_hit_info().document_id(), Eq(0)); + EXPECT_THAT(itr->doc_hit_info().hit_section_ids_mask(), + Eq(1U << kSectionIdTitle | 1U << kSectionIdBody)); + std::unordered_map + expected_section_ids_tf_map0 = {{kSectionIdTitle, 3}, + {kSectionIdBody, 1}}; + matched_terms_stats.clear(); + itr->PopulateMatchedTermsStats(&matched_terms_stats); + EXPECT_THAT(matched_terms_stats, ElementsAre(EqualsTermMatchInfo( + "foo", expected_section_ids_tf_map0))); +} + +TEST_F(StringSectionIndexingHandlerTest, + HandleIntoLiteIndex_enableSortInIndexing) { + // Create the LiteIndex with a smaller sort threshold. 
At 64 bytes we sort the + // HitBuffer after inserting 8 hits + Index::Options options(index_dir_, + /*index_merge_size=*/1024 * 1024, + /*lite_index_sort_at_indexing=*/false, + /*lite_index_sort_size=*/64); + ICING_ASSERT_OK_AND_ASSIGN( + std::unique_ptr index, + Index::Create(options, &filesystem_, &icing_filesystem_)); + + DocumentProto document0 = + DocumentBuilder() + .SetKey("icing", "fake_type/0") + .SetSchema(std::string(kFakeType)) + .AddStringProperty(std::string(kPropertyTitle), "foo foo foo") + .AddStringProperty(std::string(kPropertyBody), "foo bar baz") + .Build(); + DocumentProto document1 = + DocumentBuilder() + .SetKey("icing", "fake_type/1") + .SetSchema(std::string(kFakeType)) + .AddStringProperty(std::string(kPropertyTitle), "bar baz baz") + .AddStringProperty(std::string(kPropertyBody), "foo foo baz") + .Build(); + DocumentProto document2 = + DocumentBuilder() + .SetKey("icing", "nested_type/0") + .SetSchema(std::string(kNestedType)) + .AddDocumentProperty(std::string(kPropertyNestedDoc), document1) + .AddStringProperty(std::string(kPropertyName), "qux") + .AddStringProperty(std::string(kPropertySubject), "bar bar") + .Build(); + + ICING_ASSERT_OK_AND_ASSIGN( + TokenizedDocument tokenized_document0, + TokenizedDocument::Create(schema_store_.get(), lang_segmenter_.get(), + std::move(document0))); + ICING_ASSERT_OK_AND_ASSIGN( + DocumentId document_id0, + document_store_->Put(tokenized_document0.document())); + + ICING_ASSERT_OK_AND_ASSIGN( + TokenizedDocument tokenized_document1, + TokenizedDocument::Create(schema_store_.get(), lang_segmenter_.get(), + std::move(document1))); + ICING_ASSERT_OK_AND_ASSIGN( + DocumentId document_id1, + document_store_->Put(tokenized_document1.document())); + + ICING_ASSERT_OK_AND_ASSIGN( + TokenizedDocument tokenized_document2, + TokenizedDocument::Create(schema_store_.get(), lang_segmenter_.get(), + std::move(document2))); + ICING_ASSERT_OK_AND_ASSIGN( + DocumentId document_id2, + 
document_store_->Put(tokenized_document2.document())); + EXPECT_THAT(index->last_added_document_id(), Eq(kInvalidDocumentId)); + + ICING_ASSERT_OK_AND_ASSIGN( + std::unique_ptr handler, + StringSectionIndexingHandler::Create(&fake_clock_, normalizer_.get(), + index.get())); + + // Handle all docs + EXPECT_THAT(handler->Handle(tokenized_document0, document_id0, + /*recovery_mode=*/false, + /*put_document_stats=*/nullptr), + IsOk()); + EXPECT_THAT(handler->Handle(tokenized_document1, document_id1, + /*recovery_mode=*/false, + /*put_document_stats=*/nullptr), + IsOk()); + EXPECT_THAT(handler->Handle(tokenized_document2, document_id2, + /*recovery_mode=*/false, + /*put_document_stats=*/nullptr), + IsOk()); + EXPECT_THAT(index->last_added_document_id(), Eq(document_id2)); + + // We've disabled sorting during indexing so the HitBuffer's unsorted section + // should exceed the sort threshold. PersistToDisk and reinitialize the + // LiteIndex with sort_at_indexing=true. + ASSERT_THAT(index->PersistToDisk(), IsOk()); + options = Index::Options(index_dir_, + /*index_merge_size=*/1024 * 1024, + /*lite_index_sort_at_indexing=*/true, + /*lite_index_sort_size=*/64); + ICING_ASSERT_OK_AND_ASSIGN( + index, Index::Create(options, &filesystem_, &icing_filesystem_)); + + // Verify that the HitBuffer has been sorted after initializing with + // sort_at_indexing enabled. 
+ EXPECT_THAT(index->LiteIndexNeedSort(), IsFalse()); + + // Hits in the hit buffer: + // : {(docId, sectionId, term_freq)...} + // foo: {(0, kSectionIdTitle, 3); (0, kSectionIdBody, 1); + // (1, kSectionIdBody, 2); + // (2, kSectionIdNestedBody, 2)} + // bar: {(0, kSectionIdBody, 1); + // (1, kSectionIdTitle, 1); + // (2, kSectionIdNestedTitle, 1); (2, kSectionIdSubject, 2)} + // baz: {(0, kSectionIdBody, 1); + // (1, kSectionIdTitle, 2); (1, kSectionIdBody, 1), + // (2, kSectionIdNestedTitle, 2); (2, kSectionIdNestedBody, 1)} + // qux: {(2, kSectionIdName, 1)} + + // Query 'foo' + ICING_ASSERT_OK_AND_ASSIGN( + std::unique_ptr itr, + index->GetIterator("foo", /*term_start_index=*/0, + /*unnormalized_term_length=*/0, kSectionIdMaskAll, + TermMatchType::EXACT_ONLY)); + + // Advance the iterator and verify that we're returning hits in the correct + // order (i.e. in descending order of DocId) + ASSERT_THAT(itr->Advance(), IsOk()); + EXPECT_THAT(itr->doc_hit_info().document_id(), Eq(2)); + EXPECT_THAT(itr->doc_hit_info().hit_section_ids_mask(), + Eq(1U << kSectionIdNestedBody)); + std::vector matched_terms_stats; + std::unordered_map + expected_section_ids_tf_map2 = {{kSectionIdNestedBody, 2}}; + itr->PopulateMatchedTermsStats(&matched_terms_stats); + EXPECT_THAT(matched_terms_stats, ElementsAre(EqualsTermMatchInfo( + "foo", expected_section_ids_tf_map2))); + + ASSERT_THAT(itr->Advance(), IsOk()); + EXPECT_THAT(itr->doc_hit_info().document_id(), Eq(1)); + EXPECT_THAT(itr->doc_hit_info().hit_section_ids_mask(), + Eq(1U << kSectionIdBody)); + std::unordered_map + expected_section_ids_tf_map1 = {{kSectionIdBody, 2}}; + matched_terms_stats.clear(); + itr->PopulateMatchedTermsStats(&matched_terms_stats); + EXPECT_THAT(matched_terms_stats, ElementsAre(EqualsTermMatchInfo( + "foo", expected_section_ids_tf_map1))); + + ASSERT_THAT(itr->Advance(), IsOk()); + EXPECT_THAT(itr->doc_hit_info().document_id(), Eq(0)); + EXPECT_THAT(itr->doc_hit_info().hit_section_ids_mask(), + Eq(1U 
<< kSectionIdTitle | 1U << kSectionIdBody)); + std::unordered_map + expected_section_ids_tf_map0 = {{kSectionIdTitle, 3}, + {kSectionIdBody, 1}}; + matched_terms_stats.clear(); + itr->PopulateMatchedTermsStats(&matched_terms_stats); + EXPECT_THAT(matched_terms_stats, ElementsAre(EqualsTermMatchInfo( + "foo", expected_section_ids_tf_map0))); +} + +} // namespace + +} // namespace lib +} // namespace icing diff --git a/icing/query/advanced_query_parser/query-visitor_test.cc b/icing/query/advanced_query_parser/query-visitor_test.cc index d118691..59e924d 100644 --- a/icing/query/advanced_query_parser/query-visitor_test.cc +++ b/icing/query/advanced_query_parser/query-visitor_test.cc @@ -125,7 +125,9 @@ class QueryVisitorTest : public ::testing::TestWithParam { document_store_ = std::move(create_result.document_store); Index::Options options(index_dir_.c_str(), - /*index_merge_size=*/1024 * 1024); + /*index_merge_size=*/1024 * 1024, + /*lite_index_sort_at_indexing=*/true, + /*lite_index_sort_size=*/1024 * 8); ICING_ASSERT_OK_AND_ASSIGN( index_, Index::Create(options, &filesystem_, &icing_filesystem_)); diff --git a/icing/query/query-processor_benchmark.cc b/icing/query/query-processor_benchmark.cc index 626f352..025e8e6 100644 --- a/icing/query/query-processor_benchmark.cc +++ b/icing/query/query-processor_benchmark.cc @@ -81,7 +81,9 @@ void AddTokenToIndex(Index* index, DocumentId document_id, SectionId section_id, std::unique_ptr CreateIndex(const IcingFilesystem& icing_filesystem, const Filesystem& filesystem, const std::string& index_dir) { - Index::Options options(index_dir, /*index_merge_size=*/1024 * 1024 * 10); + Index::Options options(index_dir, /*index_merge_size=*/1024 * 1024 * 10, + /*lite_index_sort_at_indexing=*/true, + /*lite_index_sort_size=*/1024 * 8); return Index::Create(options, &filesystem, &icing_filesystem).ValueOrDie(); } diff --git a/icing/query/query-processor_test.cc b/icing/query/query-processor_test.cc index fa70de1..e64de32 100644 --- 
a/icing/query/query-processor_test.cc +++ b/icing/query/query-processor_test.cc @@ -113,7 +113,9 @@ class QueryProcessorTest document_store_ = std::move(create_result.document_store); Index::Options options(index_dir_, - /*index_merge_size=*/1024 * 1024); + /*index_merge_size=*/1024 * 1024, + /*lite_index_sort_at_indexing=*/true, + /*lite_index_sort_size=*/1024 * 8); ICING_ASSERT_OK_AND_ASSIGN( index_, Index::Create(options, &filesystem_, &icing_filesystem_)); // TODO(b/249829533): switch to use persistent numeric index. diff --git a/icing/query/suggestion-processor_test.cc b/icing/query/suggestion-processor_test.cc index 49f8d43..9f9094d 100644 --- a/icing/query/suggestion-processor_test.cc +++ b/icing/query/suggestion-processor_test.cc @@ -96,7 +96,9 @@ class SuggestionProcessorTest : public Test { document_store_ = std::move(create_result.document_store); Index::Options options(index_dir_, - /*index_merge_size=*/1024 * 1024); + /*index_merge_size=*/1024 * 1024, + /*lite_index_sort_at_indexing=*/true, + /*lite_index_sort_size=*/1024 * 8); ICING_ASSERT_OK_AND_ASSIGN( index_, Index::Create(options, &filesystem_, &icing_filesystem_)); // TODO(b/249829533): switch to use persistent numeric index. diff --git a/icing/schema/schema-util.cc b/icing/schema/schema-util.cc index 4390b6a..af6feda 100644 --- a/icing/schema/schema-util.cc +++ b/icing/schema/schema-util.cc @@ -280,8 +280,6 @@ libtextclassifier3::Status CalculateTransitiveNestedTypeRelations( // 4. "adjacent" has been fully expanded. Add all of its transitive // outgoing relations to this type's transitive outgoing relations. 
auto adjacent_expanded_itr = expanded_nested_types_map->find(adjacent_type); - expanded_relations.reserve(expanded_relations.size() + - adjacent_expanded_itr->second.size()); for (const auto& [transitive_reachable, _] : adjacent_expanded_itr->second) { // Insert a transitive reachable node `transitive_reachable` for `type` if @@ -345,8 +343,6 @@ libtextclassifier3::Status CalculateAcyclicTransitiveRelations( // 3. "adjacent" has been fully expanded. Add all of its transitive outgoing // relations to this type's transitive outgoing relations. auto adjacent_expanded_itr = expanded_relation_map->find(adjacent); - expanded_relations.reserve(expanded_relations.size() + - adjacent_expanded_itr->second.size()); for (const auto& [transitive_reachable, _] : adjacent_expanded_itr->second) { // Insert a transitive reachable node `transitive_reachable` for `type`. @@ -526,7 +522,6 @@ BuildTransitiveDependentGraph(const SchemaProto& schema, // Insert the parent_type into the dependent map if it is not present // already. merged_dependent_map.insert({parent_type, {}}); - merged_dependent_map[parent_type].reserve(inheritance_relation.size()); for (const auto& [child_type, _] : inheritance_relation) { // Insert the child_type into parent_type's dependent map if it's not // present already, in which case the value will be an empty vector. diff --git a/proto/icing/proto/initialize.proto b/proto/icing/proto/initialize.proto index 507e745..958767b 100644 --- a/proto/icing/proto/initialize.proto +++ b/proto/icing/proto/initialize.proto @@ -23,7 +23,7 @@ option java_package = "com.google.android.icing.proto"; option java_multiple_files = true; option objc_class_prefix = "ICNG"; -// Next tag: 12 +// Next tag: 14 message IcingSearchEngineOptions { // Directory to persist files for Icing. Required. // If Icing was previously initialized with this directory, it will reload @@ -107,6 +107,26 @@ message IcingSearchEngineOptions { // Integer index bucket split threshold. 
optional int32 integer_index_bucket_split_threshold = 11 [default = 65536]; + // Whether Icing should sort and merge its lite index HitBuffer unsorted tail + at indexing time. + // + // If set to true, the HitBuffer will be sorted at indexing time after + exceeding the sort threshold. If false, the HitBuffer will be sorted at + querying time, before the first query after inserting new elements into the + HitBuffer. + // + // The default value is false. + optional bool lite_index_sort_at_indexing = 12; + + // Size (in bytes) at which Icing's lite index should sort and merge the + HitBuffer's unsorted tail into the sorted head for sorting at indexing + time. Size specified here is the maximum byte size to allow for the + unsorted tail section. + // + // Setting a lower sort size reduces querying latency at the expense of + indexing latency. + optional int32 lite_index_sort_size = 13 [default = 8192]; // 8 KiB + reserved 2; } diff --git a/proto/icing/proto/logging.proto b/proto/icing/proto/logging.proto index ca795cd..418fc88 100644 --- a/proto/icing/proto/logging.proto +++ b/proto/icing/proto/logging.proto @@ -76,7 +76,7 @@ message InitializeStatsProto { // Time used to restore the index. optional int32 index_restoration_latency_ms = 6; - // Time used to restore the index. + // Time used to restore the schema store. optional int32 schema_store_recovery_latency_ms = 7; // Status regarding how much data is lost during the initialization. @@ -117,7 +117,7 @@ message InitializeStatsProto { } // Stats of the top-level function IcingSearchEngine::Put(). -// Next tag: 10 +// Next tag: 11 message PutDocumentStatsProto { // Overall time used for the function call. optional int32 latency_ms = 1; @@ -151,6 +151,9 @@ message PutDocumentStatsProto { // Time used to index all qualified id join strings in the document. optional int32 qualified_id_join_index_latency_ms = 9; + + // Time used to sort and merge the LiteIndex's HitBuffer. 
+ optional int32 lite_index_sort_latency_ms = 10; } // Stats of the top-level function IcingSearchEngine::Search() and diff --git a/synced_AOSP_CL_number.txt b/synced_AOSP_CL_number.txt index e7bee73..bd3f395 100644 --- a/synced_AOSP_CL_number.txt +++ b/synced_AOSP_CL_number.txt @@ -1 +1 @@ -set(synced_AOSP_CL_number=559533396) +set(synced_AOSP_CL_number=561560020) -- cgit v1.2.3