aboutsummaryrefslogtreecommitdiff
path: root/icing/scoring/section-weights_test.cc
diff options
context:
space:
mode:
authorTim Barron <tjbarron@google.com>2021-10-21 16:01:05 -0700
committerTim Barron <tjbarron@google.com>2021-10-21 16:01:05 -0700
commitda1b8986e7c873efa45529b8adc4a32490eb9c3c (patch)
tree1cc9dbe185e88e71c7c82ede8ba02578a36ef78f /icing/scoring/section-weights_test.cc
parent8555f998fccca3aea3f6f67d44fce04775ddea97 (diff)
downloadicing-da1b8986e7c873efa45529b8adc4a32490eb9c3c.tar.gz
Sync from upstream.
Descriptions: ================ Replace refs to c lib headers w/ c++ stdlib equivalents. ================ Update IDF component of BM25F Calculator in IcingLib ================ Expose QuerySuggestions API. ================ Change the tokenizer used in QuerySuggest. ================ Add SectionWeights API to Icing. ================ Apply SectionWeights to BM25F Scoring. ================ Replaces uses of u_strTo/FromUTF32 w/ u_strTo/FromUTF8. Bug: 152934343 Bug: 202308641 Bug: 203700301 Change-Id: Ic884a84e5ff4c9c04b2cd6dd1fce90765aa4446e
Diffstat (limited to 'icing/scoring/section-weights_test.cc')
-rw-r--r--icing/scoring/section-weights_test.cc386
1 files changed, 386 insertions, 0 deletions
diff --git a/icing/scoring/section-weights_test.cc b/icing/scoring/section-weights_test.cc
new file mode 100644
index 0000000..b90c3d5
--- /dev/null
+++ b/icing/scoring/section-weights_test.cc
@@ -0,0 +1,386 @@
+// Copyright (C) 2021 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "icing/scoring/section-weights.h"
+
+#include <cfloat>
+
+#include "gmock/gmock.h"
+#include "gtest/gtest.h"
+#include "icing/proto/scoring.pb.h"
+#include "icing/schema-builder.h"
+#include "icing/testing/common-matchers.h"
+#include "icing/testing/fake-clock.h"
+#include "icing/testing/tmp-directory.h"
+
+namespace icing {
+namespace lib {
+
+namespace {
+using ::testing::Eq;
+
+// Test fixture for SectionWeights. Each test gets a fresh SchemaStore, backed
+// by a scratch directory, that holds a two-type schema:
+//   - "sender": a single optional string property, "name".
+//   - "email":  optional string properties "subject" and "body", plus an
+//               optional "sender" document property declared with
+//               index_nested_properties=true.
+class SectionWeightsTest : public testing::Test {
+ protected:
+  SectionWeightsTest()
+      : test_dir_(GetTestTempDir() + "/icing"),
+        schema_store_dir_(test_dir_ + "/schema_store") {}
+
+  // Recreates the scratch directories, creates the schema store and installs
+  // the "sender"/"email" schema described on the class.
+  void SetUp() override {
+    // Wipe anything left behind by a previous run before creating the
+    // directory that backs the schema store.
+    filesystem_.DeleteDirectoryRecursively(test_dir_.c_str());
+    filesystem_.CreateDirectoryRecursively(schema_store_dir_.c_str());
+
+    // The store is rooted at the dedicated subdirectory created just above
+    // rather than at the top-level test directory.
+    ICING_ASSERT_OK_AND_ASSIGN(
+        schema_store_,
+        SchemaStore::Create(&filesystem_, schema_store_dir_, &fake_clock_));
+
+    SchemaTypeConfigProto sender_schema =
+        SchemaTypeConfigBuilder()
+            .SetType("sender")
+            .AddProperty(PropertyConfigBuilder()
+                             .SetName("name")
+                             .SetDataTypeString(
+                                 TermMatchType::PREFIX,
+                                 StringIndexingConfig::TokenizerType::PLAIN)
+                             .SetCardinality(
+                                 PropertyConfigProto_Cardinality_Code_OPTIONAL))
+            .Build();
+    // The string properties of "email" are declared exactly like
+    // "sender.name" above so that all three are configured consistently.
+    SchemaTypeConfigProto email_schema =
+        SchemaTypeConfigBuilder()
+            .SetType("email")
+            .AddProperty(PropertyConfigBuilder()
+                             .SetName("subject")
+                             .SetDataTypeString(
+                                 TermMatchType::PREFIX,
+                                 StringIndexingConfig::TokenizerType::PLAIN)
+                             .SetCardinality(
+                                 PropertyConfigProto_Cardinality_Code_OPTIONAL))
+            .AddProperty(PropertyConfigBuilder()
+                             .SetName("body")
+                             .SetDataTypeString(
+                                 TermMatchType::PREFIX,
+                                 StringIndexingConfig::TokenizerType::PLAIN)
+                             .SetCardinality(
+                                 PropertyConfigProto_Cardinality_Code_OPTIONAL))
+            .AddProperty(PropertyConfigBuilder()
+                             .SetName("sender")
+                             .SetDataTypeDocument(
+                                 "sender", /*index_nested_properties=*/true)
+                             .SetCardinality(
+                                 PropertyConfigProto_Cardinality_Code_OPTIONAL))
+            .Build();
+    SchemaProto schema =
+        SchemaBuilder().AddType(sender_schema).AddType(email_schema).Build();
+
+    ICING_ASSERT_OK(schema_store_->SetSchema(schema));
+  }
+
+  // Releases the schema store before deleting the files underneath it.
+  void TearDown() override {
+    schema_store_.reset();
+    filesystem_.DeleteDirectoryRecursively(test_dir_.c_str());
+  }
+
+  // Non-owning access to the schema store built in SetUp().
+  SchemaStore *schema_store() { return schema_store_.get(); }
+
+ private:
+  // Declaration order matters: schema_store_dir_ is initialized from
+  // test_dir_ in the constructor's initializer list.
+  const std::string test_dir_;
+  const std::string schema_store_dir_;
+  Filesystem filesystem_;
+  FakeClock fake_clock_;
+  std::unique_ptr<SchemaStore> schema_store_;
+};
+
+// A weight given to the only property of a type normalizes to 1.0.
+TEST_F(SectionWeightsTest, ShouldNormalizeSinglePropertyWeight) {
+  ScoringSpecProto scoring_spec;
+
+  // "sender" -> { "name": 5.0 }
+  TypePropertyWeights &sender_weights =
+      *scoring_spec.add_type_property_weights();
+  sender_weights.set_schema_type("sender");
+  PropertyWeight &name_weight = *sender_weights.add_property_weights();
+  name_weight.set_path("name");
+  name_weight.set_weight(5.0);
+
+  ICING_ASSERT_OK_AND_ASSIGN(
+      std::unique_ptr<SectionWeights> weights,
+      SectionWeights::Create(schema_store(), scoring_spec));
+  ICING_ASSERT_OK_AND_ASSIGN(SchemaTypeId sender_type_id,
+                             schema_store()->GetSchemaTypeId("sender"));
+
+  // Section 0 is the "name" property. It is the only property of "sender",
+  // so its weight is also the maximum and normalizes to 1.0.
+  EXPECT_THAT(
+      weights->GetNormalizedSectionWeight(sender_type_id, /*section_id=*/0),
+      Eq(1.0));
+}
+
+// The largest finite double is accepted as a weight and still normalizes to
+// 1.0 when it is the only weight of its type.
+TEST_F(SectionWeightsTest, ShouldAcceptMaxWeightValue) {
+  ScoringSpecProto scoring_spec;
+
+  // "sender" -> { "name": DBL_MAX }
+  TypePropertyWeights &sender_weights =
+      *scoring_spec.add_type_property_weights();
+  sender_weights.set_schema_type("sender");
+  PropertyWeight &name_weight = *sender_weights.add_property_weights();
+  name_weight.set_path("name");
+  name_weight.set_weight(DBL_MAX);
+
+  ICING_ASSERT_OK_AND_ASSIGN(
+      std::unique_ptr<SectionWeights> weights,
+      SectionWeights::Create(schema_store(), scoring_spec));
+  ICING_ASSERT_OK_AND_ASSIGN(SchemaTypeId sender_type_id,
+                             schema_store()->GetSchemaTypeId("sender"));
+
+  // Section 0 is the "name" property.
+  EXPECT_THAT(
+      weights->GetNormalizedSectionWeight(sender_type_id, /*section_id=*/0),
+      Eq(1.0));
+}
+
+// Creating SectionWeights with a negative property weight is rejected with
+// INVALID_ARGUMENT.
+TEST_F(SectionWeightsTest, ShouldFailWithNegativeWeights) {
+  ScoringSpecProto scoring_spec;
+
+  // "email" -> { "body": -100.0 }
+  TypePropertyWeights &email_weights =
+      *scoring_spec.add_type_property_weights();
+  email_weights.set_schema_type("email");
+  PropertyWeight &body_weight = *email_weights.add_property_weights();
+  body_weight.set_path("body");
+  body_weight.set_weight(-100.0);
+
+  EXPECT_THAT(SectionWeights::Create(schema_store(), scoring_spec).status(),
+              StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
+}
+
+// Creating SectionWeights with a property weight of exactly zero is rejected
+// with INVALID_ARGUMENT.
+TEST_F(SectionWeightsTest, ShouldFailWithZeroWeight) {
+  ScoringSpecProto scoring_spec;
+
+  // "sender" -> { "name": 0.0 }
+  TypePropertyWeights &sender_weights =
+      *scoring_spec.add_type_property_weights();
+  sender_weights.set_schema_type("sender");
+  PropertyWeight &name_weight = *sender_weights.add_property_weights();
+  name_weight.set_path("name");
+  name_weight.set_weight(0.0);
+
+  EXPECT_THAT(SectionWeights::Create(schema_store(), scoring_spec).status(),
+              StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
+}
+
+// With no type property weights in the scoring spec, a section lookup yields
+// kDefaultSectionWeight.
+TEST_F(SectionWeightsTest, ShouldReturnDefaultIfTypePropertyWeightsNotSet) {
+  // Deliberately left empty: no weights are configured for any type.
+  ScoringSpecProto empty_spec;
+
+  ICING_ASSERT_OK_AND_ASSIGN(
+      std::unique_ptr<SectionWeights> weights,
+      SectionWeights::Create(schema_store(), empty_spec));
+  ICING_ASSERT_OK_AND_ASSIGN(SchemaTypeId email_type_id,
+                             schema_store()->GetSchemaTypeId("email"));
+
+  EXPECT_THAT(
+      weights->GetNormalizedSectionWeight(email_type_id, /*section_id=*/0),
+      Eq(kDefaultSectionWeight));
+}
+
+// Weights for top-level and nested property paths of one type are all
+// normalized against the largest weight given for that type.
+TEST_F(SectionWeightsTest, ShouldSetNestedPropertyWeights) {
+  ScoringSpecProto scoring_spec;
+
+  // "email" -> { "body": 1.0, "subject": 100.0, "sender.name": 50.0 }
+  TypePropertyWeights &email_weights =
+      *scoring_spec.add_type_property_weights();
+  email_weights.set_schema_type("email");
+
+  PropertyWeight &body_weight = *email_weights.add_property_weights();
+  body_weight.set_path("body");
+  body_weight.set_weight(1.0);
+
+  PropertyWeight &subject_weight = *email_weights.add_property_weights();
+  subject_weight.set_path("subject");
+  subject_weight.set_weight(100.0);
+
+  PropertyWeight &sender_name_weight = *email_weights.add_property_weights();
+  sender_name_weight.set_path("sender.name");
+  sender_name_weight.set_weight(50.0);
+
+  ICING_ASSERT_OK_AND_ASSIGN(
+      std::unique_ptr<SectionWeights> weights,
+      SectionWeights::Create(schema_store(), scoring_spec));
+  ICING_ASSERT_OK_AND_ASSIGN(SchemaTypeId email_type_id,
+                             schema_store()->GetSchemaTypeId("email"));
+
+  // Section 0 is "body": 1.0 / 100.0.
+  EXPECT_THAT(
+      weights->GetNormalizedSectionWeight(email_type_id, /*section_id=*/0),
+      Eq(0.01));
+  // Section 1 is the nested "sender.name": 50.0 / 100.0.
+  EXPECT_THAT(
+      weights->GetNormalizedSectionWeight(email_type_id, /*section_id=*/1),
+      Eq(0.5));
+  // Section 2 is "subject", which carries the maximum weight.
+  EXPECT_THAT(
+      weights->GetNormalizedSectionWeight(email_type_id, /*section_id=*/2),
+      Eq(1.0));
+}
+
+// Weights are normalized against the largest weight of the type even when
+// every configured weight is below 1.0.
+TEST_F(SectionWeightsTest, ShouldNormalizeIfAllWeightsBelowOne) {
+  ScoringSpecProto spec_proto;
+
+  // "email" -> { "body": 0.1, "sender.name": 0.2, "subject": 0.4 }
+  TypePropertyWeights *type_property_weights =
+      spec_proto.add_type_property_weights();
+  type_property_weights->set_schema_type("email");
+
+  PropertyWeight *body_property_weight =
+      type_property_weights->add_property_weights();
+  body_property_weight->set_weight(0.1);
+  body_property_weight->set_path("body");
+
+  PropertyWeight *sender_name_weight =
+      type_property_weights->add_property_weights();
+  sender_name_weight->set_weight(0.2);
+  sender_name_weight->set_path("sender.name");
+
+  PropertyWeight *subject_property_weight =
+      type_property_weights->add_property_weights();
+  subject_property_weight->set_weight(0.4);
+  subject_property_weight->set_path("subject");
+
+  ICING_ASSERT_OK_AND_ASSIGN(
+      std::unique_ptr<SectionWeights> section_weights,
+      SectionWeights::Create(schema_store(), spec_proto));
+  ICING_ASSERT_OK_AND_ASSIGN(SchemaTypeId email_schema_type_id,
+                             schema_store()->GetSchemaTypeId("email"));
+
+  // 0.1, 0.2 and 0.4 have no exact binary representation, so the quotients
+  // are compared with DoubleEq (equality within a few ULPs) instead of
+  // relying on the rounding errors cancelling out bit-for-bit.
+
+  // Normalized weight for "body" property.
+  EXPECT_THAT(section_weights->GetNormalizedSectionWeight(email_schema_type_id,
+                                                          /*section_id=*/0),
+              ::testing::DoubleEq(1.0 / 4.0));
+  // Normalized weight for "sender.name" property (the nested property).
+  EXPECT_THAT(section_weights->GetNormalizedSectionWeight(email_schema_type_id,
+                                                          /*section_id=*/1),
+              ::testing::DoubleEq(2.0 / 4.0));
+  // Normalized weight for "subject" property, which carries the max weight.
+  EXPECT_THAT(section_weights->GetNormalizedSectionWeight(email_schema_type_id,
+                                                          /*section_id=*/2),
+              ::testing::DoubleEq(1.0));
+}
+
+// The weight that "email" assigns to its nested "sender.name" path and the
+// weight that "sender" assigns to its own "name" property are tracked and
+// normalized independently, per schema type.
+TEST_F(SectionWeightsTest, ShouldSetNestedPropertyWeightSeparatelyForTypes) {
+  ScoringSpecProto spec_proto;
+
+  // "email" -> { "body": 1.0, "subject": 100.0, "sender.name": 50.0 }
+  TypePropertyWeights *email_type_property_weights =
+      spec_proto.add_type_property_weights();
+  email_type_property_weights->set_schema_type("email");
+
+  PropertyWeight *body_property_weight =
+      email_type_property_weights->add_property_weights();
+  body_property_weight->set_weight(1.0);
+  body_property_weight->set_path("body");
+
+  PropertyWeight *subject_property_weight =
+      email_type_property_weights->add_property_weights();
+  subject_property_weight->set_weight(100.0);
+  subject_property_weight->set_path("subject");
+
+  PropertyWeight *sender_name_property_weight =
+      email_type_property_weights->add_property_weights();
+  sender_name_property_weight->set_weight(50.0);
+  sender_name_property_weight->set_path("sender.name");
+
+  // "sender" -> { "name": 25.0 }
+  // The path is relative to the "sender" type itself, whose only property is
+  // "name"; a path of "sender" would not match any property of that type.
+  TypePropertyWeights *sender_type_property_weights =
+      spec_proto.add_type_property_weights();
+  sender_type_property_weights->set_schema_type("sender");
+
+  PropertyWeight *sender_property_weight =
+      sender_type_property_weights->add_property_weights();
+  sender_property_weight->set_weight(25.0);
+  sender_property_weight->set_path("name");
+
+  ICING_ASSERT_OK_AND_ASSIGN(
+      std::unique_ptr<SectionWeights> section_weights,
+      SectionWeights::Create(schema_store(), spec_proto));
+  ICING_ASSERT_OK_AND_ASSIGN(SchemaTypeId email_schema_type_id,
+                             schema_store()->GetSchemaTypeId("email"));
+  ICING_ASSERT_OK_AND_ASSIGN(SchemaTypeId sender_schema_type_id,
+                             schema_store()->GetSchemaTypeId("sender"));
+
+  // Normalized weight for "sender.name" property (the nested property) of the
+  // "email" type: 50.0 / 100.0.
+  EXPECT_THAT(section_weights->GetNormalizedSectionWeight(email_schema_type_id,
+                                                          /*section_id=*/1),
+              Eq(0.5));
+  // Normalized weight for "name" property for "sender" schema type
+  // (section_id 0). As it is the only property of the type, it should take
+  // the max normalized weight of 1.0, unaffected by the weights set for
+  // "email".
+  EXPECT_THAT(section_weights->GetNormalizedSectionWeight(sender_schema_type_id,
+                                                          /*section_id=*/0),
+              Eq(1.0));
+}
+
+// A weight whose path matches no property of the type is ignored and does not
+// take part in normalization.
+TEST_F(SectionWeightsTest, ShouldSkipNonExistentPathWhenSettingWeights) {
+  ScoringSpecProto scoring_spec;
+
+  TypePropertyWeights &email_weights =
+      *scoring_spec.add_type_property_weights();
+  email_weights.set_schema_type("email");
+
+  // "sender.organization" is not a property path of "email". Were this entry
+  // honoured, the maximum weight would be 100.0 and every other weight would
+  // be normalized against it.
+  PropertyWeight &unknown_path_weight = *email_weights.add_property_weights();
+  unknown_path_weight.set_path("sender.organization");
+  unknown_path_weight.set_weight(100.0);
+
+  PropertyWeight &subject_weight = *email_weights.add_property_weights();
+  subject_weight.set_path("subject");
+  subject_weight.set_weight(10.0);
+
+  ICING_ASSERT_OK_AND_ASSIGN(
+      std::unique_ptr<SectionWeights> weights,
+      SectionWeights::Create(schema_store(), scoring_spec));
+  ICING_ASSERT_OK_AND_ASSIGN(SchemaTypeId email_type_id,
+                             schema_store()->GetSchemaTypeId("email"));
+
+  // Section 0 is "body". No weight was given, so it starts from the default
+  // of 1.0 and is then normalized: 1.0 / 10.0.
+  EXPECT_THAT(
+      weights->GetNormalizedSectionWeight(email_type_id, /*section_id=*/0),
+      Eq(0.1));
+  // Section 1 is the nested "sender.name". It also has no explicit weight, so
+  // it gets the same normalized default.
+  EXPECT_THAT(
+      weights->GetNormalizedSectionWeight(email_type_id, /*section_id=*/1),
+      Eq(0.1));
+  // Section 2 is "subject". With the unknown path skipped, its 10.0 is the
+  // maximum weight and normalizes to 1.0.
+  EXPECT_THAT(
+      weights->GetNormalizedSectionWeight(email_type_id, /*section_id=*/2),
+      Eq(1.0));
+}
+
+} // namespace
+
+} // namespace lib
+} // namespace icing