diff options
Diffstat (limited to 'icing/monkey_test/monkey-test-generators.cc')
-rw-r--r-- | icing/monkey_test/monkey-test-generators.cc | 287 |
1 files changed, 238 insertions, 49 deletions
diff --git a/icing/monkey_test/monkey-test-generators.cc b/icing/monkey_test/monkey-test-generators.cc index 7b2ff56..0d5ad73 100644 --- a/icing/monkey_test/monkey-test-generators.cc +++ b/icing/monkey_test/monkey-test-generators.cc @@ -14,79 +14,269 @@ #include "icing/monkey_test/monkey-test-generators.h" +#include <array> +#include <cstdint> +#include <random> +#include <string> +#include <string_view> +#include <unordered_set> +#include <utility> +#include <vector> + +#include "icing/absl_ports/str_cat.h" +#include "icing/absl_ports/str_join.h" +#include "icing/document-builder.h" +#include "icing/monkey_test/monkey-test-util.h" +#include "icing/monkey_test/monkey-tokenized-document.h" +#include "icing/proto/schema.pb.h" +#include "icing/proto/term.pb.h" +#include "icing/schema/section.h" + namespace icing { namespace lib { -SchemaProto MonkeySchemaGenerator::GenerateSchema( - int num_types, const std::vector<int>& possible_num_properties) const { +namespace { + +constexpr std::array<PropertyConfigProto::Cardinality::Code, 3> kCardinalities = + {PropertyConfigProto::Cardinality::REPEATED, + PropertyConfigProto::Cardinality::OPTIONAL, + PropertyConfigProto::Cardinality::REQUIRED}; + +constexpr std::array<TermMatchType::Code, 3> kTermMatchTypes = { + TermMatchType::UNKNOWN, TermMatchType::EXACT_ONLY, TermMatchType::PREFIX}; + +PropertyConfigProto::Cardinality::Code GetRandomCardinality( + MonkeyTestRandomEngine* random) { + std::uniform_int_distribution<> dist(0, kCardinalities.size() - 1); + return kCardinalities[dist(*random)]; +} + +TermMatchType::Code GetRandomTermMatchType(MonkeyTestRandomEngine* random) { + std::uniform_int_distribution<> dist(0, kTermMatchTypes.size() - 1); + return kTermMatchTypes[dist(*random)]; +} + +// TODO: Update this function when supporting document_indexing_config. +bool IsIndexableProperty(const PropertyConfigProto& property) { + return property.string_indexing_config().term_match_type() != + TermMatchType::UNKNOWN; +} + +void SetStringIndexingConfig(PropertyConfigProto& property, + TermMatchType::Code term_match_type) { + if (term_match_type != TermMatchType::UNKNOWN) { + StringIndexingConfig* string_indexing_config = + property.mutable_string_indexing_config(); + string_indexing_config->set_term_match_type(term_match_type); + // TODO: Try to add different TokenizerTypes. VERBATIM, RFC822, and URL are + // the remaining candidates to consider. + string_indexing_config->set_tokenizer_type( + StringIndexingConfig::TokenizerType::PLAIN); + } else { + property.clear_string_indexing_config(); + } +} + +} // namespace + +SchemaProto MonkeySchemaGenerator::GenerateSchema() { SchemaProto schema; - std::uniform_int_distribution<> dist(0, possible_num_properties.size() - 1); - while (num_types > 0) { - int num_properties = possible_num_properties[dist(*random_)]; - *schema.add_types() = GenerateType( - "MonkeyTestType" + std::to_string(num_types), num_properties); - --num_types; + for (int i = 0; i < config_->num_types; ++i) { + *schema.add_types() = GenerateType(); } return schema; } +MonkeySchemaGenerator::UpdateSchemaResult MonkeySchemaGenerator::UpdateSchema( + const SchemaProto& schema) { + UpdateSchemaResult result = {std::move(schema)}; + SchemaProto& new_schema = result.schema; + + // Delete up to 2 existing types. + std::uniform_int_distribution<> num_types_to_delete_dist(0, 2); + for (int num_types_to_delete = num_types_to_delete_dist(*random_); + num_types_to_delete >= 0; --num_types_to_delete) { + if (new_schema.types_size() > 0) { + std::uniform_int_distribution<> dist(0, new_schema.types_size() - 1); + int index_to_delete = dist(*random_); + result.schema_types_deleted.insert( + new_schema.types(index_to_delete).schema_type()); + new_schema.mutable_types()->SwapElements(index_to_delete, + new_schema.types_size() - 1); + new_schema.mutable_types()->RemoveLast(); + } + } + + // Updating about 1/3 of existing types. + for (int i = 0; i < new_schema.types_size(); ++i) { + std::uniform_int_distribution<> dist(0, 2); + if (dist(*random_) == 0) { + UpdateType(*new_schema.mutable_types(i), result); + } + } + + // Add up to 2 new types. + std::uniform_int_distribution<> num_types_to_add_dist(0, 2); + for (int num_types_to_add = num_types_to_add_dist(*random_); + num_types_to_add >= 0; --num_types_to_add) { + *new_schema.add_types() = GenerateType(); + } + + return result; +} + PropertyConfigProto MonkeySchemaGenerator::GenerateProperty( - std::string_view name, TermMatchType::Code term_match_type) const { + const SchemaTypeConfigProto& type_config, + PropertyConfigProto::Cardinality::Code cardinality, + TermMatchType::Code term_match_type) { PropertyConfigProto prop; - prop.set_property_name(name.data(), name.length()); + prop.set_property_name( + "MonkeyTestProp" + + std::to_string(num_properties_generated_[type_config.schema_type()]++)); // TODO: Perhaps in future iterations we will want to generate more than just // string properties. prop.set_data_type(PropertyConfigProto::DataType::STRING); + prop.set_cardinality(cardinality); + SetStringIndexingConfig(prop, term_match_type); + return prop; +} - constexpr std::array<PropertyConfigProto::Cardinality::Code, 3> - cardinalities = {PropertyConfigProto::Cardinality::REPEATED, - PropertyConfigProto::Cardinality::OPTIONAL, - PropertyConfigProto::Cardinality::REQUIRED}; - std::uniform_int_distribution<> dist(0, cardinalities.size() - 1); - prop.set_cardinality(cardinalities[dist(*random_)]); +void MonkeySchemaGenerator::UpdateProperty( + const SchemaTypeConfigProto& type_config, PropertyConfigProto& property, + UpdateSchemaResult& result) { + PropertyConfigProto::Cardinality::Code new_cardinality = + GetRandomCardinality(random_); + if (new_cardinality != property.cardinality()) { + // Only do compatible cardinality update for now, otherwise it would be hard + // to track which documents will be invalid after updating the schema. + // + // The following type of updates are not allowed: + // - optional -> required + // - repeated -> optional + // - repeated -> required + if (property.cardinality() == PropertyConfigProto::Cardinality::OPTIONAL && + new_cardinality == PropertyConfigProto::Cardinality::REQUIRED) { + return; + } + if (property.cardinality() == PropertyConfigProto::Cardinality::REPEATED && + (new_cardinality == PropertyConfigProto::Cardinality::OPTIONAL || + new_cardinality == PropertyConfigProto::Cardinality::REQUIRED)) { + return; + } + property.set_cardinality(new_cardinality); + } - if (term_match_type != TermMatchType::UNKNOWN) { - StringIndexingConfig* string_indexing_config = - prop.mutable_string_indexing_config(); - string_indexing_config->set_term_match_type(term_match_type); - string_indexing_config->set_tokenizer_type( - StringIndexingConfig::TokenizerType::PLAIN); + if (property.data_type() == PropertyConfigProto::DataType::STRING) { + TermMatchType::Code new_term_match_type = GetRandomTermMatchType(random_); + if (new_term_match_type != + property.string_indexing_config().term_match_type()) { + SetStringIndexingConfig(property, new_term_match_type); + result.schema_types_index_incompatible.insert(type_config.schema_type()); + } } - return prop; } -SchemaTypeConfigProto MonkeySchemaGenerator::GenerateType( - std::string_view name, int num_properties) const { +SchemaTypeConfigProto MonkeySchemaGenerator::GenerateType() { SchemaTypeConfigProto type_config; - type_config.set_schema_type(name.data(), name.length()); + type_config.set_schema_type("MonkeyTestType" + + std::to_string(num_types_generated_++)); + std::uniform_int_distribution<> possible_num_properties_dist( + 0, config_->possible_num_properties.size() - 1); + int total_num_properties = + config_->possible_num_properties[possible_num_properties_dist(*random_)]; + int num_indexed_properties = 0; - constexpr std::array<TermMatchType::Code, 3> term_match_types = { - TermMatchType::UNKNOWN, TermMatchType::EXACT_ONLY, TermMatchType::PREFIX}; - std::uniform_int_distribution<> dist(0, term_match_types.size() - 1); - while (--num_properties >= 0) { - std::string prop_name = "MonkeyTestProp" + std::to_string(num_properties); + for (int i = 0; i < total_num_properties; ++i) { TermMatchType::Code term_match_type = TermMatchType::UNKNOWN; if (num_indexed_properties < kTotalNumSections) { - term_match_type = term_match_types[dist(*random_)]; + term_match_type = GetRandomTermMatchType(random_); } if (term_match_type != TermMatchType::UNKNOWN) { num_indexed_properties += 1; } - (*type_config.add_properties()) = - GenerateProperty(prop_name, term_match_type); + (*type_config.add_properties()) = GenerateProperty( + type_config, GetRandomCardinality(random_), term_match_type); } return type_config; } +void MonkeySchemaGenerator::UpdateType(SchemaTypeConfigProto& type_config, + UpdateSchemaResult& result) { + // Delete up to 4 existing property. + std::uniform_int_distribution<> num_properties_to_delete_dist(0, 4); + for (int num_properties_to_delete = num_properties_to_delete_dist(*random_); + num_properties_to_delete >= 0; --num_properties_to_delete) { + if (type_config.properties_size() > 0) { + std::uniform_int_distribution<> dist(0, + type_config.properties_size() - 1); + int index_to_delete = dist(*random_); + // Only delete a required property for now, otherwise it would be hard + // to track which documents will be invalid after updating the schema. + if (type_config.properties(index_to_delete).cardinality() != + PropertyConfigProto::Cardinality::REQUIRED) { + continue; + } + if (IsIndexableProperty(type_config.properties(index_to_delete))) { + result.schema_types_index_incompatible.insert( + type_config.schema_type()); + } + // Removing a property will cause the type to be considered as + // incompatible. + result.schema_types_incompatible.insert(type_config.schema_type()); + + type_config.mutable_properties()->SwapElements( + index_to_delete, type_config.properties_size() - 1); + type_config.mutable_properties()->RemoveLast(); + } + } + + // Updating about 1/3 of existing properties. + for (int i = 0; i < type_config.properties_size(); ++i) { + std::uniform_int_distribution<> dist(0, 2); + if (dist(*random_) == 0) { + UpdateProperty(type_config, *type_config.mutable_properties(i), result); + } + } + + // Add up to 4 new properties. + std::uniform_int_distribution<> num_types_to_add_dist(0, 4); + for (int num_types_to_add = num_types_to_add_dist(*random_); + num_types_to_add >= 0; --num_types_to_add) { + PropertyConfigProto::Cardinality::Code new_cardinality = + GetRandomCardinality(random_); + // Adding a required property will make all document of this type invalid. + if (new_cardinality == PropertyConfigProto::Cardinality::REQUIRED) { + result.schema_types_incompatible.insert(type_config.schema_type()); + } + PropertyConfigProto new_property = GenerateProperty( + type_config, new_cardinality, GetRandomTermMatchType(random_)); + if (IsIndexableProperty(new_property)) { + result.schema_types_index_incompatible.insert(type_config.schema_type()); + } + (*type_config.add_properties()) = std::move(new_property); + } + + int num_indexed_properties = 0; + for (int i = 0; i < type_config.properties_size(); ++i) { + if (IsIndexableProperty(type_config.properties(i))) { + ++num_indexed_properties; + } + } + + if (num_indexed_properties > kTotalNumSections) { + result.is_invalid_schema = true; + } +} + std::string MonkeyDocumentGenerator::GetNamespace() const { uint32_t name_space; // When num_namespaces is 0, all documents generated get different namespaces. // Otherwise, namespaces will be randomly picked from a set with // num_namespaces elements. - if (num_namespaces_ == 0) { + if (config_->num_namespaces == 0) { name_space = num_docs_generated_; } else { - std::uniform_int_distribution<> dist(0, num_namespaces_ - 1); + std::uniform_int_distribution<> dist(0, config_->num_namespaces - 1); name_space = dist(*random_); } return absl_ports::StrCat("namespace", std::to_string(name_space)); @@ -96,18 +286,19 @@ std::string MonkeyDocumentGenerator::GetUri() const { uint32_t uri; // When num_uris is 0, all documents generated get different URIs. Otherwise, // URIs will be randomly picked from a set with num_uris elements. - if (num_uris_ == 0) { + if (config_->num_uris == 0) { uri = num_docs_generated_; } else { - std::uniform_int_distribution<> dist(0, num_uris_ - 1); + std::uniform_int_distribution<> dist(0, config_->num_uris - 1); uri = dist(*random_); } return absl_ports::StrCat("uri", std::to_string(uri)); } int MonkeyDocumentGenerator::GetNumTokens() const { - std::uniform_int_distribution<> dist(0, possible_num_tokens_.size() - 1); - int n = possible_num_tokens_[dist(*random_)]; + std::uniform_int_distribution<> dist( + 0, config_->possible_num_tokens_.size() - 1); + int n = config_->possible_num_tokens_[dist(*random_)]; // Add some noise std::uniform_real_distribution<> real_dist(0.5, 1); float p = real_dist(*random_); @@ -138,15 +329,13 @@ MonkeyTokenizedDocument MonkeyDocumentGenerator::GenerateDocument() { std::vector<std::string> prop_content = GetPropertyContent(); doc_builder.AddStringProperty(prop.property_name(), absl_ports::StrJoin(prop_content, " ")); - // Create a tokenized section if the current property is indexable. - if (prop.data_type() == PropertyConfigProto::DataType::STRING && - prop.string_indexing_config().term_match_type() != - TermMatchType::UNKNOWN) { - MonkeyTokenizedSection section = { - prop.property_name(), prop.string_indexing_config().term_match_type(), - std::move(prop_content)}; - document.tokenized_sections.push_back(std::move(section)); - } + // No matter whether the property is indexable currently, we have to create + // a section for it since a non-indexable property can become indexable + // after a schema type change. The in-memory icing will automatically skip + // sections that are non-indexable at the time of search requests. + MonkeyTokenizedSection section = {prop.property_name(), + std::move(prop_content)}; + document.tokenized_sections.push_back(std::move(section)); } document.document = doc_builder.Build(); ++num_docs_generated_; |