diff options
author | Terry Wang <tytytyww@google.com> | 2022-09-30 12:54:54 -0700 |
---|---|---|
committer | Terry Wang <tytytyww@google.com> | 2022-09-30 15:58:03 -0700 |
commit | a570eb457c8cf5d677a2616aff0b7a7dcf72e665 (patch) | |
tree | 4f8778d18291397ace1f86cdad2fd8ca19dcdddc /icing/file | |
parent | b02eecda6a12241798cdbaaa7069d19f2fc5f41f (diff) | |
download | icing-a570eb457c8cf5d677a2616aff0b7a7dcf72e665.tar.gz |
Update Icing from upstream.
Descriptions:
======================================================================
Integrate ANTLR-based advanced query prototype with query processor.
======================================================================
[PersistentHashMap][6.1/x] Wrap the return value of KeyMapper::ComputeChecksum by StatusOr
======================================================================
[PersistentHashMap][6.0/x] Replace GetValuesToKeys with iterator for KeyMapper
======================================================================
[PersistentHashMap][5.1/x] Allow client to specify initial num buckets
======================================================================
[PersistentHashMap][5/x] Implement rehashing
======================================================================
Add SchemaType filter and Document Id filter in Search Suggestion API.
======================================================================
Follow up to cl/463377778
======================================================================
Add Document Id filters in Search Suggestion API.
======================================================================
Cleanup LSC: Replace inclusion of *_proto_portable.pb.h files coming from portable_proto_library() with *(!_proto_portable).pb.h coming from cc_proto_library().
======================================================================
Cleanup LSC: Replace inclusion of *_proto_portable.pb.h files coming from portable_proto_library() with *(!_proto_portable).pb.h coming from cc_proto_library().
======================================================================
Cloned from CL 464902284 by 'g4 patch'.
======================================================================
Cleanup LSC: Replace inclusion of *_proto_portable.pb.h files coming from portable_proto_library() with *(!_proto_portable).pb.h coming from cc_proto_library().
======================================================================
Cleanup Remove unused visibility specs (last referenced in codebase over 132 days ago).
======================================================================
Cleanup LSC: Replace inclusion of *_proto_portable.pb.h files coming from portable_proto_library() with *(!_proto_portable).pb.h coming from cc_proto_library().
======================================================================
Cleanup LSC: Replace inclusion of *_proto_portable.pb.h files coming from portable_proto_library() with *(!_proto_portable).pb.h coming from cc_proto_library().
======================================================================
Remove dsaadati@ from third_party/icing OWNERS
======================================================================
Cleanup Move package level default_copts attribute to copts.
======================================================================
Refactor QueryProcessor and QueryProcessTest in preparation for adding ANTLR prototype to parse queries with search_type EXPERIMENTAL_ICING_ADVANCED_QUERY.
======================================================================
Bug: 208654892
Bug: 230553264
Bug: 237324702
Bug: 193919210
Change-Id: I2f0a612747ccb754502489a9b168406532cffaee
Diffstat (limited to 'icing/file')
-rw-r--r-- | icing/file/file-backed-vector.h | 12 | ||||
-rw-r--r-- | icing/file/persistent-hash-map.cc | 74 | ||||
-rw-r--r-- | icing/file/persistent-hash-map.h | 23 | ||||
-rw-r--r-- | icing/file/persistent-hash-map_test.cc | 255 |
4 files changed, 297 insertions, 67 deletions
diff --git a/icing/file/file-backed-vector.h b/icing/file/file-backed-vector.h index 7916666..165597d 100644 --- a/icing/file/file-backed-vector.h +++ b/icing/file/file-backed-vector.h @@ -350,6 +350,12 @@ class FileBackedVector { // INTERNAL_ERROR on IO error libtextclassifier3::StatusOr<int64_t> GetElementsFileSize() const; + // Updates checksum of the vector contents and returns it. + // + // Returns: + // INTERNAL_ERROR if the vector's internal state is inconsistent + libtextclassifier3::StatusOr<Crc32> ComputeChecksum(); + // Accessors. const T* array() const { return reinterpret_cast<const T*>(mmapped_file_->region()); @@ -357,12 +363,6 @@ class FileBackedVector { int32_t num_elements() const { return header_->num_elements; } - // Updates checksum of the vector contents and returns it. - // - // Returns: - // INTERNAL_ERROR if the vector's internal state is inconsistent - libtextclassifier3::StatusOr<Crc32> ComputeChecksum(); - public: class MutableArrayView { public: diff --git a/icing/file/persistent-hash-map.cc b/icing/file/persistent-hash-map.cc index 43530dd..8101c89 100644 --- a/icing/file/persistent-hash-map.cc +++ b/icing/file/persistent-hash-map.cc @@ -19,6 +19,7 @@ #include <memory> #include <string> #include <string_view> +#include <utility> #include "icing/text_classifier/lib3/utils/base/status.h" #include "icing/text_classifier/lib3/utils/base/statusor.h" @@ -184,7 +185,8 @@ std::string GetKeyValueStorageFilePath(std::string_view base_dir) { /* static */ libtextclassifier3::StatusOr<std::unique_ptr<PersistentHashMap>> PersistentHashMap::Create(const Filesystem& filesystem, std::string_view base_dir, int32_t value_type_size, - int32_t max_load_factor_percent) { + int32_t max_load_factor_percent, + int32_t init_num_buckets) { if (!filesystem.FileExists( GetMetadataFilePath(base_dir, kSubDirectory).c_str()) || !filesystem.FileExists( @@ -194,7 +196,7 @@ PersistentHashMap::Create(const Filesystem& filesystem, !filesystem.FileExists(GetKeyValueStorageFilePath(base_dir).c_str())) { // TODO: erase all files if missing any. return InitializeNewFiles(filesystem, base_dir, value_type_size, - max_load_factor_percent); + max_load_factor_percent, init_num_buckets); } return InitializeExistingFiles(filesystem, base_dir, value_type_size, max_load_factor_percent); @@ -397,7 +399,8 @@ libtextclassifier3::StatusOr<Crc32> PersistentHashMap::ComputeChecksum() { PersistentHashMap::InitializeNewFiles(const Filesystem& filesystem, std::string_view base_dir, int32_t value_type_size, - int32_t max_load_factor_percent) { + int32_t max_load_factor_percent, + int32_t init_num_buckets) { // Create directory. const std::string dir_path = absl_ports::StrCat(base_dir, "/", kSubDirectory); if (!filesystem.CreateDirectoryRecursively(dir_path.c_str())) { @@ -421,8 +424,9 @@ PersistentHashMap::InitializeNewFiles(const Filesystem& filesystem, filesystem, GetKeyValueStorageFilePath(base_dir), MemoryMappedFile::Strategy::READ_WRITE_AUTO_SYNC)); - // Initialize one bucket. - ICING_RETURN_IF_ERROR(bucket_storage->Append(Bucket())); + // Initialize buckets. + ICING_RETURN_IF_ERROR( + bucket_storage->Set(/*idx=*/0, /*len=*/init_num_buckets, Bucket())); ICING_RETURN_IF_ERROR(bucket_storage->PersistToDisk()); // Create and initialize new info @@ -511,13 +515,16 @@ PersistentHashMap::InitializeExistingFiles(const Filesystem& filesystem, crcs_ptr->component_crcs.info_crc = info_ptr->ComputeChecksum().Get(); crcs_ptr->all_crc = crcs_ptr->component_crcs.ComputeChecksum().Get(); ICING_RETURN_IF_ERROR(metadata_mmapped_file->PersistToDisk()); - // TODO(b/193919210): rehash if needed } - return std::unique_ptr<PersistentHashMap>(new PersistentHashMap( - filesystem, base_dir, std::move(metadata_mmapped_file), - std::move(bucket_storage), std::move(entry_storage), - std::move(kv_storage))); + auto persistent_hash_map = + std::unique_ptr<PersistentHashMap>(new PersistentHashMap( + filesystem, base_dir, std::move(metadata_mmapped_file), + std::move(bucket_storage), std::move(entry_storage), + std::move(kv_storage))); + ICING_RETURN_IF_ERROR( + persistent_hash_map->RehashIfNecessary(/*force_rehash=*/false)); + return persistent_hash_map; } libtextclassifier3::StatusOr<PersistentHashMap::EntryIndexPair> @@ -594,7 +601,52 @@ libtextclassifier3::Status PersistentHashMap::Insert(int32_t bucket_idx, Entry(new_kv_idx, mutable_bucket.Get().head_entry_index()))); mutable_bucket.Get().set_head_entry_index(new_entry_idx); - // TODO: rehash if needed + return RehashIfNecessary(/*force_rehash=*/false); +} + +libtextclassifier3::Status PersistentHashMap::RehashIfNecessary( + bool force_rehash) { + int32_t new_num_bucket = bucket_storage_->num_elements(); + while (new_num_bucket <= Bucket::kMaxNumBuckets / 2 && + size() > static_cast<int64_t>(new_num_bucket) * + info()->max_load_factor_percent / 100) { + new_num_bucket *= 2; + } + + if (!force_rehash && new_num_bucket == bucket_storage_->num_elements()) { + return libtextclassifier3::Status::OK; + } + + // Resize and reset buckets. + ICING_RETURN_IF_ERROR( + bucket_storage_->Set(0, new_num_bucket, Bucket(Entry::kInvalidIndex))); + + // Iterate all key value pairs in kv_storage, rehash and insert. + Iterator iter = GetIterator(); + int32_t entry_idx = 0; + while (iter.Advance()) { + ICING_ASSIGN_OR_RETURN(int32_t bucket_idx, + HashKeyToBucketIndex(iter.GetKey(), new_num_bucket)); + ICING_ASSIGN_OR_RETURN(FileBackedVector<Bucket>::MutableView mutable_bucket, + bucket_storage_->GetMutable(bucket_idx)); + + // Update entry and bucket. + ICING_RETURN_IF_ERROR(entry_storage_->Set( + entry_idx, + Entry(iter.GetIndex(), mutable_bucket.Get().head_entry_index()))); + mutable_bucket.Get().set_head_entry_index(entry_idx); + + ++entry_idx; + } + + // Since there will be some deleted entries, after rehashing entry_storage_ + // # of vector elements may be greater than the actual # of entries. + // Therefore, we have to truncate entry_storage_ to the correct size. + if (entry_idx < entry_storage_->num_elements()) { + entry_storage_->TruncateTo(entry_idx); + } + + info()->num_deleted_entries = 0; return libtextclassifier3::Status::OK; } diff --git a/icing/file/persistent-hash-map.h b/icing/file/persistent-hash-map.h index a1ca25d..9a9ca8d 100644 --- a/icing/file/persistent-hash-map.h +++ b/icing/file/persistent-hash-map.h @@ -49,6 +49,8 @@ class PersistentHashMap { // True on success, otherwise false. bool Advance(); + int32_t GetIndex() const { return curr_kv_idx_; } + // Get the key. // // REQUIRES: The preceding call for Advance() is true. @@ -226,7 +228,8 @@ class PersistentHashMap { "type size"); static constexpr int32_t kVersion = 1; - static constexpr int32_t kDefaultMaxLoadFactorPercent = 75; + static constexpr int32_t kDefaultMaxLoadFactorPercent = 100; + static constexpr int32_t kDefaultInitNumBuckets = 8192; static constexpr std::string_view kFilePrefix = "persistent_hash_map"; // Only metadata, bucket, entry files are stored under this sub-directory, for @@ -248,6 +251,9 @@ class PersistentHashMap { // invoked (and # of buckets will be doubled). // Note that load_factor_percent exceeding 100 is // considered valid. + // init_num_buckets: initial # of buckets for the persistent hash map. It is + // used when creating new persistent hash map and ignored + // when creating the instance from existing files. // // Returns: // FAILED_PRECONDITION_ERROR if the file checksum doesn't match the stored @@ -257,7 +263,8 @@ class PersistentHashMap { static libtextclassifier3::StatusOr<std::unique_ptr<PersistentHashMap>> Create(const Filesystem& filesystem, std::string_view base_dir, int32_t value_type_size, - int32_t max_load_factor_percent = kDefaultMaxLoadFactorPercent); + int32_t max_load_factor_percent = kDefaultMaxLoadFactorPercent, + int32_t init_num_buckets = kDefaultInitNumBuckets); ~PersistentHashMap(); @@ -379,7 +386,8 @@ class PersistentHashMap { static libtextclassifier3::StatusOr<std::unique_ptr<PersistentHashMap>> InitializeNewFiles(const Filesystem& filesystem, std::string_view base_dir, - int32_t value_type_size, int32_t max_load_factor_percent); + int32_t value_type_size, int32_t max_load_factor_percent, + int32_t init_num_buckets); static libtextclassifier3::StatusOr<std::unique_ptr<PersistentHashMap>> InitializeExistingFiles(const Filesystem& filesystem, @@ -421,6 +429,15 @@ class PersistentHashMap { libtextclassifier3::Status Insert(int32_t bucket_idx, std::string_view key, const void* value); + // Rehash function. If force_rehash is true or the hash map loading is greater + // than max_load_factor, then it will rehash all keys. + // + // Returns: + // OK on success + // INTERNAL_ERROR on I/O error or any data inconsistency + // Any FileBackedVector errors + libtextclassifier3::Status RehashIfNecessary(bool force_rehash); + Crcs* crcs() { return reinterpret_cast<Crcs*>(metadata_mmapped_file_->mutable_region() + Crcs::kFileOffset); diff --git a/icing/file/persistent-hash-map_test.cc b/icing/file/persistent-hash-map_test.cc index 138320c..8024388 100644 --- a/icing/file/persistent-hash-map_test.cc +++ b/icing/file/persistent-hash-map_test.cc @@ -36,12 +36,15 @@ namespace lib { namespace { static constexpr int32_t kCorruptedValueOffset = 3; +static constexpr int32_t kTestInitNumBuckets = 1; using ::testing::Contains; using ::testing::Eq; +using ::testing::Gt; using ::testing::HasSubstr; using ::testing::IsEmpty; using ::testing::Key; +using ::testing::Lt; using ::testing::Not; using ::testing::Pair; using ::testing::Pointee; @@ -150,19 +153,52 @@ TEST_F(PersistentHashMapTest, InitializeNewFiles) { .Get())); } +TEST_F(PersistentHashMapTest, InitializeNewFilesWithCustomInitBucketSize) { + // Create new persistent hash map + int custom_init_bucket_size = 123; + ICING_ASSERT_OK_AND_ASSIGN( + std::unique_ptr<PersistentHashMap> persistent_hash_map, + PersistentHashMap::Create(filesystem_, base_dir_, + /*value_type_size=*/sizeof(int), + PersistentHashMap::kDefaultMaxLoadFactorPercent, + custom_init_bucket_size)); + EXPECT_THAT(persistent_hash_map->num_buckets(), Eq(custom_init_bucket_size)); +} + +TEST_F(PersistentHashMapTest, InitBucketSizeShouldNotAffectExistingFiles) { + int init_bucket_size1 = 4; + { + // Create new persistent hash map + ICING_ASSERT_OK_AND_ASSIGN( + std::unique_ptr<PersistentHashMap> persistent_hash_map, + PersistentHashMap::Create( + filesystem_, base_dir_, + /*value_type_size=*/sizeof(int), + PersistentHashMap::kDefaultMaxLoadFactorPercent, + init_bucket_size1)); + EXPECT_THAT(persistent_hash_map->num_buckets(), Eq(init_bucket_size1)); + + ICING_ASSERT_OK(persistent_hash_map->PersistToDisk()); + } + + int init_bucket_size2 = 8; + ICING_ASSERT_OK_AND_ASSIGN( + std::unique_ptr<PersistentHashMap> persistent_hash_map, + PersistentHashMap::Create(filesystem_, base_dir_, + /*value_type_size=*/sizeof(int), + PersistentHashMap::kDefaultMaxLoadFactorPercent, + init_bucket_size2)); + // # of buckets should still be the original value. + EXPECT_THAT(persistent_hash_map->num_buckets(), Eq(init_bucket_size1)); +} + TEST_F(PersistentHashMapTest, TestInitializationFailsWithoutPersistToDiskOrDestruction) { // Create new persistent hash map - // Set max_load_factor_percent as 1000. Load factor percent is calculated as - // 100 * num_keys / num_buckets. Therefore, with 1 bucket (the initial # of - // buckets in an empty PersistentHashMap) and a max_load_factor_percent of - // 1000, we would allow the insertion of up to 10 keys before rehashing, to - // avoid PersistToDisk being called implicitly by rehashing. ICING_ASSERT_OK_AND_ASSIGN( std::unique_ptr<PersistentHashMap> persistent_hash_map, PersistentHashMap::Create(filesystem_, base_dir_, - /*value_type_size=*/sizeof(int), - /*max_load_factor_percent=*/1000)); + /*value_type_size=*/sizeof(int))); // Put some key value pairs. ICING_ASSERT_OK(persistent_hash_map->Put("a", Serialize(1).data())); @@ -179,23 +215,16 @@ TEST_F(PersistentHashMapTest, // Without calling PersistToDisk, checksums will not be recomputed or synced // to disk, so initializing another instance on the same files should fail. EXPECT_THAT(PersistentHashMap::Create(filesystem_, base_dir_, - /*value_type_size=*/sizeof(int), - /*max_load_factor_percent=*/1000), + /*value_type_size=*/sizeof(int)), StatusIs(libtextclassifier3::StatusCode::FAILED_PRECONDITION)); } TEST_F(PersistentHashMapTest, TestInitializationSucceedsWithPersistToDisk) { // Create new persistent hash map - // Set max_load_factor_percent as 1000. Load factor percent is calculated as - // 100 * num_keys / num_buckets. Therefore, with 1 bucket (the initial # of - // buckets in an empty PersistentHashMap) and a max_load_factor_percent of - // 1000, we would allow the insertion of up to 10 keys before rehashing, to - // avoid PersistToDisk being called implicitly by rehashing. ICING_ASSERT_OK_AND_ASSIGN( std::unique_ptr<PersistentHashMap> persistent_hash_map1, PersistentHashMap::Create(filesystem_, base_dir_, - /*value_type_size=*/sizeof(int), - /*max_load_factor_percent=*/1000)); + /*value_type_size=*/sizeof(int))); // Put some key value pairs. ICING_ASSERT_OK(persistent_hash_map1->Put("a", Serialize(1).data())); @@ -217,8 +246,7 @@ TEST_F(PersistentHashMapTest, TestInitializationSucceedsWithPersistToDisk) { ICING_ASSERT_OK_AND_ASSIGN( std::unique_ptr<PersistentHashMap> persistent_hash_map2, PersistentHashMap::Create(filesystem_, base_dir_, - /*value_type_size=*/sizeof(int), - /*max_load_factor_percent=*/1000)); + /*value_type_size=*/sizeof(int))); EXPECT_THAT(persistent_hash_map2, Pointee(SizeIs(2))); EXPECT_THAT(GetValueByKey(persistent_hash_map2.get(), "a"), IsOkAndHolds(1)); EXPECT_THAT(GetValueByKey(persistent_hash_map2.get(), "b"), IsOkAndHolds(2)); @@ -227,16 +255,10 @@ TEST_F(PersistentHashMapTest, TestInitializationSucceedsWithPersistToDisk) { TEST_F(PersistentHashMapTest, TestInitializationSucceedsAfterDestruction) { { // Create new persistent hash map - // Set max_load_factor_percent as 1000. Load factor percent is calculated as - // 100 * num_keys / num_buckets. Therefore, with 1 bucket (the initial # of - // buckets in an empty PersistentHashMap) and a max_load_factor_percent of - // 1000, we would allow the insertion of up to 10 keys before rehashing, to - // avoid PersistToDisk being called implicitly by rehashing. ICING_ASSERT_OK_AND_ASSIGN( std::unique_ptr<PersistentHashMap> persistent_hash_map, PersistentHashMap::Create(filesystem_, base_dir_, - /*value_type_size=*/sizeof(int), - /*max_load_factor_percent=*/1000)); + /*value_type_size=*/sizeof(int))); ICING_ASSERT_OK(persistent_hash_map->Put("a", Serialize(1).data())); ICING_ASSERT_OK(persistent_hash_map->Put("b", Serialize(2).data())); ICING_ASSERT_OK(persistent_hash_map->Put("c", Serialize(3).data())); @@ -257,8 +279,7 @@ TEST_F(PersistentHashMapTest, TestInitializationSucceedsAfterDestruction) { ICING_ASSERT_OK_AND_ASSIGN( std::unique_ptr<PersistentHashMap> persistent_hash_map, PersistentHashMap::Create(filesystem_, base_dir_, - /*value_type_size=*/sizeof(int), - /*max_load_factor_percent=*/1000)); + /*value_type_size=*/sizeof(int))); EXPECT_THAT(persistent_hash_map, Pointee(SizeIs(2))); EXPECT_THAT(GetValueByKey(persistent_hash_map.get(), "a"), IsOkAndHolds(1)); EXPECT_THAT(GetValueByKey(persistent_hash_map.get(), "b"), IsOkAndHolds(2)); @@ -513,8 +534,11 @@ TEST_F(PersistentHashMapTest, // Create new persistent hash map ICING_ASSERT_OK_AND_ASSIGN( std::unique_ptr<PersistentHashMap> persistent_hash_map, - PersistentHashMap::Create(filesystem_, base_dir_, - /*value_type_size=*/sizeof(int))); + PersistentHashMap::Create( + filesystem_, base_dir_, + /*value_type_size=*/sizeof(int), + PersistentHashMap::kDefaultMaxLoadFactorPercent, + kTestInitNumBuckets)); ICING_ASSERT_OK(persistent_hash_map->Put("a", Serialize(1).data())); ICING_ASSERT_OK(persistent_hash_map->Put("b", Serialize(2).data())); @@ -525,7 +549,7 @@ TEST_F(PersistentHashMapTest, ICING_ASSERT_OK(persistent_hash_map->PersistToDisk()); } - int32_t new_max_load_factor_percent = 100; + int32_t new_max_load_factor_percent = 200; { ASSERT_THAT(new_max_load_factor_percent, Not(Eq(PersistentHashMap::kDefaultMaxLoadFactorPercent))); @@ -563,7 +587,85 @@ TEST_F(PersistentHashMapTest, std::unique_ptr<PersistentHashMap> persistent_hash_map, PersistentHashMap::Create(filesystem_, base_dir_, /*value_type_size=*/sizeof(int), - new_max_load_factor_percent)); + new_max_load_factor_percent, + kTestInitNumBuckets)); + + ICING_ASSERT_OK(persistent_hash_map->PersistToDisk()); + } +} + +TEST_F(PersistentHashMapTest, + InitializeExistingFilesWithDifferentMaxLoadFactorPercentShouldRehash) { + double prev_loading_percent; + int prev_num_buckets; + { + // Create new persistent hash map + ICING_ASSERT_OK_AND_ASSIGN( + std::unique_ptr<PersistentHashMap> persistent_hash_map, + PersistentHashMap::Create( + filesystem_, base_dir_, + /*value_type_size=*/sizeof(int), + PersistentHashMap::kDefaultMaxLoadFactorPercent, + kTestInitNumBuckets)); + ICING_ASSERT_OK(persistent_hash_map->Put("a", Serialize(1).data())); + ICING_ASSERT_OK(persistent_hash_map->Put("b", Serialize(2).data())); + ICING_ASSERT_OK(persistent_hash_map->Put("c", Serialize(3).data())); + + ASSERT_THAT(persistent_hash_map, Pointee(SizeIs(3))); + ASSERT_THAT(GetValueByKey(persistent_hash_map.get(), "a"), IsOkAndHolds(1)); + ASSERT_THAT(GetValueByKey(persistent_hash_map.get(), "b"), IsOkAndHolds(2)); + ASSERT_THAT(GetValueByKey(persistent_hash_map.get(), "c"), IsOkAndHolds(3)); + + prev_loading_percent = persistent_hash_map->size() * 100.0 / + persistent_hash_map->num_buckets(); + prev_num_buckets = persistent_hash_map->num_buckets(); + ASSERT_THAT(prev_loading_percent, + Not(Gt(PersistentHashMap::kDefaultMaxLoadFactorPercent))); + + ICING_ASSERT_OK(persistent_hash_map->PersistToDisk()); + } + + int32_t greater_max_load_factor_percent = 150; + { + ASSERT_THAT(greater_max_load_factor_percent, Gt(prev_loading_percent)); + // Attempt to create the persistent hash map with max load factor greater + // than previous loading. There should be no rehashing and # of buckets + // should remain the same. + ICING_ASSERT_OK_AND_ASSIGN( + std::unique_ptr<PersistentHashMap> persistent_hash_map, + PersistentHashMap::Create(filesystem_, base_dir_, + /*value_type_size=*/sizeof(int), + greater_max_load_factor_percent, + kTestInitNumBuckets)); + + EXPECT_THAT(persistent_hash_map->num_buckets(), Eq(prev_num_buckets)); + + ICING_ASSERT_OK(persistent_hash_map->PersistToDisk()); + } + + int32_t smaller_max_load_factor_percent = 25; + { + ASSERT_THAT(smaller_max_load_factor_percent, Lt(prev_loading_percent)); + // Attempt to create the persistent hash map with max load factor smaller + // than previous loading. There should be rehashing since the loading + // exceeds the limit. + ICING_ASSERT_OK_AND_ASSIGN( + std::unique_ptr<PersistentHashMap> persistent_hash_map, + PersistentHashMap::Create(filesystem_, base_dir_, + /*value_type_size=*/sizeof(int), + smaller_max_load_factor_percent, + kTestInitNumBuckets)); + + // After changing max_load_factor_percent, there should be rehashing and the + // new loading should not be greater than the new max load factor. + EXPECT_THAT(persistent_hash_map->size() * 100.0 / + persistent_hash_map->num_buckets(), + Not(Gt(smaller_max_load_factor_percent))); + EXPECT_THAT(persistent_hash_map->num_buckets(), Not(Eq(prev_num_buckets))); + + EXPECT_THAT(GetValueByKey(persistent_hash_map.get(), "a"), IsOkAndHolds(1)); + EXPECT_THAT(GetValueByKey(persistent_hash_map.get(), "b"), IsOkAndHolds(2)); + EXPECT_THAT(GetValueByKey(persistent_hash_map.get(), "c"), IsOkAndHolds(3)); ICING_ASSERT_OK(persistent_hash_map->PersistToDisk()); } @@ -574,7 +676,9 @@ TEST_F(PersistentHashMapTest, PutAndGet) { ICING_ASSERT_OK_AND_ASSIGN( std::unique_ptr<PersistentHashMap> persistent_hash_map, PersistentHashMap::Create(filesystem_, base_dir_, - /*value_type_size=*/sizeof(int))); + /*value_type_size=*/sizeof(int), + PersistentHashMap::kDefaultMaxLoadFactorPercent, + kTestInitNumBuckets)); EXPECT_THAT(persistent_hash_map, Pointee(IsEmpty())); EXPECT_THAT(GetValueByKey(persistent_hash_map.get(), "default-google.com"), @@ -603,7 +707,9 @@ TEST_F(PersistentHashMapTest, PutShouldOverwriteValueIfKeyExists) { ICING_ASSERT_OK_AND_ASSIGN( std::unique_ptr<PersistentHashMap> persistent_hash_map, PersistentHashMap::Create(filesystem_, base_dir_, - /*value_type_size=*/sizeof(int))); + /*value_type_size=*/sizeof(int), + PersistentHashMap::kDefaultMaxLoadFactorPercent, + kTestInitNumBuckets)); ICING_ASSERT_OK( persistent_hash_map->Put("default-google.com", Serialize(100).data())); @@ -624,12 +730,45 @@ TEST_F(PersistentHashMapTest, PutShouldOverwriteValueIfKeyExists) { IsOkAndHolds(300)); } +TEST_F(PersistentHashMapTest, ShouldRehash) { + // Create new persistent hash map + ICING_ASSERT_OK_AND_ASSIGN( + std::unique_ptr<PersistentHashMap> persistent_hash_map, + PersistentHashMap::Create(filesystem_, base_dir_, + /*value_type_size=*/sizeof(int), + PersistentHashMap::kDefaultMaxLoadFactorPercent, + kTestInitNumBuckets)); + + int original_num_buckets = persistent_hash_map->num_buckets(); + // Insert 100 key value pairs. There should be rehashing so the loading of + // hash map doesn't exceed max_load_factor. + for (int i = 0; i < 100; ++i) { + std::string key = "default-google.com-" + std::to_string(i); + ICING_ASSERT_OK(persistent_hash_map->Put(key, &i)); + ASSERT_THAT(persistent_hash_map, Pointee(SizeIs(i + 1))); + + EXPECT_THAT(persistent_hash_map->size() * 100.0 / + persistent_hash_map->num_buckets(), + Not(Gt(PersistentHashMap::kDefaultMaxLoadFactorPercent))); + } + EXPECT_THAT(persistent_hash_map->num_buckets(), + Not(Eq(original_num_buckets))); + + // After rehashing, we should still be able to get all inserted entries. + for (int i = 0; i < 100; ++i) { + std::string key = "default-google.com-" + std::to_string(i); + EXPECT_THAT(GetValueByKey(persistent_hash_map.get(), key), IsOkAndHolds(i)); + } +} + TEST_F(PersistentHashMapTest, GetOrPutShouldPutIfKeyDoesNotExist) { // Create new persistent hash map ICING_ASSERT_OK_AND_ASSIGN( std::unique_ptr<PersistentHashMap> persistent_hash_map, PersistentHashMap::Create(filesystem_, base_dir_, - /*value_type_size=*/sizeof(int))); + /*value_type_size=*/sizeof(int), + PersistentHashMap::kDefaultMaxLoadFactorPercent, + kTestInitNumBuckets)); ASSERT_THAT(GetValueByKey(persistent_hash_map.get(), "default-google.com"), StatusIs(libtextclassifier3::StatusCode::NOT_FOUND)); @@ -648,7 +787,9 @@ TEST_F(PersistentHashMapTest, GetOrPutShouldGetIfKeyExists) { ICING_ASSERT_OK_AND_ASSIGN( std::unique_ptr<PersistentHashMap> persistent_hash_map, PersistentHashMap::Create(filesystem_, base_dir_, - /*value_type_size=*/sizeof(int))); + /*value_type_size=*/sizeof(int), + PersistentHashMap::kDefaultMaxLoadFactorPercent, + kTestInitNumBuckets)); ASSERT_THAT( persistent_hash_map->Put("default-google.com", Serialize(1).data()), @@ -670,7 +811,9 @@ TEST_F(PersistentHashMapTest, Delete) { ICING_ASSERT_OK_AND_ASSIGN( std::unique_ptr<PersistentHashMap> persistent_hash_map, PersistentHashMap::Create(filesystem_, base_dir_, - /*value_type_size=*/sizeof(int))); + /*value_type_size=*/sizeof(int), + PersistentHashMap::kDefaultMaxLoadFactorPercent, + kTestInitNumBuckets)); // Delete a non-existing key should get NOT_FOUND error EXPECT_THAT(persistent_hash_map->Delete("default-google.com"), @@ -714,7 +857,9 @@ TEST_F(PersistentHashMapTest, DeleteMultiple) { ICING_ASSERT_OK_AND_ASSIGN( std::unique_ptr<PersistentHashMap> persistent_hash_map, PersistentHashMap::Create(filesystem_, base_dir_, - /*value_type_size=*/sizeof(int))); + /*value_type_size=*/sizeof(int), + PersistentHashMap::kDefaultMaxLoadFactorPercent, + kTestInitNumBuckets)); std::unordered_map<std::string, int> existing_keys; std::unordered_set<std::string> deleted_keys; @@ -766,7 +911,8 @@ TEST_F(PersistentHashMapTest, DeleteBucketHeadElement) { std::unique_ptr<PersistentHashMap> persistent_hash_map, PersistentHashMap::Create(filesystem_, base_dir_, /*value_type_size=*/sizeof(int), - /*max_load_factor_percent=*/1000)); + /*max_load_factor_percent=*/1000, + kTestInitNumBuckets)); ICING_ASSERT_OK( persistent_hash_map->Put("default-google.com-0", Serialize(0).data())); @@ -799,7 +945,8 @@ TEST_F(PersistentHashMapTest, DeleteBucketIntermediateElement) { std::unique_ptr<PersistentHashMap> persistent_hash_map, PersistentHashMap::Create(filesystem_, base_dir_, /*value_type_size=*/sizeof(int), - /*max_load_factor_percent=*/1000)); + /*max_load_factor_percent=*/1000, + kTestInitNumBuckets)); ICING_ASSERT_OK( persistent_hash_map->Put("default-google.com-0", Serialize(0).data())); @@ -831,7 +978,8 @@ TEST_F(PersistentHashMapTest, DeleteBucketTailElement) { std::unique_ptr<PersistentHashMap> persistent_hash_map, PersistentHashMap::Create(filesystem_, base_dir_, /*value_type_size=*/sizeof(int), - /*max_load_factor_percent=*/1000)); + /*max_load_factor_percent=*/1000, + kTestInitNumBuckets)); ICING_ASSERT_OK( persistent_hash_map->Put("default-google.com-0", Serialize(0).data())); @@ -864,7 +1012,8 @@ TEST_F(PersistentHashMapTest, DeleteBucketOnlySingleElement) { std::unique_ptr<PersistentHashMap> persistent_hash_map, PersistentHashMap::Create(filesystem_, base_dir_, /*value_type_size=*/sizeof(int), - /*max_load_factor_percent=*/1000)); + /*max_load_factor_percent=*/1000, + kTestInitNumBuckets)); ICING_ASSERT_OK( persistent_hash_map->Put("default-google.com", Serialize(100).data())); @@ -903,7 +1052,9 @@ TEST_F(PersistentHashMapTest, EmptyHashMapIterator) { ICING_ASSERT_OK_AND_ASSIGN( std::unique_ptr<PersistentHashMap> persistent_hash_map, PersistentHashMap::Create(filesystem_, base_dir_, - /*value_type_size=*/sizeof(int))); + /*value_type_size=*/sizeof(int), + PersistentHashMap::kDefaultMaxLoadFactorPercent, + kTestInitNumBuckets)); EXPECT_FALSE(persistent_hash_map->GetIterator().Advance()); } @@ -913,7 +1064,9 @@ TEST_F(PersistentHashMapTest, Iterator) { ICING_ASSERT_OK_AND_ASSIGN( std::unique_ptr<PersistentHashMap> persistent_hash_map, PersistentHashMap::Create(filesystem_, base_dir_, - /*value_type_size=*/sizeof(int))); + /*value_type_size=*/sizeof(int), + PersistentHashMap::kDefaultMaxLoadFactorPercent, + kTestInitNumBuckets)); std::unordered_map<std::string, int> kvps; // Insert 100 key value pairs @@ -933,7 +1086,9 @@ TEST_F(PersistentHashMapTest, IteratorAfterDeletingFirstKeyValuePair) { ICING_ASSERT_OK_AND_ASSIGN( std::unique_ptr<PersistentHashMap> persistent_hash_map, PersistentHashMap::Create(filesystem_, base_dir_, - /*value_type_size=*/sizeof(int))); + /*value_type_size=*/sizeof(int), + PersistentHashMap::kDefaultMaxLoadFactorPercent, + kTestInitNumBuckets)); ICING_ASSERT_OK( persistent_hash_map->Put("default-google.com-0", Serialize(0).data())); @@ -955,7 +1110,9 @@ TEST_F(PersistentHashMapTest, IteratorAfterDeletingIntermediateKeyValuePair) { ICING_ASSERT_OK_AND_ASSIGN( std::unique_ptr<PersistentHashMap> persistent_hash_map, PersistentHashMap::Create(filesystem_, base_dir_, - /*value_type_size=*/sizeof(int))); + /*value_type_size=*/sizeof(int), + PersistentHashMap::kDefaultMaxLoadFactorPercent, + kTestInitNumBuckets)); ICING_ASSERT_OK( persistent_hash_map->Put("default-google.com-0", Serialize(0).data())); @@ -977,7 +1134,9 @@ TEST_F(PersistentHashMapTest, IteratorAfterDeletingLastKeyValuePair) { ICING_ASSERT_OK_AND_ASSIGN( std::unique_ptr<PersistentHashMap> persistent_hash_map, PersistentHashMap::Create(filesystem_, base_dir_, - /*value_type_size=*/sizeof(int))); + /*value_type_size=*/sizeof(int), + PersistentHashMap::kDefaultMaxLoadFactorPercent, + kTestInitNumBuckets)); ICING_ASSERT_OK( persistent_hash_map->Put("default-google.com-0", Serialize(0).data())); @@ -999,7 +1158,9 @@ TEST_F(PersistentHashMapTest, IteratorAfterDeletingAllKeyValuePairs) { ICING_ASSERT_OK_AND_ASSIGN( std::unique_ptr<PersistentHashMap> persistent_hash_map, PersistentHashMap::Create(filesystem_, base_dir_, - /*value_type_size=*/sizeof(int))); + /*value_type_size=*/sizeof(int), + PersistentHashMap::kDefaultMaxLoadFactorPercent, + kTestInitNumBuckets)); ICING_ASSERT_OK( persistent_hash_map->Put("default-google.com-0", Serialize(0).data())); |