diff options
author | Tim Barron <tjbarron@google.com> | 2022-12-12 18:03:05 -0800 |
---|---|---|
committer | Tim Barron <tjbarron@google.com> | 2022-12-12 18:03:05 -0800 |
commit | 8ddc32ad433ea147de80dcfac2afe58962360f18 (patch) | |
tree | 50c30cb98396499bf1d6caf33b383f7f4bbc7e58 /icing/store | |
parent | 2658f90984737e5bf6c76d82024103dccd4d51c6 (diff) | |
download | icing-8ddc32ad433ea147de80dcfac2afe58962360f18.tar.gz |
Sync from upstream.
Descriptions:
======================================================================
Add ScoringSpec into JoinSpec. Rename joined_document to child_document.
======================================================================
Create JoinedScoredDocumentHit class and refactor ScoredDocumentHitsRanker.
======================================================================
Implement initial Join workflow
======================================================================
Implement the Lexer for Icing Advanced Query Language
======================================================================
Create struct Options for PersistentHashMap
======================================================================
Premapping FileBackedVector
======================================================================
Create class PersistentHashMapKeyMapper
======================================================================
Add integer sections into TokenizedDocument and rename string sections
======================================================================
Create NumericIndex interface and DocHitInfoIteratorNumeric
======================================================================
Implement DummyNumericIndex and unit test
======================================================================
Change PostingListAccessor::Finalize to rvalue member function
======================================================================
Define the Abstract Syntax Tree for Icing's list_filter parser.
======================================================================
Refactor query processing and score
======================================================================
Refactor IcingSearchEngine for AppSearch Dynamite Module 0p APIs
======================================================================
Implement the Lexer for Icing Advanced Scoring Language
======================================================================
Add a common interface for IcingSearchEngine and dynamite client
======================================================================
Implement a subset of the query grammar.
======================================================================
Refactor index processor
======================================================================
Add integer index into IcingSearchEngine and IndexProcessor
======================================================================
Implement the parser for Icing Advanced Scoring Language
======================================================================
Implement IntegerIndexData and PostingListUsedIntegerIndexDataSerializer
======================================================================
Add PostingListAccessor abstract class for common components and methods
======================================================================
Implement PostingListIntegerIndexDataAccessor
======================================================================
Create PostingListIntegerIndexDataAccessorTest
======================================================================
Fix Icing Segmentation tests for word connectors that changed in ICU 72.
======================================================================
Modify the Advanced Query grammar to allow functions to accept expressions.
======================================================================
Implement QueryVisitor.
======================================================================
Enable the Advanced Query Parser to handle member functions
======================================================================
Refactor the Scorer class to support the Advanced Scoring Language
======================================================================
Integrate advanced query parser with the query processor.
======================================================================
Implement support for JoinSpec in Icing.
======================================================================
Implement the Advanced Scoring Language for basic functions and operators
======================================================================
Bug: 208654892
Bug: 249829533
Bug: 256022027
Bug: 261474063
Bug: 240333360
Bug: 193919210
Change-Id: I5f5bdc6249282ecc4b014b4fbdf8e2d1f8b20c19
Diffstat (limited to 'icing/store')
-rw-r--r-- | icing/store/document-store.cc | 4 | ||||
-rw-r--r-- | icing/store/dynamic-trie-key-mapper.h | 7 | ||||
-rw-r--r-- | icing/store/dynamic-trie-key-mapper_test.cc | 167 | ||||
-rw-r--r-- | icing/store/key-mapper_benchmark.cc | 316 | ||||
-rw-r--r-- | icing/store/key-mapper_test.cc | 215 | ||||
-rw-r--r-- | icing/store/persistent-hash-map-key-mapper.h | 209 | ||||
-rw-r--r-- | icing/store/persistent-hash-map-key-mapper_test.cc | 52 |
7 files changed, 811 insertions, 159 deletions
diff --git a/icing/store/document-store.cc b/icing/store/document-store.cc index 9a33682..62599c8 100644 --- a/icing/store/document-store.cc +++ b/icing/store/document-store.cc @@ -1678,8 +1678,8 @@ DocumentStore::OptimizeInto(const std::string& new_directory, } TokenizedDocument tokenized_document( std::move(tokenized_document_or).ValueOrDie()); - new_document_id_or = - new_doc_store->Put(document_to_keep, tokenized_document.num_tokens()); + new_document_id_or = new_doc_store->Put( + document_to_keep, tokenized_document.num_string_tokens()); } else { // TODO(b/144458732): Implement a more robust version of // TC_ASSIGN_OR_RETURN that can support error logging. diff --git a/icing/store/dynamic-trie-key-mapper.h b/icing/store/dynamic-trie-key-mapper.h index 35d2200..63e8488 100644 --- a/icing/store/dynamic-trie-key-mapper.h +++ b/icing/store/dynamic-trie-key-mapper.h @@ -60,12 +60,15 @@ class DynamicTrieKeyMapper : public KeyMapper<T, Formatter> { Create(const Filesystem& filesystem, std::string_view base_dir, int maximum_size_bytes); - // Deletes all the files associated with the DynamicTrieKeyMapper. Returns - // success or any encountered IO errors + // Deletes all the files associated with the DynamicTrieKeyMapper. // // base_dir : Base directory used to save all the files required to persist // DynamicTrieKeyMapper. Should be the same as passed into // Create(). + // + // Returns + // OK on success + // INTERNAL_ERROR on I/O error static libtextclassifier3::Status Delete(const Filesystem& filesystem, std::string_view base_dir); diff --git a/icing/store/dynamic-trie-key-mapper_test.cc b/icing/store/dynamic-trie-key-mapper_test.cc index add88bb..fd56170 100644 --- a/icing/store/dynamic-trie-key-mapper_test.cc +++ b/icing/store/dynamic-trie-key-mapper_test.cc @@ -14,21 +14,21 @@ #include "icing/store/dynamic-trie-key-mapper.h" +#include <string> + +#include "icing/text_classifier/lib3/utils/base/status.h" #include "gmock/gmock.h" #include "gtest/gtest.h" +#include "icing/file/filesystem.h" #include "icing/store/document-id.h" #include "icing/testing/common-matchers.h" #include "icing/testing/tmp-directory.h" -using ::testing::_; -using ::testing::HasSubstr; -using ::testing::IsEmpty; -using ::testing::Pair; -using ::testing::UnorderedElementsAre; - namespace icing { namespace lib { + namespace { + constexpr int kMaxDynamicTrieKeyMapperSize = 3 * 1024 * 1024; // 3 MiB class DynamicTrieKeyMapperTest : public testing::Test { @@ -43,168 +43,25 @@ class DynamicTrieKeyMapperTest : public testing::Test { Filesystem filesystem_; }; -std::unordered_map<std::string, DocumentId> GetAllKeyValuePairs( - const DynamicTrieKeyMapper<DocumentId>* key_mapper) { - std::unordered_map<std::string, DocumentId> ret; - - std::unique_ptr<typename KeyMapper<DocumentId>::Iterator> itr = - key_mapper->GetIterator(); - while (itr->Advance()) { - ret.emplace(itr->GetKey(), itr->GetValue()); - } - return ret; -} - TEST_F(DynamicTrieKeyMapperTest, InvalidBaseDir) { - ASSERT_THAT(DynamicTrieKeyMapper<DocumentId>::Create( - filesystem_, "/dev/null", kMaxDynamicTrieKeyMapperSize) - .status() - .error_message(), - HasSubstr("Failed to create DynamicTrieKeyMapper")); + EXPECT_THAT(DynamicTrieKeyMapper<DocumentId>::Create( + filesystem_, "/dev/null", kMaxDynamicTrieKeyMapperSize), + StatusIs(libtextclassifier3::StatusCode::INTERNAL)); } TEST_F(DynamicTrieKeyMapperTest, NegativeMaxKeyMapperSizeReturnsInternalError) { - ASSERT_THAT( + EXPECT_THAT( DynamicTrieKeyMapper<DocumentId>::Create(filesystem_, base_dir_, -1), StatusIs(libtextclassifier3::StatusCode::INTERNAL)); } TEST_F(DynamicTrieKeyMapperTest, TooLargeMaxKeyMapperSizeReturnsInternalError) { - ASSERT_THAT(DynamicTrieKeyMapper<DocumentId>::Create( + EXPECT_THAT(DynamicTrieKeyMapper<DocumentId>::Create( filesystem_, base_dir_, std::numeric_limits<int>::max()), StatusIs(libtextclassifier3::StatusCode::INTERNAL)); } -TEST_F(DynamicTrieKeyMapperTest, CreateNewKeyMapper) { - ICING_ASSERT_OK_AND_ASSIGN( - std::unique_ptr<DynamicTrieKeyMapper<DocumentId>> key_mapper, - DynamicTrieKeyMapper<DocumentId>::Create(filesystem_, base_dir_, - kMaxDynamicTrieKeyMapperSize)); - EXPECT_THAT(key_mapper->num_keys(), 0); -} - -TEST_F(DynamicTrieKeyMapperTest, CanUpdateSameKeyMultipleTimes) { - ICING_ASSERT_OK_AND_ASSIGN( - std::unique_ptr<DynamicTrieKeyMapper<DocumentId>> key_mapper, - DynamicTrieKeyMapper<DocumentId>::Create(filesystem_, base_dir_, - kMaxDynamicTrieKeyMapperSize)); - - ICING_EXPECT_OK(key_mapper->Put("default-google.com", 100)); - ICING_EXPECT_OK(key_mapper->Put("default-youtube.com", 50)); - - EXPECT_THAT(key_mapper->Get("default-google.com"), IsOkAndHolds(100)); - - ICING_EXPECT_OK(key_mapper->Put("default-google.com", 200)); - EXPECT_THAT(key_mapper->Get("default-google.com"), IsOkAndHolds(200)); - EXPECT_THAT(key_mapper->num_keys(), 2); - - ICING_EXPECT_OK(key_mapper->Put("default-google.com", 300)); - EXPECT_THAT(key_mapper->Get("default-google.com"), IsOkAndHolds(300)); - EXPECT_THAT(key_mapper->num_keys(), 2); -} - -TEST_F(DynamicTrieKeyMapperTest, GetOrPutOk) { - ICING_ASSERT_OK_AND_ASSIGN( - std::unique_ptr<DynamicTrieKeyMapper<DocumentId>> key_mapper, - DynamicTrieKeyMapper<DocumentId>::Create(filesystem_, base_dir_, - kMaxDynamicTrieKeyMapperSize)); - - EXPECT_THAT(key_mapper->Get("foo"), - StatusIs(libtextclassifier3::StatusCode::NOT_FOUND)); - EXPECT_THAT(key_mapper->GetOrPut("foo", 1), IsOkAndHolds(1)); - EXPECT_THAT(key_mapper->Get("foo"), IsOkAndHolds(1)); -} - -TEST_F(DynamicTrieKeyMapperTest, CanPersistToDiskRegularly) { - ICING_ASSERT_OK_AND_ASSIGN( - std::unique_ptr<DynamicTrieKeyMapper<DocumentId>> key_mapper, - DynamicTrieKeyMapper<DocumentId>::Create(filesystem_, base_dir_, - kMaxDynamicTrieKeyMapperSize)); - // Can persist an empty DynamicTrieKeyMapper. - ICING_EXPECT_OK(key_mapper->PersistToDisk()); - EXPECT_THAT(key_mapper->num_keys(), 0); - - // Can persist the smallest DynamicTrieKeyMapper. - ICING_EXPECT_OK(key_mapper->Put("default-google.com", 100)); - ICING_EXPECT_OK(key_mapper->PersistToDisk()); - EXPECT_THAT(key_mapper->num_keys(), 1); - EXPECT_THAT(key_mapper->Get("default-google.com"), IsOkAndHolds(100)); - - // Can continue to add keys after PersistToDisk(). - ICING_EXPECT_OK(key_mapper->Put("default-youtube.com", 200)); - EXPECT_THAT(key_mapper->num_keys(), 2); - EXPECT_THAT(key_mapper->Get("default-youtube.com"), IsOkAndHolds(200)); - - // Can continue to update the same key after PersistToDisk(). - ICING_EXPECT_OK(key_mapper->Put("default-google.com", 300)); - EXPECT_THAT(key_mapper->Get("default-google.com"), IsOkAndHolds(300)); - EXPECT_THAT(key_mapper->num_keys(), 2); -} - -TEST_F(DynamicTrieKeyMapperTest, CanUseAcrossMultipleInstances) { - ICING_ASSERT_OK_AND_ASSIGN( - std::unique_ptr<DynamicTrieKeyMapper<DocumentId>> key_mapper, - DynamicTrieKeyMapper<DocumentId>::Create(filesystem_, base_dir_, - kMaxDynamicTrieKeyMapperSize)); - ICING_EXPECT_OK(key_mapper->Put("default-google.com", 100)); - ICING_EXPECT_OK(key_mapper->PersistToDisk()); - - key_mapper.reset(); - ICING_ASSERT_OK_AND_ASSIGN( - key_mapper, DynamicTrieKeyMapper<DocumentId>::Create( - filesystem_, base_dir_, kMaxDynamicTrieKeyMapperSize)); - EXPECT_THAT(key_mapper->num_keys(), 1); - EXPECT_THAT(key_mapper->Get("default-google.com"), IsOkAndHolds(100)); - - // Can continue to read/write to the KeyMapper. - ICING_EXPECT_OK(key_mapper->Put("default-youtube.com", 200)); - ICING_EXPECT_OK(key_mapper->Put("default-google.com", 300)); - EXPECT_THAT(key_mapper->num_keys(), 2); - EXPECT_THAT(key_mapper->Get("default-youtube.com"), IsOkAndHolds(200)); - EXPECT_THAT(key_mapper->Get("default-google.com"), IsOkAndHolds(300)); -} - -TEST_F(DynamicTrieKeyMapperTest, CanDeleteAndRestartKeyMapping) { - // Can delete even if there's nothing there - ICING_EXPECT_OK( - DynamicTrieKeyMapper<DocumentId>::Delete(filesystem_, base_dir_)); - - ICING_ASSERT_OK_AND_ASSIGN( - std::unique_ptr<DynamicTrieKeyMapper<DocumentId>> key_mapper, - DynamicTrieKeyMapper<DocumentId>::Create(filesystem_, base_dir_, - kMaxDynamicTrieKeyMapperSize)); - ICING_EXPECT_OK(key_mapper->Put("default-google.com", 100)); - ICING_EXPECT_OK(key_mapper->PersistToDisk()); - ICING_EXPECT_OK( - DynamicTrieKeyMapper<DocumentId>::Delete(filesystem_, base_dir_)); - - key_mapper.reset(); - ICING_ASSERT_OK_AND_ASSIGN( - key_mapper, DynamicTrieKeyMapper<DocumentId>::Create( - filesystem_, base_dir_, kMaxDynamicTrieKeyMapperSize)); - EXPECT_THAT(key_mapper->num_keys(), 0); - ICING_EXPECT_OK(key_mapper->Put("default-google.com", 100)); - EXPECT_THAT(key_mapper->num_keys(), 1); -} - -TEST_F(DynamicTrieKeyMapperTest, Iterator) { - ICING_ASSERT_OK_AND_ASSIGN( - std::unique_ptr<DynamicTrieKeyMapper<DocumentId>> key_mapper, - DynamicTrieKeyMapper<DocumentId>::Create(filesystem_, base_dir_, - kMaxDynamicTrieKeyMapperSize)); - EXPECT_THAT(GetAllKeyValuePairs(key_mapper.get()), IsEmpty()); - - ICING_EXPECT_OK(key_mapper->Put("foo", /*value=*/1)); - ICING_EXPECT_OK(key_mapper->Put("bar", /*value=*/2)); - EXPECT_THAT(GetAllKeyValuePairs(key_mapper.get()), - UnorderedElementsAre(Pair("foo", 1), Pair("bar", 2))); - - ICING_EXPECT_OK(key_mapper->Put("baz", /*value=*/3)); - EXPECT_THAT( - GetAllKeyValuePairs(key_mapper.get()), - UnorderedElementsAre(Pair("foo", 1), Pair("bar", 2), Pair("baz", 3))); -} - } // namespace + } // namespace lib } // namespace icing diff --git a/icing/store/key-mapper_benchmark.cc b/icing/store/key-mapper_benchmark.cc new file mode 100644 index 0000000..b649bc7 --- /dev/null +++ b/icing/store/key-mapper_benchmark.cc @@ -0,0 +1,316 @@ +// Copyright (C) 2022 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include <random> +#include <string> +#include <unordered_map> + +#include "testing/base/public/benchmark.h" +#include "gmock/gmock.h" +#include "gtest/gtest.h" +#include "icing/absl_ports/str_cat.h" +#include "icing/file/destructible-directory.h" +#include "icing/file/filesystem.h" +#include "icing/store/dynamic-trie-key-mapper.h" +#include "icing/store/key-mapper.h" +#include "icing/store/persistent-hash-map-key-mapper.h" +#include "icing/testing/common-matchers.h" +#include "icing/testing/random-string.h" +#include "icing/testing/tmp-directory.h" + +namespace icing { +namespace lib { + +namespace { + +using ::testing::Eq; +using ::testing::Not; + +class KeyMapperBenchmark { + public: + static constexpr int kKeyLength = 20; + + explicit KeyMapperBenchmark() + : clock(std::make_unique<Clock>()), + base_dir(GetTestTempDir() + "/key_mapper_benchmark"), + random_engine(/*seed=*/12345) {} + + std::string GenerateUniqueRandomKeyValuePair(int val, + std::string_view prefix = "") { + std::string rand_str = absl_ports::StrCat( + prefix, RandomString(kAlNumAlphabet, kKeyLength, &random_engine)); + while (random_kvps_map.find(rand_str) != random_kvps_map.end()) { + rand_str = absl_ports::StrCat( + std::string(prefix), + RandomString(kAlNumAlphabet, kKeyLength, &random_engine)); + } + std::pair<std::string, int> entry(rand_str, val); + random_kvps.push_back(entry); + random_kvps_map.insert(entry); + return rand_str; + } + + template <typename UnknownKeyMapperType> + libtextclassifier3::StatusOr<std::unique_ptr<KeyMapper<int>>> CreateKeyMapper( + int max_num_entries) { + return absl_ports::InvalidArgumentError("Unknown type"); + } + + template <> + libtextclassifier3::StatusOr<std::unique_ptr<KeyMapper<int>>> + CreateKeyMapper<DynamicTrieKeyMapper<int>>(int max_num_entries) { + return DynamicTrieKeyMapper<int>::Create( + filesystem, base_dir, + /*maximum_size_bytes=*/128 * 1024 * 1024); + } + + template <> + libtextclassifier3::StatusOr<std::unique_ptr<KeyMapper<int>>> + CreateKeyMapper<PersistentHashMapKeyMapper<int>>(int max_num_entries) { + return PersistentHashMapKeyMapper<int>::Create( + filesystem, base_dir, max_num_entries, + /*average_kv_byte_size=*/kKeyLength + 1 + sizeof(int), + /*max_load_factor_percent=*/100); + } + + std::unique_ptr<Clock> clock; + + Filesystem filesystem; + std::string base_dir; + + std::default_random_engine random_engine; + std::vector<std::pair<std::string, int>> random_kvps; + std::unordered_map<std::string, int> random_kvps_map; +}; + +// Benchmark the total time of putting num_keys (specified by Arg) unique random +// key value pairs. +template <typename KeyMapperType> +void BM_PutMany(benchmark::State& state) { + int num_keys = state.range(0); + + KeyMapperBenchmark benchmark; + for (int i = 0; i < num_keys; ++i) { + benchmark.GenerateUniqueRandomKeyValuePair(i); + } + + for (auto _ : state) { + state.PauseTiming(); + benchmark.filesystem.DeleteDirectoryRecursively(benchmark.base_dir.c_str()); + DestructibleDirectory ddir(&benchmark.filesystem, benchmark.base_dir); + ICING_ASSERT_OK_AND_ASSIGN( + std::unique_ptr<KeyMapper<int>> key_mapper, + benchmark.CreateKeyMapper<KeyMapperType>(num_keys)); + ASSERT_THAT(key_mapper->num_keys(), Eq(0)); + state.ResumeTiming(); + + for (int i = 0; i < num_keys; ++i) { + ICING_ASSERT_OK(key_mapper->Put(benchmark.random_kvps[i].first, + benchmark.random_kvps[i].second)); + } + + // Explicit calls PersistToDisk. + ICING_ASSERT_OK(key_mapper->PersistToDisk()); + + state.PauseTiming(); + ASSERT_THAT(key_mapper->num_keys(), Eq(num_keys)); + // The destructor of IcingDynamicTrie doesn't implicitly call PersistToDisk, + // while PersistentHashMap does. Thus, we reset the unique pointer to invoke + // destructor in the pause timing block, so in this case PersistToDisk will + // be included into the benchmark only once. + key_mapper.reset(); + state.ResumeTiming(); + } +} +BENCHMARK(BM_PutMany<DynamicTrieKeyMapper<int>>) + ->Arg(1 << 10) + ->Arg(1 << 11) + ->Arg(1 << 12) + ->Arg(1 << 13) + ->Arg(1 << 14) + ->Arg(1 << 15) + ->Arg(1 << 16) + ->Arg(1 << 17) + ->Arg(1 << 18) + ->Arg(1 << 19) + ->Arg(1 << 20); +BENCHMARK(BM_PutMany<PersistentHashMapKeyMapper<int>>) + ->Arg(1 << 10) + ->Arg(1 << 11) + ->Arg(1 << 12) + ->Arg(1 << 13) + ->Arg(1 << 14) + ->Arg(1 << 15) + ->Arg(1 << 16) + ->Arg(1 << 17) + ->Arg(1 << 18) + ->Arg(1 << 19) + ->Arg(1 << 20); + +// Benchmark the average time of putting 1 unique random key value pair. The +// result will be affected by # of iterations, so use --benchmark_max_iters=k +// and --benchmark_min_iters=k to force # of iterations to be fixed. +template <typename KeyMapperType> +void BM_Put(benchmark::State& state) { + KeyMapperBenchmark benchmark; + benchmark.filesystem.DeleteDirectoryRecursively(benchmark.base_dir.c_str()); + DestructibleDirectory ddir(&benchmark.filesystem, benchmark.base_dir); + + // The overhead of state.PauseTiming is too large and affects the benchmark + // result a lot, so pre-generate enough kvps to avoid calling too many times + // state.PauseTiming for GenerateUniqueRandomKeyValuePair in the benchmark + // for-loop. + int MAX_PREGEN_KVPS = 1 << 22; + for (int i = 0; i < MAX_PREGEN_KVPS; ++i) { + benchmark.GenerateUniqueRandomKeyValuePair(i); + } + + ICING_ASSERT_OK_AND_ASSIGN( + std::unique_ptr<KeyMapper<int>> key_mapper, + benchmark.CreateKeyMapper<KeyMapperType>(/*max_num_entries=*/1 << 22)); + ASSERT_THAT(key_mapper->num_keys(), Eq(0)); + + int cnt = 0; + for (auto _ : state) { + if (cnt >= MAX_PREGEN_KVPS) { + state.PauseTiming(); + benchmark.GenerateUniqueRandomKeyValuePair(cnt); + state.ResumeTiming(); + } + + ICING_ASSERT_OK(key_mapper->Put(benchmark.random_kvps[cnt].first, + benchmark.random_kvps[cnt].second)); + ++cnt; + } +} +BENCHMARK(BM_Put<DynamicTrieKeyMapper<int>>); +BENCHMARK(BM_Put<PersistentHashMapKeyMapper<int>>); + +// Benchmark the average time of getting 1 existing key value pair from the key +// mapper with size num_keys (specified by Arg). +template <typename KeyMapperType> +void BM_Get(benchmark::State& state) { + int num_keys = state.range(0); + + KeyMapperBenchmark benchmark; + benchmark.filesystem.DeleteDirectoryRecursively(benchmark.base_dir.c_str()); + DestructibleDirectory ddir(&benchmark.filesystem, benchmark.base_dir); + + // Create a key mapper with num_keys entries. + ICING_ASSERT_OK_AND_ASSIGN( + std::unique_ptr<KeyMapper<int>> key_mapper, + benchmark.CreateKeyMapper<KeyMapperType>(num_keys)); + for (int i = 0; i < num_keys; ++i) { + ICING_ASSERT_OK( + key_mapper->Put(benchmark.GenerateUniqueRandomKeyValuePair(i), i)); + } + ASSERT_THAT(key_mapper->num_keys(), Eq(num_keys)); + + std::uniform_int_distribution<> distrib(0, num_keys - 1); + std::default_random_engine e(/*seed=*/12345); + for (auto _ : state) { + int idx = distrib(e); + ICING_ASSERT_OK_AND_ASSIGN( + int val, key_mapper->Get(benchmark.random_kvps[idx].first)); + ASSERT_THAT(val, Eq(benchmark.random_kvps[idx].second)); + } +} +BENCHMARK(BM_Get<DynamicTrieKeyMapper<int>>) + ->Arg(1 << 10) + ->Arg(1 << 11) + ->Arg(1 << 12) + ->Arg(1 << 13) + ->Arg(1 << 14) + ->Arg(1 << 15) + ->Arg(1 << 16) + ->Arg(1 << 17) + ->Arg(1 << 18) + ->Arg(1 << 19) + ->Arg(1 << 20); +BENCHMARK(BM_Get<PersistentHashMapKeyMapper<int>>) + ->Arg(1 << 10) + ->Arg(1 << 11) + ->Arg(1 << 12) + ->Arg(1 << 13) + ->Arg(1 << 14) + ->Arg(1 << 15) + ->Arg(1 << 16) + ->Arg(1 << 17) + ->Arg(1 << 18) + ->Arg(1 << 19) + ->Arg(1 << 20); + +// Benchmark the total time of iterating through all key value pairs of the key +// mapper with size num_keys (specified by Arg). +template <typename KeyMapperType> +void BM_Iterator(benchmark::State& state) { + int num_keys = state.range(0); + + KeyMapperBenchmark benchmark; + benchmark.filesystem.DeleteDirectoryRecursively(benchmark.base_dir.c_str()); + DestructibleDirectory ddir(&benchmark.filesystem, benchmark.base_dir); + + // Create a key mapper with num_keys entries. + ICING_ASSERT_OK_AND_ASSIGN( + std::unique_ptr<KeyMapper<int>> key_mapper, + benchmark.CreateKeyMapper<KeyMapperType>(num_keys)); + for (int i = 0; i < num_keys; ++i) { + ICING_ASSERT_OK( + key_mapper->Put(benchmark.GenerateUniqueRandomKeyValuePair(i), i)); + } + ASSERT_THAT(key_mapper->num_keys(), Eq(num_keys)); + + for (auto _ : state) { + auto iter = key_mapper->GetIterator(); + int cnt = 0; + while (iter->Advance()) { + ++cnt; + std::string key(iter->GetKey()); + int value = iter->GetValue(); + auto it = benchmark.random_kvps_map.find(key); + ASSERT_THAT(it, Not(Eq(benchmark.random_kvps_map.end()))); + ASSERT_THAT(it->second, Eq(value)); + } + ASSERT_THAT(cnt, Eq(num_keys)); + } +} +BENCHMARK(BM_Iterator<DynamicTrieKeyMapper<int>>) + ->Arg(1 << 10) + ->Arg(1 << 11) + ->Arg(1 << 12) + ->Arg(1 << 13) + ->Arg(1 << 14) + ->Arg(1 << 15) + ->Arg(1 << 16) + ->Arg(1 << 17) + ->Arg(1 << 18) + ->Arg(1 << 19) + ->Arg(1 << 20); +BENCHMARK(BM_Iterator<PersistentHashMapKeyMapper<int>>) + ->Arg(1 << 10) + ->Arg(1 << 11) + ->Arg(1 << 12) + ->Arg(1 << 13) + ->Arg(1 << 14) + ->Arg(1 << 15) + ->Arg(1 << 16) + ->Arg(1 << 17) + ->Arg(1 << 18) + ->Arg(1 << 19) + ->Arg(1 << 20); + +} // namespace + +} // namespace lib +} // namespace icing diff --git a/icing/store/key-mapper_test.cc b/icing/store/key-mapper_test.cc new file mode 100644 index 0000000..682888d --- /dev/null +++ b/icing/store/key-mapper_test.cc @@ -0,0 +1,215 @@ +// Copyright (C) 2022 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "icing/store/key-mapper.h" + +#include <memory> +#include <string> +#include <type_traits> +#include <unordered_map> + +#include "icing/text_classifier/lib3/utils/base/status.h" +#include "icing/text_classifier/lib3/utils/base/statusor.h" +#include "gmock/gmock.h" +#include "gtest/gtest.h" +#include "icing/absl_ports/canonical_errors.h" +#include "icing/file/filesystem.h" +#include "icing/store/document-id.h" +#include "icing/store/dynamic-trie-key-mapper.h" +#include "icing/store/persistent-hash-map-key-mapper.h" +#include "icing/testing/common-matchers.h" +#include "icing/testing/tmp-directory.h" + +using ::testing::IsEmpty; +using ::testing::Pair; +using ::testing::UnorderedElementsAre; + +namespace icing { +namespace lib { + +namespace { + +constexpr int kMaxDynamicTrieKeyMapperSize = 3 * 1024 * 1024; // 3 MiB + +template <typename T> +class KeyMapperTest : public ::testing::Test { + protected: + using KeyMapperType = T; + + void SetUp() override { base_dir_ = GetTestTempDir() + "/key_mapper"; } + + void TearDown() override { + filesystem_.DeleteDirectoryRecursively(base_dir_.c_str()); + } + + template <typename UnknownKeyMapperType> + libtextclassifier3::StatusOr<std::unique_ptr<KeyMapper<DocumentId>>> + CreateKeyMapper() { + return absl_ports::InvalidArgumentError("Unknown type"); + } + + template <> + libtextclassifier3::StatusOr<std::unique_ptr<KeyMapper<DocumentId>>> + CreateKeyMapper<DynamicTrieKeyMapper<DocumentId>>() { + return DynamicTrieKeyMapper<DocumentId>::Create( + filesystem_, base_dir_, kMaxDynamicTrieKeyMapperSize); + } + + template <> + libtextclassifier3::StatusOr<std::unique_ptr<KeyMapper<DocumentId>>> + CreateKeyMapper<PersistentHashMapKeyMapper<DocumentId>>() { + return PersistentHashMapKeyMapper<DocumentId>::Create(filesystem_, + base_dir_); + } + + std::string base_dir_; + Filesystem filesystem_; +}; + +using TestTypes = ::testing::Types<DynamicTrieKeyMapper<DocumentId>, + PersistentHashMapKeyMapper<DocumentId>>; +TYPED_TEST_SUITE(KeyMapperTest, TestTypes); + +std::unordered_map<std::string, DocumentId> GetAllKeyValuePairs( + const KeyMapper<DocumentId>* key_mapper) { + std::unordered_map<std::string, DocumentId> ret; + + std::unique_ptr<typename KeyMapper<DocumentId>::Iterator> itr = + key_mapper->GetIterator(); + while (itr->Advance()) { + ret.emplace(itr->GetKey(), itr->GetValue()); + } + return ret; +} + +TYPED_TEST(KeyMapperTest, CreateNewKeyMapper) { + ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<KeyMapper<DocumentId>> key_mapper, + this->template CreateKeyMapper<TypeParam>()); + EXPECT_THAT(key_mapper->num_keys(), 0); +} + +TYPED_TEST(KeyMapperTest, CanUpdateSameKeyMultipleTimes) { + ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<KeyMapper<DocumentId>> key_mapper, + this->template CreateKeyMapper<TypeParam>()); + + ICING_EXPECT_OK(key_mapper->Put("default-google.com", 100)); + ICING_EXPECT_OK(key_mapper->Put("default-youtube.com", 50)); + + EXPECT_THAT(key_mapper->Get("default-google.com"), IsOkAndHolds(100)); + + ICING_EXPECT_OK(key_mapper->Put("default-google.com", 200)); + EXPECT_THAT(key_mapper->Get("default-google.com"), IsOkAndHolds(200)); + EXPECT_THAT(key_mapper->num_keys(), 2); + + ICING_EXPECT_OK(key_mapper->Put("default-google.com", 300)); + EXPECT_THAT(key_mapper->Get("default-google.com"), IsOkAndHolds(300)); + EXPECT_THAT(key_mapper->num_keys(), 2); +} + +TYPED_TEST(KeyMapperTest, GetOrPutOk) { + ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<KeyMapper<DocumentId>> key_mapper, + this->template CreateKeyMapper<TypeParam>()); + + EXPECT_THAT(key_mapper->Get("foo"), + StatusIs(libtextclassifier3::StatusCode::NOT_FOUND)); + EXPECT_THAT(key_mapper->GetOrPut("foo", 1), IsOkAndHolds(1)); + EXPECT_THAT(key_mapper->Get("foo"), IsOkAndHolds(1)); +} + +TYPED_TEST(KeyMapperTest, CanPersistToDiskRegularly) { + ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<KeyMapper<DocumentId>> key_mapper, + this->template CreateKeyMapper<TypeParam>()); + + // Can persist an empty DynamicTrieKeyMapper. + ICING_EXPECT_OK(key_mapper->PersistToDisk()); + EXPECT_THAT(key_mapper->num_keys(), 0); + + // Can persist the smallest DynamicTrieKeyMapper. + ICING_EXPECT_OK(key_mapper->Put("default-google.com", 100)); + ICING_EXPECT_OK(key_mapper->PersistToDisk()); + EXPECT_THAT(key_mapper->num_keys(), 1); + EXPECT_THAT(key_mapper->Get("default-google.com"), IsOkAndHolds(100)); + + // Can continue to add keys after PersistToDisk(). + ICING_EXPECT_OK(key_mapper->Put("default-youtube.com", 200)); + EXPECT_THAT(key_mapper->num_keys(), 2); + EXPECT_THAT(key_mapper->Get("default-youtube.com"), IsOkAndHolds(200)); + + // Can continue to update the same key after PersistToDisk(). + ICING_EXPECT_OK(key_mapper->Put("default-google.com", 300)); + EXPECT_THAT(key_mapper->Get("default-google.com"), IsOkAndHolds(300)); + EXPECT_THAT(key_mapper->num_keys(), 2); +} + +TYPED_TEST(KeyMapperTest, CanUseAcrossMultipleInstances) { + ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<KeyMapper<DocumentId>> key_mapper, + this->template CreateKeyMapper<TypeParam>()); + ICING_EXPECT_OK(key_mapper->Put("default-google.com", 100)); + ICING_EXPECT_OK(key_mapper->PersistToDisk()); + + key_mapper.reset(); + + ICING_ASSERT_OK_AND_ASSIGN(key_mapper, + this->template CreateKeyMapper<TypeParam>()); + EXPECT_THAT(key_mapper->num_keys(), 1); + EXPECT_THAT(key_mapper->Get("default-google.com"), IsOkAndHolds(100)); + + // Can continue to read/write to the KeyMapper. + ICING_EXPECT_OK(key_mapper->Put("default-youtube.com", 200)); + ICING_EXPECT_OK(key_mapper->Put("default-google.com", 300)); + EXPECT_THAT(key_mapper->num_keys(), 2); + EXPECT_THAT(key_mapper->Get("default-youtube.com"), IsOkAndHolds(200)); + EXPECT_THAT(key_mapper->Get("default-google.com"), IsOkAndHolds(300)); +} + +TYPED_TEST(KeyMapperTest, CanDeleteAndRestartKeyMapping) { + // Can delete even if there's nothing there + ICING_EXPECT_OK( + TestFixture::KeyMapperType::Delete(this->filesystem_, this->base_dir_)); + + ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<KeyMapper<DocumentId>> key_mapper, + this->template CreateKeyMapper<TypeParam>()); + ICING_EXPECT_OK(key_mapper->Put("default-google.com", 100)); + ICING_EXPECT_OK(key_mapper->PersistToDisk()); + ICING_EXPECT_OK( + TestFixture::KeyMapperType::Delete(this->filesystem_, this->base_dir_)); + + key_mapper.reset(); + ICING_ASSERT_OK_AND_ASSIGN(key_mapper, + this->template CreateKeyMapper<TypeParam>()); + EXPECT_THAT(key_mapper->num_keys(), 0); + ICING_EXPECT_OK(key_mapper->Put("default-google.com", 100)); + EXPECT_THAT(key_mapper->num_keys(), 1); +} + +TYPED_TEST(KeyMapperTest, Iterator) { + ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<KeyMapper<DocumentId>> key_mapper, + this->template CreateKeyMapper<TypeParam>()); + EXPECT_THAT(GetAllKeyValuePairs(key_mapper.get()), IsEmpty()); + + ICING_EXPECT_OK(key_mapper->Put("foo", /*value=*/1)); + ICING_EXPECT_OK(key_mapper->Put("bar", /*value=*/2)); + EXPECT_THAT(GetAllKeyValuePairs(key_mapper.get()), + UnorderedElementsAre(Pair("foo", 1), Pair("bar", 2))); + + ICING_EXPECT_OK(key_mapper->Put("baz", /*value=*/3)); + EXPECT_THAT( + GetAllKeyValuePairs(key_mapper.get()), + UnorderedElementsAre(Pair("foo", 1), Pair("bar", 2), Pair("baz", 3))); +} + +} // namespace + +} // namespace lib +} // namespace icing diff --git a/icing/store/persistent-hash-map-key-mapper.h b/icing/store/persistent-hash-map-key-mapper.h new file mode 100644 index 0000000..a13ec11 --- /dev/null +++ b/icing/store/persistent-hash-map-key-mapper.h @@ -0,0 +1,209 @@ +// Copyright (C) 2022 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef ICING_STORE_PERSISTENT_HASH_MAP_KEY_MAPPER_H_ +#define ICING_STORE_PERSISTENT_HASH_MAP_KEY_MAPPER_H_ + +#include <cstdint> +#include <memory> +#include <string> +#include <string_view> +#include <type_traits> + +#include "icing/text_classifier/lib3/utils/base/status.h" +#include "icing/text_classifier/lib3/utils/base/statusor.h" +#include "icing/absl_ports/str_join.h" +#include "icing/file/filesystem.h" +#include "icing/file/persistent-hash-map.h" +#include "icing/store/key-mapper.h" +#include "icing/util/crc32.h" +#include "icing/util/status-macros.h" + +namespace icing { +namespace lib { + +// File-backed mapping between the string key and a trivially copyable value +// type. +template <typename T, typename Formatter = absl_ports::DefaultFormatter> +class PersistentHashMapKeyMapper : public KeyMapper<T, Formatter> { + public: + // Returns an initialized instance of PersistentHashMapKeyMapper that can + // immediately handle read/write operations. + // Returns any encountered IO errors. + // + // filesystem: Object to make system level calls + // base_dir : Base directory used to save all the files required to persist + // PersistentHashMapKeyMapper. If this base_dir was previously used + // to create a PersistentHashMapKeyMapper, then this existing data + // would be loaded. Otherwise, an empty PersistentHashMapKeyMapper + // would be created. + // max_num_entries: max # of kvps. It will be used to compute 3 storages size. + // average_kv_byte_size: average byte size of a single key + serialized value. + // It will be used to compute kv_storage size. + // max_load_factor_percent: percentage of the max loading for the hash map. + // load_factor_percent = 100 * num_keys / num_buckets + // If load_factor_percent exceeds + // max_load_factor_percent, then rehash will be + // invoked (and # of buckets will be doubled). + // Note that load_factor_percent exceeding 100 is + // considered valid. + static libtextclassifier3::StatusOr< + std::unique_ptr<PersistentHashMapKeyMapper<T, Formatter>>> + Create(const Filesystem& filesystem, std::string_view base_dir, + int32_t max_num_entries = PersistentHashMap::Entry::kMaxNumEntries, + int32_t average_kv_byte_size = + PersistentHashMap::Options::kDefaultAverageKVByteSize, + int32_t max_load_factor_percent = + PersistentHashMap::Options::kDefaultMaxLoadFactorPercent); + + // Deletes all the files associated with the PersistentHashMapKeyMapper. + // + // base_dir : Base directory used to save all the files required to persist + // PersistentHashMapKeyMapper. Should be the same as passed into + // Create(). + // + // Returns: + // OK on success + // INTERNAL_ERROR on I/O error + static libtextclassifier3::Status Delete(const Filesystem& filesystem, + std::string_view base_dir); + + ~PersistentHashMapKeyMapper() override = default; + + libtextclassifier3::Status Put(std::string_view key, T value) override { + return persistent_hash_map_->Put(key, &value); + } + + libtextclassifier3::StatusOr<T> GetOrPut(std::string_view key, + T next_value) override { + ICING_RETURN_IF_ERROR(persistent_hash_map_->GetOrPut(key, &next_value)); + return next_value; + } + + libtextclassifier3::StatusOr<T> Get(std::string_view key) const override { + T value; + ICING_RETURN_IF_ERROR(persistent_hash_map_->Get(key, &value)); + return value; + } + + bool Delete(std::string_view key) override { + return persistent_hash_map_->Delete(key).ok(); + } + + std::unique_ptr<typename KeyMapper<T, Formatter>::Iterator> GetIterator() + const override { + return std::make_unique<PersistentHashMapKeyMapper<T, Formatter>::Iterator>( + persistent_hash_map_.get()); + } + + int32_t num_keys() const override { return persistent_hash_map_->size(); } + + libtextclassifier3::Status PersistToDisk() override { + return persistent_hash_map_->PersistToDisk(); + } + + libtextclassifier3::StatusOr<int64_t> GetDiskUsage() const override { + return persistent_hash_map_->GetDiskUsage(); + } + + libtextclassifier3::StatusOr<int64_t> GetElementsSize() const override { + return persistent_hash_map_->GetElementsSize(); + } + + libtextclassifier3::StatusOr<Crc32> ComputeChecksum() override { + return persistent_hash_map_->ComputeChecksum(); + } + + private: + class Iterator : public KeyMapper<T, Formatter>::Iterator { + public: + explicit Iterator(const PersistentHashMap* persistent_hash_map) + : itr_(persistent_hash_map->GetIterator()) {} + + ~Iterator() override = default; + + bool Advance() override { return itr_.Advance(); } + + std::string_view GetKey() const override { return itr_.GetKey(); } + + T GetValue() const override { + T value; + memcpy(&value, itr_.GetValue(), sizeof(T)); + return value; + } + + private: + PersistentHashMap::Iterator itr_; + }; + + static constexpr std::string_view kKeyMapperDir = "key_mapper_dir"; + + // Use PersistentHashMapKeyMapper::Create() to instantiate. + explicit PersistentHashMapKeyMapper( + std::unique_ptr<PersistentHashMap> persistent_hash_map) + : persistent_hash_map_(std::move(persistent_hash_map)) {} + + std::unique_ptr<PersistentHashMap> persistent_hash_map_; + + static_assert(std::is_trivially_copyable<T>::value, + "T must be trivially copyable"); +}; + +template <typename T, typename Formatter> +/* static */ libtextclassifier3::StatusOr< + std::unique_ptr<PersistentHashMapKeyMapper<T, Formatter>>> +PersistentHashMapKeyMapper<T, Formatter>::Create( + const Filesystem& filesystem, std::string_view base_dir, + int32_t max_num_entries, int32_t average_kv_byte_size, + int32_t max_load_factor_percent) { + const std::string key_mapper_dir = + absl_ports::StrCat(base_dir, "/", kKeyMapperDir); + if (!filesystem.CreateDirectoryRecursively(key_mapper_dir.c_str())) { + return absl_ports::InternalError(absl_ports::StrCat( + "Failed to create PersistentHashMapKeyMapper directory: ", + key_mapper_dir)); + } + + ICING_ASSIGN_OR_RETURN( + std::unique_ptr<PersistentHashMap> persistent_hash_map, + PersistentHashMap::Create( + filesystem, key_mapper_dir, + PersistentHashMap::Options( + /*value_type_size_in=*/sizeof(T), + /*max_num_entries_in=*/max_num_entries, + /*max_load_factor_percent_in=*/max_load_factor_percent, + /*average_kv_byte_size_in=*/average_kv_byte_size))); + return std::unique_ptr<PersistentHashMapKeyMapper<T, Formatter>>( + new PersistentHashMapKeyMapper<T, Formatter>( + std::move(persistent_hash_map))); +} + +template <typename T, typename Formatter> +/* static */ libtextclassifier3::Status +PersistentHashMapKeyMapper<T, Formatter>::Delete(const Filesystem& filesystem, + std::string_view base_dir) { + const std::string key_mapper_dir = + absl_ports::StrCat(base_dir, "/", kKeyMapperDir); + if (!filesystem.DeleteDirectoryRecursively(key_mapper_dir.c_str())) { + return absl_ports::InternalError(absl_ports::StrCat( + "Failed to delete PersistentHashMapKeyMapper directory: ", + key_mapper_dir)); + } + return libtextclassifier3::Status::OK; +} + +} // namespace lib +} // namespace icing + +#endif // ICING_STORE_PERSISTENT_HASH_MAP_KEY_MAPPER_H_ diff --git a/icing/store/persistent-hash-map-key-mapper_test.cc b/icing/store/persistent-hash-map-key-mapper_test.cc new file mode 100644 index 0000000..c937c43 --- /dev/null +++ b/icing/store/persistent-hash-map-key-mapper_test.cc @@ -0,0 +1,52 @@ +// Copyright (C) 2022 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "icing/store/persistent-hash-map-key-mapper.h" + +#include <string> + +#include "gmock/gmock.h" +#include "gtest/gtest.h" +#include "icing/file/filesystem.h" +#include "icing/store/document-id.h" +#include "icing/testing/common-matchers.h" +#include "icing/testing/tmp-directory.h" + +namespace icing { +namespace lib { + +namespace { + +class PersistentHashMapKeyMapperTest : public testing::Test { + protected: + void SetUp() override { base_dir_ = GetTestTempDir() + "/key_mapper"; } + + void TearDown() override { + filesystem_.DeleteDirectoryRecursively(base_dir_.c_str()); + } + + std::string base_dir_; + Filesystem filesystem_; +}; + +TEST_F(PersistentHashMapKeyMapperTest, InvalidBaseDir) { + EXPECT_THAT( + PersistentHashMapKeyMapper<DocumentId>::Create(filesystem_, "/dev/null"), + StatusIs(libtextclassifier3::StatusCode::INTERNAL)); +} + +} // namespace + +} // namespace lib +} // namespace icing |