diff options
Diffstat (limited to 'icing/store')
-rw-r--r-- | icing/store/document-store.cc | 63 | ||||
-rw-r--r-- | icing/store/document-store.h | 23 | ||||
-rw-r--r-- | icing/store/document-store_test.cc | 27 |
3 files changed, 63 insertions, 50 deletions
diff --git a/icing/store/document-store.cc b/icing/store/document-store.cc index 35ee172..710ff58 100644 --- a/icing/store/document-store.cc +++ b/icing/store/document-store.cc @@ -89,17 +89,6 @@ constexpr int32_t kUriMapperMaxSize = 36 * 1024 * 1024; // 36 MiB constexpr int32_t kNamespaceMapperMaxSize = 3 * 128 * 1024; // 384 KiB constexpr int32_t kCorpusMapperMaxSize = 3 * 128 * 1024; // 384 KiB -// Whether to use namespace id or namespace name to build up fingerprint for -// document_key_mapper_ and corpus_mapper_. -// Note: Changing this flag will require a reconstruction of the internal -// mappers in the document store. A easy way to trigger a rebuild is to change -// the kMagic value. -// -// TODO(b/259969017) Flip this flag to true at the time when we switch to use -// persistent hash map for document_key_mapper_ so that we just need one -// reconstruction of the internal mappers. -constexpr bool kNamespaceIdFingerprint = false; - DocumentWrapper CreateDocumentWrapper(DocumentProto&& document) { DocumentWrapper document_wrapper; *document_wrapper.mutable_document() = std::move(document); @@ -157,23 +146,6 @@ std::string EncodeNamespaceId(NamespaceId namespace_id) { return encoding; } -std::string MakeFingerprint(NamespaceId namespace_id, - std::string_view namespace_, - std::string_view uri_or_schema) { - if (!kNamespaceIdFingerprint) { - // Using a 64-bit fingerprint to represent the key could lead to collisions. - // But, even with 200K unique keys, the probability of collision is about - // one-in-a-billion (https://en.wikipedia.org/wiki/Birthday_attack). - uint64_t fprint = tc3farmhash::Fingerprint64( - absl_ports::StrCat(namespace_, uri_or_schema)); - return fingerprint_util::GetFingerprintString(fprint); - } else { - return absl_ports::StrCat(EncodeNamespaceId(namespace_id), - encode_util::EncodeIntToCString( - tc3farmhash::Fingerprint64(uri_or_schema))); - } -} - int64_t CalculateExpirationTimestampMs(int64_t creation_timestamp_ms, int64_t ttl_ms) { if (ttl_ms == 0) { @@ -236,15 +208,34 @@ std::unordered_map<NamespaceId, std::string> GetNamespaceIdsToNamespaces( } // namespace +std::string DocumentStore::MakeFingerprint( + NamespaceId namespace_id, std::string_view namespace_, + std::string_view uri_or_schema) const { + if (!namespace_id_fingerprint_) { + // Using a 64-bit fingerprint to represent the key could lead to collisions. + // But, even with 200K unique keys, the probability of collision is about + // one-in-a-billion (https://en.wikipedia.org/wiki/Birthday_attack). + uint64_t fprint = tc3farmhash::Fingerprint64( + absl_ports::StrCat(namespace_, uri_or_schema)); + return fingerprint_util::GetFingerprintString(fprint); + } else { + return absl_ports::StrCat(EncodeNamespaceId(namespace_id), + encode_util::EncodeIntToCString( + tc3farmhash::Fingerprint64(uri_or_schema))); + } +} + DocumentStore::DocumentStore(const Filesystem* filesystem, const std::string_view base_dir, const Clock* clock, - const SchemaStore* schema_store) + const SchemaStore* schema_store, + bool namespace_id_fingerprint) : filesystem_(filesystem), base_dir_(base_dir), clock_(*clock), schema_store_(schema_store), - document_validator_(schema_store) {} + document_validator_(schema_store), + namespace_id_fingerprint_(namespace_id_fingerprint) {} libtextclassifier3::StatusOr<DocumentId> DocumentStore::Put( const DocumentProto& document, int32_t num_tokens, @@ -271,14 +262,14 @@ DocumentStore::~DocumentStore() { libtextclassifier3::StatusOr<DocumentStore::CreateResult> DocumentStore::Create( const Filesystem* filesystem, const std::string& base_dir, const Clock* clock, const SchemaStore* schema_store, - bool force_recovery_and_revalidate_documents, + bool force_recovery_and_revalidate_documents, bool namespace_id_fingerprint, InitializeStatsProto* initialize_stats) { ICING_RETURN_ERROR_IF_NULL(filesystem); ICING_RETURN_ERROR_IF_NULL(clock); ICING_RETURN_ERROR_IF_NULL(schema_store); - auto document_store = std::unique_ptr<DocumentStore>( - new DocumentStore(filesystem, base_dir, clock, schema_store)); + auto document_store = std::unique_ptr<DocumentStore>(new DocumentStore( + filesystem, base_dir, clock, schema_store, namespace_id_fingerprint)); ICING_ASSIGN_OR_RETURN( DataLoss data_loss, document_store->Initialize(force_recovery_and_revalidate_documents, @@ -386,7 +377,8 @@ libtextclassifier3::Status DocumentStore::InitializeExistingDerivedFiles() { absl_ports::StrCat("Couldn't read: ", MakeHeaderFilename(base_dir_))); } - if (header.magic != DocumentStore::Header::kMagic) { + if (header.magic != + DocumentStore::Header::GetCurrentMagic(namespace_id_fingerprint_)) { return absl_ports::InternalError(absl_ports::StrCat( "Invalid header kMagic for file: ", MakeHeaderFilename(base_dir_))); } @@ -859,7 +851,8 @@ bool DocumentStore::HeaderExists() { libtextclassifier3::Status DocumentStore::UpdateHeader(const Crc32& checksum) { // Write the header DocumentStore::Header header; - header.magic = DocumentStore::Header::kMagic; + header.magic = + DocumentStore::Header::GetCurrentMagic(namespace_id_fingerprint_); header.checksum = checksum.Get(); // This should overwrite the header. diff --git a/icing/store/document-store.h b/icing/store/document-store.h index 3e02636..7c414d7 100644 --- a/icing/store/document-store.h +++ b/icing/store/document-store.h @@ -59,13 +59,19 @@ namespace lib { class DocumentStore { public: struct Header { - static constexpr int32_t kMagic = 0x746f7265; + static int32_t GetCurrentMagic(bool namespace_id_fingerprint) { + return namespace_id_fingerprint ? kNewMagic : kOldMagic; + } // Holds the magic as a quick sanity check against file corruption. int32_t magic; // Checksum of the DocumentStore's sub-component's checksums. uint32_t checksum; + + private: + static constexpr int32_t kOldMagic = 0x746f7265; + static constexpr int32_t kNewMagic = 0x1b99c8b0; }; struct OptimizeInfo { @@ -136,6 +142,7 @@ class DocumentStore { const Filesystem* filesystem, const std::string& base_dir, const Clock* clock, const SchemaStore* schema_store, bool force_recovery_and_revalidate_documents = false, + bool namespace_id_fingerprint = false, InitializeStatsProto* initialize_stats = nullptr); // Returns the maximum DocumentId that the DocumentStore has assigned. If @@ -472,7 +479,8 @@ class DocumentStore { private: // Use DocumentStore::Create() to instantiate. DocumentStore(const Filesystem* filesystem, std::string_view base_dir, - const Clock* clock, const SchemaStore* schema_store); + const Clock* clock, const SchemaStore* schema_store, + bool namespace_id_fingerprint); const Filesystem* const filesystem_; const std::string base_dir_; @@ -485,6 +493,10 @@ class DocumentStore { // Used to validate incoming documents DocumentValidator document_validator_; + // Whether to use namespace id or namespace name to build up fingerprint for + // document_key_mapper_ and corpus_mapper_. + bool namespace_id_fingerprint_; + // A log used to store all documents, it serves as a ground truth of doc // store. key_mapper_ and document_id_mapper_ can be regenerated from it. std::unique_ptr<PortableFileBackedProtoLog<DocumentWrapper>> document_log_; @@ -733,6 +745,13 @@ class DocumentStore { libtextclassifier3::StatusOr< google::protobuf::RepeatedPtrField<DocumentDebugInfoProto::CorpusInfo>> CollectCorpusInfo() const; + + // Build fingerprint for the keys of document_key_mapper_ and corpus_mapper_. + // Note that namespace_id_fingerprint_ controls the way that a fingerprint is + // built. + std::string MakeFingerprint(NamespaceId namespace_id, + std::string_view namespace_, + std::string_view uri_or_schema) const; }; } // namespace lib diff --git a/icing/store/document-store_test.cc b/icing/store/document-store_test.cc index a115e11..81da191 100644 --- a/icing/store/document-store_test.cc +++ b/icing/store/document-store_test.cc @@ -207,7 +207,8 @@ class DocumentStoreTest : public ::testing::Test { const std::string header_file = absl_ports::StrCat(document_store_dir_, "/document_store_header"); DocumentStore::Header header; - header.magic = DocumentStore::Header::kMagic; + header.magic = DocumentStore::Header::GetCurrentMagic( + /*namespace_id_fingerprint=*/false); header.checksum = 10; // Arbitrary garbage checksum filesystem_.DeleteFile(header_file.c_str()); filesystem_.Write(header_file.c_str(), &header, sizeof(header)); @@ -3285,10 +3286,10 @@ TEST_F(DocumentStoreTest, LoadScoreCacheAndInitializeSuccessfully) { InitializeStatsProto initialize_stats; ICING_ASSERT_OK_AND_ASSIGN( DocumentStore::CreateResult create_result, - DocumentStore::Create(&filesystem_, document_store_dir_, &fake_clock_, - schema_store_.get(), - /*force_recovery_and_revalidate_documents=*/false, - &initialize_stats)); + DocumentStore::Create( + &filesystem_, document_store_dir_, &fake_clock_, schema_store_.get(), + /*force_recovery_and_revalidate_documents=*/false, + /*namespace_id_fingerprint=*/false, &initialize_stats)); std::unique_ptr<DocumentStore> doc_store = std::move(create_result.document_store); // The document log is using the legacy v0 format so that a migration is @@ -3489,10 +3490,10 @@ TEST_F(DocumentStoreTest, InitializeForceRecoveryUpdatesTypeIds) { InitializeStatsProto initialize_stats; ICING_ASSERT_OK_AND_ASSIGN( DocumentStore::CreateResult create_result, - DocumentStore::Create(&filesystem_, document_store_dir_, &fake_clock_, - schema_store.get(), - /*force_recovery_and_revalidate_documents=*/true, - &initialize_stats)); + DocumentStore::Create( + &filesystem_, document_store_dir_, &fake_clock_, schema_store.get(), + /*force_recovery_and_revalidate_documents=*/true, + /*namespace_id_fingerprint=*/false, &initialize_stats)); std::unique_ptr<DocumentStore> doc_store = std::move(create_result.document_store); @@ -3875,10 +3876,10 @@ TEST_F(DocumentStoreTest, MigrateToPortableFileBackedProtoLog) { InitializeStatsProto initialize_stats; ICING_ASSERT_OK_AND_ASSIGN( DocumentStore::CreateResult create_result, - DocumentStore::Create(&filesystem_, document_store_dir, &fake_clock_, - schema_store.get(), - /*force_recovery_and_revalidate_documents=*/false, - &initialize_stats)); + DocumentStore::Create( + &filesystem_, document_store_dir, &fake_clock_, schema_store.get(), + /*force_recovery_and_revalidate_documents=*/false, + /*namespace_id_fingerprint=*/false, &initialize_stats)); std::unique_ptr<DocumentStore> document_store = std::move(create_result.document_store); |