diff options
author | Tim Barron <tjbarron@google.com> | 2021-01-14 20:53:07 +0000 |
---|---|---|
committer | Tim Barron <tjbarron@google.com> | 2021-01-14 20:53:07 +0000 |
commit | a34db390d80f862bfaaa49dea3605c5fec3bca3d (patch) | |
tree | 67a4a87803cf2b31619c3ddff3674967fc1461ce /icing/store | |
parent | 59c2caa38fd8dca3760dad751f4f8e5de8be25f5 (diff) | |
download | icing-a34db390d80f862bfaaa49dea3605c5fec3bca3d.tar.gz |
Update Icing from upstream.
Change-Id: I43038a59e7170fb8ecbaf6098a37221b3682ce09
Diffstat (limited to 'icing/store')
-rw-r--r-- | icing/store/corpus-associated-scoring-data.h | 79 | ||||
-rw-r--r-- | icing/store/corpus-id.h | 2 | ||||
-rw-r--r-- | icing/store/document-associated-score-data.h | 34 | ||||
-rw-r--r-- | icing/store/document-store.cc | 284 | ||||
-rw-r--r-- | icing/store/document-store.h | 77 | ||||
-rw-r--r-- | icing/store/document-store_test.cc | 396 | ||||
-rw-r--r-- | icing/store/enable-bm25f.h | 31 | ||||
-rw-r--r-- | icing/store/usage-store.cc | 4 | ||||
-rw-r--r-- | icing/store/usage-store.h | 9 | ||||
-rw-r--r-- | icing/store/usage-store_test.cc | 17 |
10 files changed, 732 insertions, 201 deletions
diff --git a/icing/store/corpus-associated-scoring-data.h b/icing/store/corpus-associated-scoring-data.h new file mode 100644 index 0000000..52be5cd --- /dev/null +++ b/icing/store/corpus-associated-scoring-data.h @@ -0,0 +1,79 @@ +// Copyright (C) 2020 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef ICING_STORE_TYPE_NAMESPACE_ASSOCIATED_SCORING_DATA_H_ +#define ICING_STORE_TYPE_NAMESPACE_ASSOCIATED_SCORING_DATA_H_ + +#include <cstdint> +#include <limits> +#include <type_traits> + +#include "icing/legacy/core/icing-packed-pod.h" + +namespace icing { +namespace lib { + +// This is the cache entity of corpus-associated scores. The ground-truth data +// is stored somewhere else. The cache includes: +// 1. Number of documents contained in the corpus. +// Positive values are required. +// 2. The sum of the documents' lengths, in number of tokens. +class CorpusAssociatedScoreData { + public: + explicit CorpusAssociatedScoreData(int num_docs = 0, + int64_t sum_length_in_tokens = 0) + : sum_length_in_tokens_(sum_length_in_tokens), num_docs_(num_docs) {} + + bool operator==(const CorpusAssociatedScoreData& other) const { + return num_docs_ == other.num_docs() && + sum_length_in_tokens_ == other.sum_length_in_tokens(); + } + + uint32_t num_docs() const { return num_docs_; } + void set_num_docs(uint32_t val) { num_docs_ = val; } + + uint64_t sum_length_in_tokens() const { return sum_length_in_tokens_; } + void set_sum_length_in_tokens(uint64_t val) { sum_length_in_tokens_ = val; } + + float average_doc_length_in_tokens() const { + return sum_length_in_tokens_ / (1.0f + num_docs_); + } + + // Adds a new document. + // Adds the document's length to the total length of the corpus, + // sum_length_in_tokens_. + void AddDocument(uint32_t doc_length_in_tokens) { + ++num_docs_; + sum_length_in_tokens_ = + (std::numeric_limits<int>::max() - doc_length_in_tokens < + sum_length_in_tokens_) + ? std::numeric_limits<int>::max() + : sum_length_in_tokens_ + doc_length_in_tokens; + } + + private: + // The sum total of the length of all documents in the corpus. + int sum_length_in_tokens_; + int num_docs_; +} __attribute__((packed)); + +static_assert(sizeof(CorpusAssociatedScoreData) == 8, + "Size of CorpusAssociatedScoreData should be 8"); +static_assert(icing_is_packed_pod<CorpusAssociatedScoreData>::value, + "go/icing-ubsan"); + +} // namespace lib +} // namespace icing + +#endif // ICING_STORE_TYPE_NAMESPACE_ASSOCIATED_SCORING_DATA_H_ diff --git a/icing/store/corpus-id.h b/icing/store/corpus-id.h index a8f21ba..01135b9 100644 --- a/icing/store/corpus-id.h +++ b/icing/store/corpus-id.h @@ -24,6 +24,8 @@ namespace lib { // DocumentProto. Generated in DocumentStore. using CorpusId = int32_t; +inline constexpr CorpusId kInvalidCorpusId = -1; + } // namespace lib } // namespace icing diff --git a/icing/store/document-associated-score-data.h b/icing/store/document-associated-score-data.h index b9039c5..9a711c8 100644 --- a/icing/store/document-associated-score-data.h +++ b/icing/store/document-associated-score-data.h @@ -19,6 +19,7 @@ #include <type_traits> #include "icing/legacy/core/icing-packed-pod.h" +#include "icing/store/corpus-id.h" namespace icing { namespace lib { @@ -26,33 +27,46 @@ namespace lib { // This is the cache entity of document-associated scores. It contains scores // that are related to the document itself. The ground-truth data is stored // somewhere else. The cache includes: -// 1. Document score. It's defined in and passed from DocumentProto.score. +// 1. Corpus Id. +// 2. Document score. It's defined in and passed from DocumentProto.score. // Positive values are required. -// 2. Document creation timestamp. Unix timestamp of when the document is +// 3. Document creation timestamp. Unix timestamp of when the document is // created and inserted into Icing. +// 4. Document length in number of tokens. class DocumentAssociatedScoreData { public: - explicit DocumentAssociatedScoreData(int document_score, - int64_t creation_timestamp_ms) - : document_score_(document_score), - creation_timestamp_ms_(creation_timestamp_ms) {} + explicit DocumentAssociatedScoreData(CorpusId corpus_id, int document_score, + int64_t creation_timestamp_ms, + int length_in_tokens = 0) + : creation_timestamp_ms_(creation_timestamp_ms), + corpus_id_(corpus_id), + document_score_(document_score), + length_in_tokens_(length_in_tokens) {} bool operator==(const DocumentAssociatedScoreData& other) const { return document_score_ == other.document_score() && - creation_timestamp_ms_ == other.creation_timestamp_ms(); + creation_timestamp_ms_ == other.creation_timestamp_ms() && + length_in_tokens_ == other.length_in_tokens() && + corpus_id_ == other.corpus_id(); } + CorpusId corpus_id() const { return corpus_id_; } + int document_score() const { return document_score_; } int64_t creation_timestamp_ms() const { return creation_timestamp_ms_; } + int length_in_tokens() const { return length_in_tokens_; } + private: - int document_score_; int64_t creation_timestamp_ms_; + CorpusId corpus_id_; + int document_score_; + int length_in_tokens_; } __attribute__((packed)); -static_assert(sizeof(DocumentAssociatedScoreData) == 12, - "Size of DocumentAssociatedScoreData should be 12"); +static_assert(sizeof(DocumentAssociatedScoreData) == 20, + "Size of DocumentAssociatedScoreData should be 20"); static_assert(icing_is_packed_pod<DocumentAssociatedScoreData>::value, "go/icing-ubsan"); diff --git a/icing/store/document-store.cc b/icing/store/document-store.cc index 6a664a3..72bf736 100644 --- a/icing/store/document-store.cc +++ b/icing/store/document-store.cc @@ -37,18 +37,20 @@ #include "icing/proto/document_wrapper.pb.h" #include "icing/proto/logging.pb.h" #include "icing/schema/schema-store.h" +#include "icing/store/corpus-associated-scoring-data.h" #include "icing/store/corpus-id.h" #include "icing/store/document-associated-score-data.h" #include "icing/store/document-filter-data.h" #include "icing/store/document-id.h" -#include "icing/store/enable-bm25f.h" #include "icing/store/key-mapper.h" #include "icing/store/namespace-id.h" +#include "icing/tokenization/language-segmenter.h" #include "icing/util/clock.h" #include "icing/util/crc32.h" #include "icing/util/data-loss.h" #include "icing/util/logging.h" #include "icing/util/status-macros.h" +#include "icing/util/tokenized-document.h" namespace icing { namespace lib { @@ -61,6 +63,7 @@ constexpr char kDocumentLogFilename[] = "document_log"; constexpr char kDocumentIdMapperFilename[] = "document_id_mapper"; constexpr char kDocumentStoreHeaderFilename[] = "document_store_header"; constexpr char kScoreCacheFilename[] = "score_cache"; +constexpr char kCorpusScoreCache[] = "corpus_score_cache"; constexpr char kFilterCacheFilename[] = "filter_cache"; constexpr char kNamespaceMapperFilename[] = "namespace_mapper"; constexpr char kUsageStoreDirectoryName[] = "usage_store"; @@ -122,6 +125,10 @@ std::string MakeScoreCacheFilename(const std::string& base_dir) { return absl_ports::StrCat(base_dir, "/", kScoreCacheFilename); } +std::string MakeCorpusScoreCache(const std::string& base_dir) { + return absl_ports::StrCat(base_dir, "/", kCorpusScoreCache); +} + std::string MakeFilterCacheFilename(const std::string& base_dir) { return absl_ports::StrCat(base_dir, "/", kFilterCacheFilename); } @@ -195,8 +202,16 @@ DocumentStore::DocumentStore(const Filesystem* filesystem, document_validator_(schema_store) {} libtextclassifier3::StatusOr<DocumentId> DocumentStore::Put( - const DocumentProto& document, NativePutDocumentStats* put_document_stats) { - return Put(DocumentProto(document), put_document_stats); + const DocumentProto& document, int32_t num_tokens, + NativePutDocumentStats* put_document_stats) { + return Put(DocumentProto(document), num_tokens, put_document_stats); +} + +libtextclassifier3::StatusOr<DocumentId> DocumentStore::Put( + DocumentProto&& document, int32_t num_tokens, + NativePutDocumentStats* put_document_stats) { + document.mutable_internal_fields()->set_length_in_tokens(num_tokens); + return InternalPut(document, put_document_stats); } DocumentStore::~DocumentStore() { @@ -366,12 +381,15 @@ libtextclassifier3::Status DocumentStore::InitializeDerivedFiles() { usage_store_, UsageStore::Create(filesystem_, MakeUsageStoreDirectoryName(base_dir_))); - if (enableBm25f()) { - ICING_ASSIGN_OR_RETURN( - corpus_mapper_, KeyMapper<CorpusId>::Create( - *filesystem_, MakeCorpusMapperFilename(base_dir_), - kCorpusMapperMaxSize)); - } + ICING_ASSIGN_OR_RETURN(corpus_mapper_, + KeyMapper<CorpusId>::Create( + *filesystem_, MakeCorpusMapperFilename(base_dir_), + kCorpusMapperMaxSize)); + + ICING_ASSIGN_OR_RETURN(corpus_score_cache_, + FileBackedVector<CorpusAssociatedScoreData>::Create( + *filesystem_, MakeCorpusScoreCache(base_dir_), + MemoryMappedFile::READ_WRITE_AUTO_SYNC)); // Ensure the usage store is the correct size. ICING_RETURN_IF_ERROR( @@ -392,9 +410,8 @@ libtextclassifier3::Status DocumentStore::RegenerateDerivedFiles() { ICING_RETURN_IF_ERROR(ResetDocumentAssociatedScoreCache()); ICING_RETURN_IF_ERROR(ResetFilterCache()); ICING_RETURN_IF_ERROR(ResetNamespaceMapper()); - if (enableBm25f()) { - ICING_RETURN_IF_ERROR(ResetCorpusMapper()); - } + ICING_RETURN_IF_ERROR(ResetCorpusMapper()); + ICING_RETURN_IF_ERROR(ResetCorpusAssociatedScoreCache()); // Creates a new UsageStore instance. Note that we don't reset the data in // usage store here because we're not able to regenerate the usage scores. @@ -506,12 +523,6 @@ libtextclassifier3::Status DocumentStore::RegenerateDerivedFiles() { ICING_RETURN_IF_ERROR( document_id_mapper_->Set(new_document_id, iterator.GetOffset())); - ICING_RETURN_IF_ERROR(UpdateDocumentAssociatedScoreCache( - new_document_id, - DocumentAssociatedScoreData( - document_wrapper.document().score(), - document_wrapper.document().creation_timestamp_ms()))); - SchemaTypeId schema_type_id; auto schema_type_id_or = schema_store_->GetSchemaTypeId(document_wrapper.document().schema()); @@ -536,13 +547,30 @@ libtextclassifier3::Status DocumentStore::RegenerateDerivedFiles() { namespace_mapper_->GetOrPut(document_wrapper.document().namespace_(), namespace_mapper_->num_keys())); - if (enableBm25f()) { - // Update corpus maps - std::string corpus = - MakeFingerprint(document_wrapper.document().namespace_(), - document_wrapper.document().schema()); - corpus_mapper_->GetOrPut(corpus, corpus_mapper_->num_keys()); - } + // Update corpus maps + std::string corpus = + MakeFingerprint(document_wrapper.document().namespace_(), + document_wrapper.document().schema()); + ICING_ASSIGN_OR_RETURN( + CorpusId corpusId, + corpus_mapper_->GetOrPut(corpus, corpus_mapper_->num_keys())); + + ICING_ASSIGN_OR_RETURN(CorpusAssociatedScoreData scoring_data, + GetCorpusAssociatedScoreDataToUpdate(corpusId)); + scoring_data.AddDocument( + document_wrapper.document().internal_fields().length_in_tokens()); + + ICING_RETURN_IF_ERROR( + UpdateCorpusAssociatedScoreCache(corpusId, scoring_data)); + + ICING_RETURN_IF_ERROR(UpdateDocumentAssociatedScoreCache( + new_document_id, + DocumentAssociatedScoreData( + corpusId, document_wrapper.document().score(), + document_wrapper.document().creation_timestamp_ms(), + document_wrapper.document() + .internal_fields() + .length_in_tokens()))); int64_t expiration_timestamp_ms = CalculateExpirationTimestampMs( document_wrapper.document().creation_timestamp_ms(), @@ -638,6 +666,18 @@ libtextclassifier3::Status DocumentStore::ResetDocumentAssociatedScoreCache() { return libtextclassifier3::Status::OK; } +libtextclassifier3::Status DocumentStore::ResetCorpusAssociatedScoreCache() { + // TODO(b/139734457): Replace ptr.reset()->Delete->Create flow with Reset(). + corpus_score_cache_.reset(); + ICING_RETURN_IF_ERROR(FileBackedVector<CorpusAssociatedScoreData>::Delete( + *filesystem_, MakeCorpusScoreCache(base_dir_))); + ICING_ASSIGN_OR_RETURN(corpus_score_cache_, + FileBackedVector<CorpusAssociatedScoreData>::Create( + *filesystem_, MakeCorpusScoreCache(base_dir_), + MemoryMappedFile::READ_WRITE_AUTO_SYNC)); + return libtextclassifier3::Status::OK; +} + libtextclassifier3::Status DocumentStore::ResetFilterCache() { // TODO(b/139734457): Replace ptr.reset()->Delete->Create flow with Reset(). filter_cache_.reset(); @@ -671,23 +711,21 @@ libtextclassifier3::Status DocumentStore::ResetNamespaceMapper() { } libtextclassifier3::Status DocumentStore::ResetCorpusMapper() { - if (enableBm25f()) { - // TODO(b/139734457): Replace ptr.reset()->Delete->Create flow with Reset(). - corpus_mapper_.reset(); - // TODO(b/144458732): Implement a more robust version of TC_RETURN_IF_ERROR - // that can support error logging. - libtextclassifier3::Status status = KeyMapper<CorpusId>::Delete( - *filesystem_, MakeCorpusMapperFilename(base_dir_)); - if (!status.ok()) { - ICING_LOG(ERROR) << status.error_message() - << "Failed to delete old corpus_id mapper"; - return status; - } - ICING_ASSIGN_OR_RETURN( - corpus_mapper_, KeyMapper<CorpusId>::Create( - *filesystem_, MakeCorpusMapperFilename(base_dir_), - kCorpusMapperMaxSize)); + // TODO(b/139734457): Replace ptr.reset()->Delete->Create flow with Reset(). + corpus_mapper_.reset(); + // TODO(b/144458732): Implement a more robust version of TC_RETURN_IF_ERROR + // that can support error logging. + libtextclassifier3::Status status = KeyMapper<CorpusId>::Delete( + *filesystem_, MakeCorpusMapperFilename(base_dir_)); + if (!status.ok()) { + ICING_LOG(ERROR) << status.error_message() + << "Failed to delete old corpus_id mapper"; + return status; } + ICING_ASSIGN_OR_RETURN(corpus_mapper_, + KeyMapper<CorpusId>::Create( + *filesystem_, MakeCorpusMapperFilename(base_dir_), + kCorpusMapperMaxSize)); return libtextclassifier3::Status::OK; } @@ -738,16 +776,26 @@ libtextclassifier3::StatusOr<Crc32> DocumentStore::ComputeChecksum() const { Crc32 namespace_mapper_checksum = namespace_mapper_->ComputeChecksum(); + Crc32 corpus_mapper_checksum = corpus_mapper_->ComputeChecksum(); + + // TODO(b/144458732): Implement a more robust version of TC_ASSIGN_OR_RETURN + // that can support error logging. + checksum_or = corpus_score_cache_->ComputeChecksum(); + if (!checksum_or.ok()) { + ICING_LOG(WARNING) << checksum_or.status().error_message() + << "Failed to compute checksum of score cache"; + return checksum_or.status(); + } + Crc32 corpus_score_cache_checksum = std::move(checksum_or).ValueOrDie(); + total_checksum.Append(std::to_string(document_log_checksum.Get())); total_checksum.Append(std::to_string(document_key_mapper_checksum.Get())); total_checksum.Append(std::to_string(document_id_mapper_checksum.Get())); total_checksum.Append(std::to_string(score_cache_checksum.Get())); total_checksum.Append(std::to_string(filter_cache_checksum.Get())); total_checksum.Append(std::to_string(namespace_mapper_checksum.Get())); - if (enableBm25f()) { - Crc32 corpus_mapper_checksum = corpus_mapper_->ComputeChecksum(); - total_checksum.Append(std::to_string(corpus_mapper_checksum.Get())); - } + total_checksum.Append(std::to_string(corpus_mapper_checksum.Get())); + total_checksum.Append(std::to_string(corpus_score_cache_checksum.Get())); return total_checksum; } @@ -779,8 +827,8 @@ libtextclassifier3::Status DocumentStore::UpdateHeader(const Crc32& checksum) { return libtextclassifier3::Status::OK; } -libtextclassifier3::StatusOr<DocumentId> DocumentStore::Put( - DocumentProto&& document, NativePutDocumentStats* put_document_stats) { +libtextclassifier3::StatusOr<DocumentId> DocumentStore::InternalPut( + DocumentProto& document, NativePutDocumentStats* put_document_stats) { std::unique_ptr<Timer> put_timer = clock_.GetNewTimer(); ICING_RETURN_IF_ERROR(document_validator_.Validate(document)); @@ -793,6 +841,7 @@ libtextclassifier3::StatusOr<DocumentId> DocumentStore::Put( std::string uri = document.uri(); std::string schema = document.schema(); int document_score = document.score(); + int32_t length_in_tokens = document.internal_fields().length_in_tokens(); int64_t creation_timestamp_ms = document.creation_timestamp_ms(); // Sets the creation timestamp if caller hasn't specified. @@ -829,20 +878,28 @@ libtextclassifier3::StatusOr<DocumentId> DocumentStore::Put( MakeFingerprint(name_space, uri), new_document_id)); ICING_RETURN_IF_ERROR(document_id_mapper_->Set(new_document_id, file_offset)); - ICING_RETURN_IF_ERROR(UpdateDocumentAssociatedScoreCache( - new_document_id, - DocumentAssociatedScoreData(document_score, creation_timestamp_ms))); - // Update namespace maps ICING_ASSIGN_OR_RETURN( NamespaceId namespace_id, namespace_mapper_->GetOrPut(name_space, namespace_mapper_->num_keys())); - if (enableBm25f()) { - // Update corpus maps - ICING_RETURN_IF_ERROR(corpus_mapper_->GetOrPut( - MakeFingerprint(name_space, schema), corpus_mapper_->num_keys())); - } + // Update corpus maps + ICING_ASSIGN_OR_RETURN( + CorpusId corpusId, + corpus_mapper_->GetOrPut(MakeFingerprint(name_space, schema), + corpus_mapper_->num_keys())); + + ICING_ASSIGN_OR_RETURN(CorpusAssociatedScoreData scoring_data, + GetCorpusAssociatedScoreDataToUpdate(corpusId)); + scoring_data.AddDocument(length_in_tokens); + + ICING_RETURN_IF_ERROR( + UpdateCorpusAssociatedScoreCache(corpusId, scoring_data)); + + ICING_RETURN_IF_ERROR(UpdateDocumentAssociatedScoreCache( + new_document_id, + DocumentAssociatedScoreData(corpusId, document_score, + creation_timestamp_ms, length_in_tokens))); ICING_ASSIGN_OR_RETURN(SchemaTypeId schema_type_id, schema_store_->GetSchemaTypeId(schema)); @@ -876,7 +933,8 @@ libtextclassifier3::StatusOr<DocumentId> DocumentStore::Put( } libtextclassifier3::StatusOr<DocumentProto> DocumentStore::Get( - const std::string_view name_space, const std::string_view uri) const { + const std::string_view name_space, const std::string_view uri, + bool clear_internal_fields) const { // TODO(b/147231617): Make a better way to replace the error message in an // existing Status. auto document_id_or = GetDocumentId(name_space, uri); @@ -903,7 +961,7 @@ libtextclassifier3::StatusOr<DocumentProto> DocumentStore::Get( } libtextclassifier3::StatusOr<DocumentProto> DocumentStore::Get( - DocumentId document_id) const { + DocumentId document_id, bool clear_internal_fields) const { ICING_ASSIGN_OR_RETURN(int64_t document_log_offset, DoesDocumentExistAndGetFileOffset(document_id)); @@ -917,6 +975,9 @@ libtextclassifier3::StatusOr<DocumentProto> DocumentStore::Get( } DocumentWrapper document_wrapper = std::move(document_wrapper_or).ValueOrDie(); + if (clear_internal_fields) { + document_wrapper.mutable_document()->clear_internal_fields(); + } return std::move(*document_wrapper.mutable_document()); } @@ -1088,10 +1149,7 @@ libtextclassifier3::StatusOr<NamespaceId> DocumentStore::GetNamespaceId( libtextclassifier3::StatusOr<CorpusId> DocumentStore::GetCorpusId( const std::string_view name_space, const std::string_view schema) const { - if (enableBm25f()) { - return corpus_mapper_->Get(MakeFingerprint(name_space, schema)); - } - return absl_ports::NotFoundError("corpus_mapper disabled"); + return corpus_mapper_->Get(MakeFingerprint(name_space, schema)); } libtextclassifier3::StatusOr<DocumentAssociatedScoreData> @@ -1112,6 +1170,34 @@ DocumentStore::GetDocumentAssociatedScoreData(DocumentId document_id) const { return document_associated_score_data; } +libtextclassifier3::StatusOr<CorpusAssociatedScoreData> +DocumentStore::GetCorpusAssociatedScoreData(CorpusId corpus_id) const { + auto score_data_or = corpus_score_cache_->Get(corpus_id); + if (!score_data_or.ok()) { + return score_data_or.status(); + } + + CorpusAssociatedScoreData corpus_associated_score_data = + *std::move(score_data_or).ValueOrDie(); + return corpus_associated_score_data; +} + +libtextclassifier3::StatusOr<CorpusAssociatedScoreData> +DocumentStore::GetCorpusAssociatedScoreDataToUpdate(CorpusId corpus_id) const { + auto corpus_scoring_data_or = GetCorpusAssociatedScoreData(corpus_id); + if (corpus_scoring_data_or.ok()) { + return std::move(corpus_scoring_data_or).ValueOrDie(); + } + CorpusAssociatedScoreData scoringData; + // OUT_OF_RANGE is the StatusCode returned when a corpus id is added to + // corpus_score_cache_ for the first time. + if (corpus_scoring_data_or.status().CanonicalCode() == + libtextclassifier3::StatusCode::OUT_OF_RANGE) { + return scoringData; + } + return corpus_scoring_data_or.status(); +} + libtextclassifier3::StatusOr<DocumentFilterData> DocumentStore::GetDocumentFilterData(DocumentId document_id) const { auto filter_data_or = filter_cache_->Get(document_id); @@ -1308,10 +1394,8 @@ libtextclassifier3::Status DocumentStore::PersistToDisk() { ICING_RETURN_IF_ERROR(filter_cache_->PersistToDisk()); ICING_RETURN_IF_ERROR(namespace_mapper_->PersistToDisk()); ICING_RETURN_IF_ERROR(usage_store_->PersistToDisk()); - - if (enableBm25f()) { - ICING_RETURN_IF_ERROR(corpus_mapper_->PersistToDisk()); - } + ICING_RETURN_IF_ERROR(corpus_mapper_->PersistToDisk()); + ICING_RETURN_IF_ERROR(corpus_score_cache_->PersistToDisk()); // Update the combined checksum and write to header file. ICING_ASSIGN_OR_RETURN(Crc32 checksum, ComputeChecksum()); @@ -1333,16 +1417,16 @@ libtextclassifier3::StatusOr<int64_t> DocumentStore::GetDiskUsage() const { filter_cache_->GetDiskUsage()); ICING_ASSIGN_OR_RETURN(const int64_t namespace_mapper_disk_usage, namespace_mapper_->GetDiskUsage()); + ICING_ASSIGN_OR_RETURN(const int64_t corpus_mapper_disk_usage, + corpus_mapper_->GetDiskUsage()); + ICING_ASSIGN_OR_RETURN(const int64_t corpus_score_cache_disk_usage, + corpus_score_cache_->GetDiskUsage()); int64_t disk_usage = document_log_disk_usage + document_key_mapper_disk_usage + document_id_mapper_disk_usage + score_cache_disk_usage + - filter_cache_disk_usage + namespace_mapper_disk_usage; - if (enableBm25f()) { - ICING_ASSIGN_OR_RETURN(const int64_t corpus_mapper_disk_usage, - corpus_mapper_->GetDiskUsage()); - disk_usage += corpus_mapper_disk_usage; - } + filter_cache_disk_usage + namespace_mapper_disk_usage + + corpus_mapper_disk_usage + corpus_score_cache_disk_usage; return disk_usage; } @@ -1493,7 +1577,7 @@ libtextclassifier3::Status DocumentStore::Optimize() { } libtextclassifier3::Status DocumentStore::OptimizeInto( - const std::string& new_directory) { + const std::string& new_directory, const LanguageSegmenter* lang_segmenter) { // Validates directory if (new_directory == base_dir_) { return absl_ports::InvalidArgumentError( @@ -1509,7 +1593,7 @@ libtextclassifier3::Status DocumentStore::OptimizeInto( // Writes all valid docs into new document store (new directory) int size = document_id_mapper_->num_elements(); for (DocumentId document_id = 0; document_id < size; document_id++) { - auto document_or = Get(document_id); + auto document_or = Get(document_id, /*clear_internal_fields=*/false); if (absl_ports::IsNotFound(document_or.status())) { // Skip nonexistent documents continue; @@ -1523,9 +1607,26 @@ libtextclassifier3::Status DocumentStore::OptimizeInto( // Guaranteed to have a document now. DocumentProto document_to_keep = document_or.ValueOrDie(); - // TODO(b/144458732): Implement a more robust version of TC_ASSIGN_OR_RETURN - // that can support error logging. - auto new_document_id_or = new_doc_store->Put(std::move(document_to_keep)); + + libtextclassifier3::StatusOr<DocumentId> new_document_id_or; + if (document_to_keep.internal_fields().length_in_tokens() == 0) { + auto tokenized_document_or = TokenizedDocument::Create( + schema_store_, lang_segmenter, document_to_keep); + if (!tokenized_document_or.ok()) { + return absl_ports::Annotate( + tokenized_document_or.status(), + IcingStringUtil::StringPrintf( + "Failed to tokenize Document for DocumentId %d", document_id)); + } + TokenizedDocument tokenized_document( + std::move(tokenized_document_or).ValueOrDie()); + new_document_id_or = + new_doc_store->Put(document_to_keep, tokenized_document.num_tokens()); + } else { + // TODO(b/144458732): Implement a more robust version of + // TC_ASSIGN_OR_RETURN that can support error logging. + new_document_id_or = new_doc_store->InternalPut(document_to_keep); + } if (!new_document_id_or.ok()) { ICING_LOG(ERROR) << new_document_id_or.status().error_message() << "Failed to write into new document store"; @@ -1577,26 +1678,39 @@ DocumentStore::GetOptimizeInfo() const { score_cache_->GetElementsFileSize()); ICING_ASSIGN_OR_RETURN(const int64_t filter_cache_file_size, filter_cache_->GetElementsFileSize()); + ICING_ASSIGN_OR_RETURN(const int64_t corpus_score_cache_file_size, + corpus_score_cache_->GetElementsFileSize()); + + // Usage store might be sparse, but we'll still use file size for more + // accurate counting. + ICING_ASSIGN_OR_RETURN(const int64_t usage_store_file_size, + usage_store_->GetElementsFileSize()); // We use a combined disk usage and file size for the KeyMapper because it's // backed by a trie, which has some sparse property bitmaps. ICING_ASSIGN_OR_RETURN(const int64_t document_key_mapper_size, document_key_mapper_->GetElementsSize()); - // We don't include the namespace mapper because it's not clear if we could - // recover any space even if Optimize were called. Deleting 100s of documents - // could still leave a few documents of a namespace, and then there would be - // no change. + // We don't include the namespace_mapper or the corpus_mapper because it's not + // clear if we could recover any space even if Optimize were called. Deleting + // 100s of documents could still leave a few documents of a namespace, and + // then there would be no change. int64_t total_size = document_log_file_size + document_key_mapper_size + document_id_mapper_file_size + score_cache_file_size + - filter_cache_file_size; + filter_cache_file_size + corpus_score_cache_file_size + + usage_store_file_size; optimize_info.estimated_optimizable_bytes = total_size * optimize_info.optimizable_docs / optimize_info.total_docs; return optimize_info; } +libtextclassifier3::Status DocumentStore::UpdateCorpusAssociatedScoreCache( + CorpusId corpus_id, const CorpusAssociatedScoreData& score_data) { + return corpus_score_cache_->Set(corpus_id, score_data); +} + libtextclassifier3::Status DocumentStore::UpdateDocumentAssociatedScoreCache( DocumentId document_id, const DocumentAssociatedScoreData& score_data) { return score_cache_->Set(document_id, score_data); @@ -1617,8 +1731,10 @@ libtextclassifier3::Status DocumentStore::ClearDerivedData( // Resets the score cache entry ICING_RETURN_IF_ERROR(UpdateDocumentAssociatedScoreCache( - document_id, DocumentAssociatedScoreData(/*document_score=*/-1, - /*creation_timestamp_ms=*/-1))); + document_id, DocumentAssociatedScoreData(kInvalidCorpusId, + /*document_score=*/-1, + /*creation_timestamp_ms=*/-1, + /*length_in_tokens=*/0))); // Resets the filter cache entry ICING_RETURN_IF_ERROR(UpdateFilterCache( diff --git a/icing/store/document-store.h b/icing/store/document-store.h index 78590a5..b2908f0 100644 --- a/icing/store/document-store.h +++ b/icing/store/document-store.h @@ -30,6 +30,7 @@ #include "icing/proto/document_wrapper.pb.h" #include "icing/proto/logging.pb.h" #include "icing/schema/schema-store.h" +#include "icing/store/corpus-associated-scoring-data.h" #include "icing/store/corpus-id.h" #include "icing/store/document-associated-score-data.h" #include "icing/store/document-filter-data.h" @@ -37,6 +38,7 @@ #include "icing/store/key-mapper.h" #include "icing/store/namespace-id.h" #include "icing/store/usage-store.h" +#include "icing/tokenization/language-segmenter.h" #include "icing/util/clock.h" #include "icing/util/crc32.h" #include "icing/util/data-loss.h" @@ -149,23 +151,27 @@ class DocumentStore { // exist in schema // INTERNAL_ERROR on IO error libtextclassifier3::StatusOr<DocumentId> Put( - const DocumentProto& document, + const DocumentProto& document, int32_t num_tokens = 0, NativePutDocumentStats* put_document_stats = nullptr); libtextclassifier3::StatusOr<DocumentId> Put( - DocumentProto&& document, + DocumentProto&& document, int32_t num_tokens = 0, NativePutDocumentStats* put_document_stats = nullptr); // Finds and returns the document identified by the given key (namespace + - // uri) + // uri). If 'clear_internal_fields' is true, document level data that's + // generated internally by DocumentStore is cleared. // // Returns: // The document found on success // NOT_FOUND if the key doesn't exist or document has been deleted // INTERNAL_ERROR on IO error - libtextclassifier3::StatusOr<DocumentProto> Get(std::string_view name_space, - std::string_view uri) const; + libtextclassifier3::StatusOr<DocumentProto> Get( + std::string_view name_space, std::string_view uri, + bool clear_internal_fields = true) const; - // Finds and returns the document identified by the given document id + // Finds and returns the document identified by the given document id. If + // 'clear_internal_fields' is true, document level data that's generated + // internally by DocumentStore is cleared. // // Returns: // The document found on success @@ -173,7 +179,8 @@ class DocumentStore { // maximum value // NOT_FOUND if the document doesn't exist or has been deleted // INTERNAL_ERROR on IO error - libtextclassifier3::StatusOr<DocumentProto> Get(DocumentId document_id) const; + libtextclassifier3::StatusOr<DocumentProto> Get( + DocumentId document_id, bool clear_internal_fields = true) const; // Returns all namespaces which have at least 1 active document (not deleted // or expired). Order of namespaces is undefined. @@ -256,6 +263,20 @@ class DocumentStore { libtextclassifier3::StatusOr<DocumentAssociatedScoreData> GetDocumentAssociatedScoreData(DocumentId document_id) const; + // Returns the CorpusAssociatedScoreData of the corpus specified by the + // corpus_id. + // + // NOTE: This does not check if the corpus exists and will return the + // CorpusAssociatedScoreData of the corpus even if all documents belonging to + // that corpus have been deleted. + // + // Returns: + // CorpusAssociatedScoreData on success + // OUT_OF_RANGE if corpus_id is negative or exceeds previously seen + // CorpusIds + libtextclassifier3::StatusOr<CorpusAssociatedScoreData> + GetCorpusAssociatedScoreData(CorpusId corpus_id) const; + // Returns the DocumentFilterData of the document specified by the DocumentId. // // NOTE: This does not check if the document exists and will return the @@ -394,7 +415,9 @@ class DocumentStore { // OK on success // INVALID_ARGUMENT if new_directory is same as current base directory // INTERNAL_ERROR on IO error - libtextclassifier3::Status OptimizeInto(const std::string& new_directory); + libtextclassifier3::Status OptimizeInto( + const std::string& new_directory, + const LanguageSegmenter* lang_segmenter); // Calculates status for a potential Optimize call. Includes how many docs // there are vs how many would be optimized away. And also includes an @@ -441,8 +464,10 @@ class DocumentStore { // A cache of document associated scores. The ground truth of the scores is // DocumentProto stored in document_log_. This cache contains: + // - CorpusId // - Document score // - Document creation timestamp in seconds + // - Document length in number of tokens std::unique_ptr<FileBackedVector<DocumentAssociatedScoreData>> score_cache_; // A cache of data, indexed by DocumentId, used to filter documents. Currently @@ -452,6 +477,13 @@ class DocumentStore { // - Expiration timestamp in seconds std::unique_ptr<FileBackedVector<DocumentFilterData>> filter_cache_; + // A cache of corpus associated scores. The ground truth of the scores is + // DocumentProto stored in document_log_. This cache contains: + // - Number of documents belonging to the corpus score + // - The sum of the documents' lengths, in number of tokens. + std::unique_ptr<FileBackedVector<CorpusAssociatedScoreData>> + corpus_score_cache_; + // Maps namespaces to a densely-assigned unique id. Namespaces are assigned an // id when the first document belonging to that namespace is added to the // DocumentStore. Namespaces may be removed from the mapper during compaction. @@ -516,6 +548,12 @@ class DocumentStore { // Returns OK or any IO errors. libtextclassifier3::Status ResetDocumentAssociatedScoreCache(); + // Resets the unique_ptr to the corpus_score_cache, deletes the underlying + // file, and re-creates a new instance of the corpus_score_cache. + // + // Returns OK or any IO errors. + libtextclassifier3::Status ResetCorpusAssociatedScoreCache(); + // Resets the unique_ptr to the filter_cache, deletes the underlying file, and // re-creates a new instance of the filter_cache. // @@ -546,6 +584,10 @@ class DocumentStore { // INTERNAL on I/O error libtextclassifier3::Status UpdateHeader(const Crc32& checksum); + libtextclassifier3::StatusOr<DocumentId> InternalPut( + DocumentProto& document, + NativePutDocumentStats* put_document_stats = nullptr); + // Helper function to do batch deletes. Documents with the given // "namespace_id" and "schema_type_id" will be deleted. If callers don't need // to specify the namespace or schema type, pass in kInvalidNamespaceId or @@ -597,6 +639,21 @@ class DocumentStore { libtextclassifier3::StatusOr<DocumentId> GetDocumentId( std::string_view name_space, std::string_view uri) const; + // Returns the CorpusAssociatedScoreData of the corpus specified by the + // corpus_id. + // + // If the corpus_id has never been seen before, it returns a + // CorpusAssociatedScoreData with properties set to default values. + // + // NOTE: This does not check if the corpus exists and will return the + // CorpusAssociatedScoreData of the corpus even if all documents belonging to + // that corpus have been deleted. + // + // Returns: + // CorpusAssociatedScoreData on success + libtextclassifier3::StatusOr<CorpusAssociatedScoreData> + GetCorpusAssociatedScoreDataToUpdate(CorpusId corpus_id) const; + // Helper method to validate the document id and return the file offset of the // associated document in document_log_. // @@ -617,6 +674,10 @@ class DocumentStore { libtextclassifier3::Status UpdateDocumentAssociatedScoreCache( DocumentId document_id, const DocumentAssociatedScoreData& score_data); + // Updates the entry in the corpus score cache for corpus_id. + libtextclassifier3::Status UpdateCorpusAssociatedScoreCache( + CorpusId corpus_id, const CorpusAssociatedScoreData& score_data); + // Updates the entry in the filter cache for document_id. libtextclassifier3::Status UpdateFilterCache( DocumentId document_id, const DocumentFilterData& filter_data); diff --git a/icing/store/document-store_test.cc b/icing/store/document-store_test.cc index 29bf8bb..7754373 100644 --- a/icing/store/document-store_test.cc +++ b/icing/store/document-store_test.cc @@ -27,20 +27,25 @@ #include "icing/file/filesystem.h" #include "icing/file/memory-mapped-file.h" #include "icing/file/mock-filesystem.h" +#include "icing/helpers/icu/icu-data-file-helper.h" #include "icing/portable/equals-proto.h" #include "icing/proto/document.pb.h" #include "icing/proto/schema.pb.h" #include "icing/schema/schema-store.h" +#include "icing/store/corpus-associated-scoring-data.h" +#include "icing/store/corpus-id.h" #include "icing/store/document-filter-data.h" #include "icing/store/document-id.h" -#include "icing/store/enable-bm25f.h" #include "icing/store/namespace-id.h" #include "icing/testing/common-matchers.h" #include "icing/testing/fake-clock.h" #include "icing/testing/platform.h" #include "icing/testing/test-data.h" #include "icing/testing/tmp-directory.h" +#include "icing/tokenization/language-segmenter-factory.h" +#include "icing/tokenization/language-segmenter.h" #include "icing/util/crc32.h" +#include "unicode/uloc.h" namespace icing { namespace lib { @@ -101,7 +106,19 @@ class DocumentStoreTest : public ::testing::Test { } void SetUp() override { - setEnableBm25f(true); + if (!IsCfStringTokenization() && !IsReverseJniTokenization()) { + // If we've specified using the reverse-JNI method for segmentation (i.e. + // not ICU), then we won't have the ICU data file included to set up. + // Technically, we could choose to use reverse-JNI for segmentation AND + // include an ICU data file, but that seems unlikely and our current BUILD + // setup doesn't do this. + // File generated via icu_data_file rule in //icing/BUILD. + std::string icu_data_file_path = + GetTestFilePath("icing/icu.dat"); + ICING_ASSERT_OK( + icu_data_file_helper::SetUpICUDataFile(icu_data_file_path)); + } + filesystem_.DeleteDirectoryRecursively(test_dir_.c_str()); filesystem_.CreateDirectoryRecursively(test_dir_.c_str()); filesystem_.CreateDirectoryRecursively(document_store_dir_.c_str()); @@ -133,6 +150,11 @@ class DocumentStoreTest : public ::testing::Test { schema_store_, SchemaStore::Create(&filesystem_, schema_store_dir_, &fake_clock_)); ASSERT_THAT(schema_store_->SetSchema(schema), IsOk()); + + language_segmenter_factory::SegmenterOptions segmenter_options(ULOC_US); + ICING_ASSERT_OK_AND_ASSIGN( + lang_segmenter_, + language_segmenter_factory::Create(std::move(segmenter_options))); } void TearDown() override { @@ -147,6 +169,7 @@ class DocumentStoreTest : public ::testing::Test { DocumentProto test_document1_; DocumentProto test_document2_; std::unique_ptr<SchemaStore> schema_store_; + std::unique_ptr<LanguageSegmenter> lang_segmenter_; // Document1 values const int document1_score_ = 1; @@ -1184,9 +1207,10 @@ TEST_F(DocumentStoreTest, OptimizeInto) { filesystem_.GetFileSize(original_document_log.c_str()); // Optimizing into the same directory is not allowed - EXPECT_THAT(doc_store->OptimizeInto(document_store_dir_), - StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT, - HasSubstr("directory is the same"))); + EXPECT_THAT( + doc_store->OptimizeInto(document_store_dir_, lang_segmenter_.get()), + StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT, + HasSubstr("directory is the same"))); std::string optimized_dir = document_store_dir_ + "_optimize"; std::string optimized_document_log = optimized_dir + "/document_log"; @@ -1195,7 +1219,8 @@ TEST_F(DocumentStoreTest, OptimizeInto) { // deleted ASSERT_TRUE(filesystem_.DeleteDirectoryRecursively(optimized_dir.c_str())); ASSERT_TRUE(filesystem_.CreateDirectoryRecursively(optimized_dir.c_str())); - ICING_ASSERT_OK(doc_store->OptimizeInto(optimized_dir)); + ICING_ASSERT_OK( + doc_store->OptimizeInto(optimized_dir, lang_segmenter_.get())); int64_t optimized_size1 = filesystem_.GetFileSize(optimized_document_log.c_str()); EXPECT_EQ(original_size, optimized_size1); @@ -1205,7 +1230,8 @@ TEST_F(DocumentStoreTest, OptimizeInto) { ASSERT_TRUE(filesystem_.DeleteDirectoryRecursively(optimized_dir.c_str())); ASSERT_TRUE(filesystem_.CreateDirectoryRecursively(optimized_dir.c_str())); ICING_ASSERT_OK(doc_store->Delete("namespace", "uri1")); - ICING_ASSERT_OK(doc_store->OptimizeInto(optimized_dir)); + ICING_ASSERT_OK( + doc_store->OptimizeInto(optimized_dir, lang_segmenter_.get())); int64_t optimized_size2 = filesystem_.GetFileSize(optimized_document_log.c_str()); EXPECT_THAT(original_size, Gt(optimized_size2)); @@ -1218,7 +1244,8 @@ TEST_F(DocumentStoreTest, OptimizeInto) { // expired ASSERT_TRUE(filesystem_.DeleteDirectoryRecursively(optimized_dir.c_str())); ASSERT_TRUE(filesystem_.CreateDirectoryRecursively(optimized_dir.c_str())); - ICING_ASSERT_OK(doc_store->OptimizeInto(optimized_dir)); + ICING_ASSERT_OK( + doc_store->OptimizeInto(optimized_dir, lang_segmenter_.get())); int64_t optimized_size3 = filesystem_.GetFileSize(optimized_document_log.c_str()); EXPECT_THAT(optimized_size2, Gt(optimized_size3)); @@ -1235,14 +1262,32 @@ TEST_F(DocumentStoreTest, ShouldRecoverFromDataLoss) { std::unique_ptr<DocumentStore> doc_store = std::move(create_result.document_store); - ICING_ASSERT_OK_AND_ASSIGN(document_id1, - doc_store->Put(DocumentProto(test_document1_))); - ICING_ASSERT_OK_AND_ASSIGN(document_id2, - doc_store->Put(DocumentProto(test_document2_))); + ICING_ASSERT_OK_AND_ASSIGN( + document_id1, + doc_store->Put(DocumentProto(test_document1_), /*num_tokens=*/4)); + ICING_ASSERT_OK_AND_ASSIGN( + document_id2, + doc_store->Put(DocumentProto(test_document2_), /*num_tokens=*/4)); EXPECT_THAT(doc_store->Get(document_id1), IsOkAndHolds(EqualsProto(test_document1_))); EXPECT_THAT(doc_store->Get(document_id2), IsOkAndHolds(EqualsProto(test_document2_))); + // Checks derived score cache + EXPECT_THAT( + doc_store->GetDocumentAssociatedScoreData(document_id1), + IsOkAndHolds(DocumentAssociatedScoreData( + /*corpus_id=*/0, document1_score_, document1_creation_timestamp_, + /*length_in_tokens=*/4))); + EXPECT_THAT( + doc_store->GetDocumentAssociatedScoreData(document_id2), + IsOkAndHolds(DocumentAssociatedScoreData( + /*corpus_id=*/0, document2_score_, document2_creation_timestamp_, + /*length_in_tokens=*/4))); + EXPECT_THAT(doc_store->GetCorpusAssociatedScoreData(/*corpus_id=*/0), + IsOkAndHolds(CorpusAssociatedScoreData( + /*num_docs=*/2, /*sum_length_in_tokens=*/8))); + + // Delete document 1 EXPECT_THAT(doc_store->Delete("icing", "email/1"), IsOk()); EXPECT_THAT(doc_store->Get(document_id1), StatusIs(libtextclassifier3::StatusCode::NOT_FOUND)); @@ -1281,9 +1326,14 @@ TEST_F(DocumentStoreTest, ShouldRecoverFromDataLoss) { /*namespace_id=*/0, /*schema_type_id=*/0, document2_expiration_timestamp_))); // Checks derived score cache - EXPECT_THAT(doc_store->GetDocumentAssociatedScoreData(document_id2), - IsOkAndHolds(DocumentAssociatedScoreData( - document2_score_, document2_creation_timestamp_))); + EXPECT_THAT( + doc_store->GetDocumentAssociatedScoreData(document_id2), + IsOkAndHolds(DocumentAssociatedScoreData( + /*corpus_id=*/0, document2_score_, document2_creation_timestamp_, + /*length_in_tokens=*/4))); + EXPECT_THAT(doc_store->GetCorpusAssociatedScoreData(/*corpus_id=*/0), + IsOkAndHolds(CorpusAssociatedScoreData( + /*num_docs=*/1, /*sum_length_in_tokens=*/4))); } TEST_F(DocumentStoreTest, ShouldRecoverFromCorruptDerivedFile) { @@ -1297,14 +1347,31 @@ TEST_F(DocumentStoreTest, ShouldRecoverFromCorruptDerivedFile) { std::unique_ptr<DocumentStore> doc_store = std::move(create_result.document_store); - ICING_ASSERT_OK_AND_ASSIGN(document_id1, - doc_store->Put(DocumentProto(test_document1_))); - ICING_ASSERT_OK_AND_ASSIGN(document_id2, - doc_store->Put(DocumentProto(test_document2_))); + ICING_ASSERT_OK_AND_ASSIGN( + document_id1, + doc_store->Put(DocumentProto(test_document1_), /*num_tokens=*/4)); + ICING_ASSERT_OK_AND_ASSIGN( + document_id2, + doc_store->Put(DocumentProto(test_document2_), /*num_tokens=*/4)); EXPECT_THAT(doc_store->Get(document_id1), IsOkAndHolds(EqualsProto(test_document1_))); EXPECT_THAT(doc_store->Get(document_id2), IsOkAndHolds(EqualsProto(test_document2_))); + // Checks derived score cache + EXPECT_THAT( + doc_store->GetDocumentAssociatedScoreData(document_id1), + IsOkAndHolds(DocumentAssociatedScoreData( + /*corpus_id=*/0, document1_score_, document1_creation_timestamp_, + /*length_in_tokens=*/4))); + EXPECT_THAT( + doc_store->GetDocumentAssociatedScoreData(document_id2), + IsOkAndHolds(DocumentAssociatedScoreData( + /*corpus_id=*/0, document2_score_, document2_creation_timestamp_, + /*length_in_tokens=*/4))); + EXPECT_THAT(doc_store->GetCorpusAssociatedScoreData(/*corpus_id=*/0), + IsOkAndHolds(CorpusAssociatedScoreData( + /*num_docs=*/2, /*sum_length_in_tokens=*/8))); + // Delete document 1 EXPECT_THAT(doc_store->Delete("icing", "email/1"), IsOk()); EXPECT_THAT(doc_store->Get(document_id1), StatusIs(libtextclassifier3::StatusCode::NOT_FOUND)); @@ -1328,6 +1395,7 @@ TEST_F(DocumentStoreTest, ShouldRecoverFromCorruptDerivedFile) { IsOk()); // Successfully recover from a corrupt derived file issue. + // NOTE: this doesn't trigger RegenerateDerivedFiles. ICING_ASSERT_OK_AND_ASSIGN( DocumentStore::CreateResult create_result, DocumentStore::Create(&filesystem_, document_store_dir_, &fake_clock_, @@ -1345,10 +1413,16 @@ TEST_F(DocumentStoreTest, ShouldRecoverFromCorruptDerivedFile) { IsOkAndHolds(DocumentFilterData( /*namespace_id=*/0, /*schema_type_id=*/0, document2_expiration_timestamp_))); - // Checks derived score cache - EXPECT_THAT(doc_store->GetDocumentAssociatedScoreData(document_id2), - IsOkAndHolds(DocumentAssociatedScoreData( - document2_score_, document2_creation_timestamp_))); + // Checks derived score cache - note that they aren't regenerated from + // scratch. + EXPECT_THAT( + doc_store->GetDocumentAssociatedScoreData(document_id2), + IsOkAndHolds(DocumentAssociatedScoreData( + /*corpus_id=*/0, document2_score_, document2_creation_timestamp_, + /*length_in_tokens=*/4))); + EXPECT_THAT(doc_store->GetCorpusAssociatedScoreData(/*corpus_id=*/0), + IsOkAndHolds(CorpusAssociatedScoreData( + /*num_docs=*/2, /*sum_length_in_tokens=*/8))); } TEST_F(DocumentStoreTest, ShouldRecoverFromBadChecksum) { @@ -1362,14 +1436,30 @@ TEST_F(DocumentStoreTest, ShouldRecoverFromBadChecksum) { std::unique_ptr<DocumentStore> doc_store = std::move(create_result.document_store); - ICING_ASSERT_OK_AND_ASSIGN(document_id1, - doc_store->Put(DocumentProto(test_document1_))); - ICING_ASSERT_OK_AND_ASSIGN(document_id2, - doc_store->Put(DocumentProto(test_document2_))); + ICING_ASSERT_OK_AND_ASSIGN( + document_id1, + doc_store->Put(DocumentProto(test_document1_), /*num_tokens=*/4)); + ICING_ASSERT_OK_AND_ASSIGN( + document_id2, + doc_store->Put(DocumentProto(test_document2_), /*num_tokens=*/4)); EXPECT_THAT(doc_store->Get(document_id1), IsOkAndHolds(EqualsProto(test_document1_))); EXPECT_THAT(doc_store->Get(document_id2), IsOkAndHolds(EqualsProto(test_document2_))); + // Checks derived score cache + EXPECT_THAT( + doc_store->GetDocumentAssociatedScoreData(document_id1), + IsOkAndHolds(DocumentAssociatedScoreData( + /*corpus_id=*/0, document1_score_, document1_creation_timestamp_, + /*length_in_tokens=*/4))); + EXPECT_THAT( + doc_store->GetDocumentAssociatedScoreData(document_id2), + IsOkAndHolds(DocumentAssociatedScoreData( + /*corpus_id=*/0, document2_score_, document2_creation_timestamp_, + /*length_in_tokens=*/4))); + EXPECT_THAT(doc_store->GetCorpusAssociatedScoreData(/*corpus_id=*/0), + IsOkAndHolds(CorpusAssociatedScoreData( + /*num_docs=*/2, /*sum_length_in_tokens=*/8))); EXPECT_THAT(doc_store->Delete("icing", "email/1"), IsOk()); EXPECT_THAT(doc_store->Get(document_id1), StatusIs(libtextclassifier3::StatusCode::NOT_FOUND)); @@ -1407,9 +1497,14 @@ TEST_F(DocumentStoreTest, ShouldRecoverFromBadChecksum) { /*namespace_id=*/0, /*schema_type_id=*/0, document2_expiration_timestamp_))); // Checks derived score cache - EXPECT_THAT(doc_store->GetDocumentAssociatedScoreData(document_id2), - IsOkAndHolds(DocumentAssociatedScoreData( - document2_score_, document2_creation_timestamp_))); + EXPECT_THAT( + doc_store->GetDocumentAssociatedScoreData(document_id2), + IsOkAndHolds(DocumentAssociatedScoreData( + /*corpus_id=*/0, document2_score_, document2_creation_timestamp_, + /*length_in_tokens=*/4))); + EXPECT_THAT(doc_store->GetCorpusAssociatedScoreData(/*corpus_id=*/0), + IsOkAndHolds(CorpusAssociatedScoreData( + /*num_docs=*/1, /*sum_length_in_tokens=*/4))); } TEST_F(DocumentStoreTest, GetDiskUsage) { @@ -1544,28 +1639,6 @@ TEST_F(DocumentStoreTest, NonexistentNamespaceNotFound) { StatusIs(libtextclassifier3::StatusCode::NOT_FOUND)); } -TEST_F(DocumentStoreTest, GetCorpusIdReturnsNotFoundWhenFeatureIsDisabled) { - setEnableBm25f(false); - ICING_ASSERT_OK_AND_ASSIGN( - DocumentStore::CreateResult create_result, - DocumentStore::Create(&filesystem_, document_store_dir_, &fake_clock_, - schema_store_.get())); - std::unique_ptr<DocumentStore> doc_store = - std::move(create_result.document_store); - - DocumentProto document1 = - DocumentBuilder().SetKey("namespace", "1").SetSchema("email").Build(); - DocumentProto document2 = - DocumentBuilder().SetKey("namespace", "2").SetSchema("email").Build(); - - ICING_ASSERT_OK(doc_store->Put(document1)); - ICING_ASSERT_OK(doc_store->Put(document2)); - - EXPECT_THAT(doc_store->GetCorpusId("namespace", "email"), - StatusIs(libtextclassifier3::StatusCode::NOT_FOUND, - HasSubstr("corpus_mapper disabled"))); -} - TEST_F(DocumentStoreTest, GetCorpusDuplicateCorpusId) { ICING_ASSERT_OK_AND_ASSIGN( DocumentStore::CreateResult create_result, @@ -1582,7 +1655,7 @@ TEST_F(DocumentStoreTest, GetCorpusDuplicateCorpusId) { ICING_ASSERT_OK(doc_store->Put(document1)); ICING_ASSERT_OK(doc_store->Put(document2)); - // NamespaceId of 0 since it was the first namespace seen by the DocumentStore + // CorpusId of 0 since it was the first namespace seen by the DocumentStore EXPECT_THAT(doc_store->GetCorpusId("namespace", "email"), IsOkAndHolds(Eq(0))); } @@ -1642,6 +1715,183 @@ TEST_F(DocumentStoreTest, NonexistentCorpusNotFound) { StatusIs(libtextclassifier3::StatusCode::NOT_FOUND)); EXPECT_THAT(doc_store->GetCorpusId("namespace1", "nonexistent_schema"), StatusIs(libtextclassifier3::StatusCode::NOT_FOUND)); + EXPECT_THAT(doc_store->GetCorpusAssociatedScoreData(/*corpus_id=*/1), + StatusIs(libtextclassifier3::StatusCode::OUT_OF_RANGE)); +} + +TEST_F(DocumentStoreTest, GetCorpusAssociatedScoreDataSameCorpus) { + ICING_ASSERT_OK_AND_ASSIGN( + DocumentStore::CreateResult create_result, + DocumentStore::Create(&filesystem_, document_store_dir_, &fake_clock_, + schema_store_.get())); + std::unique_ptr<DocumentStore> doc_store = + std::move(create_result.document_store); + + DocumentProto document1 = + DocumentBuilder().SetKey("namespace", "1").SetSchema("email").Build(); + DocumentProto document2 = + DocumentBuilder().SetKey("namespace", "2").SetSchema("email").Build(); + + ICING_ASSERT_OK(doc_store->Put(document1, /*num_tokens=*/5)); + ICING_ASSERT_OK(doc_store->Put(document2, /*num_tokens=*/7)); + + // CorpusId of 0 since it was the first namespace seen by the DocumentStore + EXPECT_THAT(doc_store->GetCorpusAssociatedScoreData(/*corpus_id=*/0), + IsOkAndHolds(CorpusAssociatedScoreData( + /*num_docs=*/2, /*sum_length_in_tokens=*/12))); + // Only one corpus exists + EXPECT_THAT(doc_store->GetCorpusAssociatedScoreData(/*corpus_id=*/1), + StatusIs(libtextclassifier3::StatusCode::OUT_OF_RANGE)); +} + +TEST_F(DocumentStoreTest, GetCorpusAssociatedScoreData) { + ICING_ASSERT_OK_AND_ASSIGN( + DocumentStore::CreateResult create_result, + DocumentStore::Create(&filesystem_, document_store_dir_, &fake_clock_, + schema_store_.get())); + std::unique_ptr<DocumentStore> doc_store = + std::move(create_result.document_store); + + DocumentProto document_corpus1 = + DocumentBuilder().SetKey("namespace1", "1").SetSchema("email").Build(); + DocumentProto document_corpus2 = + DocumentBuilder().SetKey("namespace2", "2").SetSchema("email").Build(); + + ICING_ASSERT_OK( + doc_store->Put(DocumentProto(document_corpus1), /*num_tokens=*/5)); + ICING_ASSERT_OK( + doc_store->Put(DocumentProto(document_corpus2), /*num_tokens=*/7)); + + // CorpusId of 0 since it was the first corpus seen by the DocumentStore + EXPECT_THAT(doc_store->GetCorpusAssociatedScoreData(/*corpus_id=*/0), + IsOkAndHolds(CorpusAssociatedScoreData( + /*num_docs=*/1, /*sum_length_in_tokens=*/5))); + + // CorpusId of 1 since it was the second corpus seen by the + // DocumentStore + EXPECT_THAT(doc_store->GetCorpusAssociatedScoreData(/*corpus_id=*/1), + IsOkAndHolds(CorpusAssociatedScoreData( + /*num_docs=*/1, /*sum_length_in_tokens=*/7))); + + // DELETE namespace1 - document_corpus1 is deleted. + ICING_EXPECT_OK(doc_store->DeleteByNamespace("namespace1").status); + + // Corpus score cache doesn't care if the document has been deleted + EXPECT_THAT(doc_store->GetCorpusAssociatedScoreData(/*corpus_id=*/0), + IsOkAndHolds(CorpusAssociatedScoreData( + /*num_docs=*/1, /*sum_length_in_tokens=*/5))); +} + +TEST_F(DocumentStoreTest, NonexistentCorpusAssociatedScoreDataOutOfRange) { + ICING_ASSERT_OK_AND_ASSIGN( + DocumentStore::CreateResult create_result, + DocumentStore::Create(&filesystem_, document_store_dir_, &fake_clock_, + schema_store_.get())); + std::unique_ptr<DocumentStore> doc_store = + std::move(create_result.document_store); + + EXPECT_THAT(doc_store->GetCorpusAssociatedScoreData(/*corpus_id=*/0), + StatusIs(libtextclassifier3::StatusCode::OUT_OF_RANGE)); +} + +TEST_F(DocumentStoreTest, GetDocumentAssociatedScoreDataSameCorpus) { + ICING_ASSERT_OK_AND_ASSIGN( + DocumentStore::CreateResult create_result, + DocumentStore::Create(&filesystem_, document_store_dir_, &fake_clock_, + schema_store_.get())); + std::unique_ptr<DocumentStore> doc_store = + std::move(create_result.document_store); + + DocumentProto document1 = + DocumentBuilder() + .SetKey("namespace", "1") + .SetSchema("email") + .SetScore(document1_score_) + .SetCreationTimestampMs( + document1_creation_timestamp_) // A random timestamp + .Build(); + DocumentProto document2 = + DocumentBuilder() + .SetKey("namespace", "2") + .SetSchema("email") + .SetScore(document2_score_) + .SetCreationTimestampMs( + document2_creation_timestamp_) // A random timestamp + .Build(); + + ICING_ASSERT_OK_AND_ASSIGN( + DocumentId document_id1, + doc_store->Put(DocumentProto(document1), /*num_tokens=*/5)); + ICING_ASSERT_OK_AND_ASSIGN( + DocumentId document_id2, + doc_store->Put(DocumentProto(document2), /*num_tokens=*/7)); + + EXPECT_THAT( + doc_store->GetDocumentAssociatedScoreData(document_id1), + IsOkAndHolds(DocumentAssociatedScoreData( + /*corpus_id=*/0, document1_score_, document1_creation_timestamp_, + /*length_in_tokens=*/5))); + EXPECT_THAT( + doc_store->GetDocumentAssociatedScoreData(document_id2), + IsOkAndHolds(DocumentAssociatedScoreData( + /*corpus_id=*/0, document2_score_, document2_creation_timestamp_, + /*length_in_tokens=*/7))); +} + +TEST_F(DocumentStoreTest, GetCorpusAssociatedScoreDataDifferentCorpus) { + ICING_ASSERT_OK_AND_ASSIGN( + DocumentStore::CreateResult create_result, + DocumentStore::Create(&filesystem_, document_store_dir_, &fake_clock_, + schema_store_.get())); + std::unique_ptr<DocumentStore> doc_store = + std::move(create_result.document_store); + + DocumentProto document1 = + DocumentBuilder() + .SetKey("namespace1", "1") + .SetSchema("email") + .SetScore(document1_score_) + .SetCreationTimestampMs( + document1_creation_timestamp_) // A random timestamp + .Build(); + DocumentProto document2 = + DocumentBuilder() + .SetKey("namespace2", "2") + .SetSchema("email") + .SetScore(document2_score_) + .SetCreationTimestampMs( + document2_creation_timestamp_) // A random timestamp + .Build(); + + ICING_ASSERT_OK_AND_ASSIGN( + DocumentId document_id1, + doc_store->Put(DocumentProto(document1), /*num_tokens=*/5)); + ICING_ASSERT_OK_AND_ASSIGN( + DocumentId document_id2, + doc_store->Put(DocumentProto(document2), /*num_tokens=*/7)); + + EXPECT_THAT( + doc_store->GetDocumentAssociatedScoreData(document_id1), + IsOkAndHolds(DocumentAssociatedScoreData( + /*corpus_id=*/0, document1_score_, document1_creation_timestamp_, + /*length_in_tokens=*/5))); + EXPECT_THAT( + doc_store->GetDocumentAssociatedScoreData(document_id2), + IsOkAndHolds(DocumentAssociatedScoreData( + /*corpus_id=*/1, document2_score_, document2_creation_timestamp_, + /*length_in_tokens=*/7))); +} + +TEST_F(DocumentStoreTest, NonexistentDocumentAssociatedScoreDataOutOfRange) { + ICING_ASSERT_OK_AND_ASSIGN( + DocumentStore::CreateResult create_result, + DocumentStore::Create(&filesystem_, document_store_dir_, &fake_clock_, + schema_store_.get())); + std::unique_ptr<DocumentStore> doc_store = + std::move(create_result.document_store); + + EXPECT_THAT(doc_store->GetDocumentAssociatedScoreData(/*document_id=*/0), + StatusIs(libtextclassifier3::StatusCode::OUT_OF_RANGE)); } TEST_F(DocumentStoreTest, SoftDeletionDoesNotClearFilterCache) { @@ -1700,12 +1950,13 @@ TEST_F(DocumentStoreTest, SoftDeletionDoesNotClearScoreCache) { std::move(create_result.document_store); ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id, - doc_store->Put(test_document1_)); + doc_store->Put(test_document1_, /*num_tokens=*/4)); EXPECT_THAT(doc_store->GetDocumentAssociatedScoreData(document_id), IsOkAndHolds(DocumentAssociatedScoreData( - /*document_score=*/document1_score_, - /*creation_timestamp_ms=*/document1_creation_timestamp_))); + /*corpus_id=*/0, /*document_score=*/document1_score_, + /*creation_timestamp_ms=*/document1_creation_timestamp_, + /*length_in_tokens=*/4))); ICING_ASSERT_OK(doc_store->Delete("icing", "email/1", /*soft_delete=*/true)); // Associated entry of the deleted document is removed. @@ -1722,12 +1973,14 @@ TEST_F(DocumentStoreTest, HardDeleteClearsScoreCache) { std::move(create_result.document_store); ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id, - doc_store->Put(test_document1_)); + doc_store->Put(test_document1_, /*num_tokens=*/4)); EXPECT_THAT(doc_store->GetDocumentAssociatedScoreData(document_id), IsOkAndHolds(DocumentAssociatedScoreData( + /*corpus_id=*/0, /*document_score=*/document1_score_, - /*creation_timestamp_ms=*/document1_creation_timestamp_))); + /*creation_timestamp_ms=*/document1_creation_timestamp_, + /*length_in_tokens=*/4))); ICING_ASSERT_OK(doc_store->Delete("icing", "email/1", /*soft_delete=*/false)); // Associated entry of the deleted document is removed. @@ -1931,11 +2184,15 @@ TEST_F(DocumentStoreTest, ShouldWriteAndReadScoresCorrectly) { EXPECT_THAT(doc_store->GetDocumentAssociatedScoreData(document_id1), IsOkAndHolds(DocumentAssociatedScoreData( - /*document_score=*/0, /*creation_timestamp_ms=*/0))); + /*corpus_id=*/0, + /*document_score=*/0, /*creation_timestamp_ms=*/0, + /*length_in_tokens=*/0))); EXPECT_THAT(doc_store->GetDocumentAssociatedScoreData(document_id2), IsOkAndHolds(DocumentAssociatedScoreData( - /*document_score=*/5, /*creation_timestamp_ms=*/0))); + /*corpus_id=*/0, + /*document_score=*/5, /*creation_timestamp_ms=*/0, + /*length_in_tokens=*/0))); } TEST_F(DocumentStoreTest, ComputeChecksumSameBetweenCalls) { @@ -2636,7 +2893,8 @@ TEST_F(DocumentStoreTest, GetOptimizeInfo) { std::string optimized_dir = document_store_dir_ + "_optimize"; EXPECT_TRUE(filesystem_.DeleteDirectoryRecursively(optimized_dir.c_str())); EXPECT_TRUE(filesystem_.CreateDirectoryRecursively(optimized_dir.c_str())); - ICING_ASSERT_OK(document_store->OptimizeInto(optimized_dir)); + ICING_ASSERT_OK( + document_store->OptimizeInto(optimized_dir, lang_segmenter_.get())); document_store.reset(); ICING_ASSERT_OK_AND_ASSIGN( create_result, DocumentStore::Create(&filesystem_, optimized_dir, @@ -3046,7 +3304,8 @@ TEST_F(DocumentStoreTest, UsageScoresShouldPersistOnOptimize) { // Run optimize std::string optimized_dir = document_store_dir_ + "/optimize_test"; filesystem_.CreateDirectoryRecursively(optimized_dir.c_str()); - ICING_ASSERT_OK(document_store->OptimizeInto(optimized_dir)); + ICING_ASSERT_OK( + document_store->OptimizeInto(optimized_dir, lang_segmenter_.get())); // Get optimized document store ICING_ASSERT_OK_AND_ASSIGN( @@ -3149,9 +3408,9 @@ TEST_F(DocumentStoreTest, LoadScoreCacheAndInitializeSuccessfully) { // the current code is compatible with the format of the v0 scoring_cache, // then an empty document store should be initialized, but the non-empty // scoring_cache should be retained. - // Since the current document-asscoiated-score-data is compatible with the - // score_cache in testdata/v0/document_store, the document store should be - // initialized without having to re-generate the derived files. + // The current document-asscoiated-score-data has a new field with respect to + // the ones stored in testdata/v0, hence the document store's initialization + // requires regenerating its derived files. // Create dst directory ASSERT_THAT(filesystem_.CreateDirectory(document_store_dir_.c_str()), true); @@ -3186,9 +3445,10 @@ TEST_F(DocumentStoreTest, LoadScoreCacheAndInitializeSuccessfully) { schema_store_.get(), &initializeStats)); std::unique_ptr<DocumentStore> doc_store = std::move(create_result.document_store); - // Regeneration never happens. - EXPECT_EQ(initializeStats.document_store_recovery_cause(), - NativeInitializeStats::NONE); + // The store_cache trigger regeneration because its element size is + // inconsistent: expected 20 (current new size), actual 12 (as per the v0 + // score_cache). + EXPECT_TRUE(initializeStats.has_document_store_recovery_cause()); } } // namespace diff --git a/icing/store/enable-bm25f.h b/icing/store/enable-bm25f.h deleted file mode 100644 index cee94d1..0000000 --- a/icing/store/enable-bm25f.h +++ /dev/null @@ -1,31 +0,0 @@ -// Copyright (C) 2020 Google LLC -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#ifndef ICING_STORE_ENABLE_BM25F_H_ -#define ICING_STORE_ENABLE_BM25F_H_ - -namespace icing { -namespace lib { - -inline bool enable_bm25f_ = false; - -inline bool enableBm25f() { return enable_bm25f_; } - -// Setter for testing purposes. It should never be called in production code. -inline void setEnableBm25f(bool enable_bm25f) { enable_bm25f_ = enable_bm25f; } - -} // namespace lib -} // namespace icing - -#endif // ICING_STORE_ENABLE_BM25F_H_ diff --git a/icing/store/usage-store.cc b/icing/store/usage-store.cc index 7a0af9c..54896dc 100644 --- a/icing/store/usage-store.cc +++ b/icing/store/usage-store.cc @@ -214,6 +214,10 @@ libtextclassifier3::StatusOr<Crc32> UsageStore::ComputeChecksum() { return usage_score_cache_->ComputeChecksum(); } +libtextclassifier3::StatusOr<int64_t> UsageStore::GetElementsFileSize() const { + return usage_score_cache_->GetElementsFileSize(); +} + libtextclassifier3::Status UsageStore::TruncateTo(DocumentId num_documents) { if (num_documents >= usage_score_cache_->num_elements()) { // No need to truncate diff --git a/icing/store/usage-store.h b/icing/store/usage-store.h index 0a622a0..b7de970 100644 --- a/icing/store/usage-store.h +++ b/icing/store/usage-store.h @@ -148,6 +148,15 @@ class UsageStore { // INTERNAL_ERROR if the internal state is inconsistent libtextclassifier3::StatusOr<Crc32> ComputeChecksum(); + // Returns the file size of the all the elements held in the UsageStore. File + // size is in bytes. This excludes the size of any internal metadata, e.g. any + // internal headers. + // + // Returns: + // File size on success + // INTERNAL_ERROR on IO error + libtextclassifier3::StatusOr<int64_t> GetElementsFileSize() const; + // Resizes the storage so that only the usage scores of and before // last_document_id are stored. // diff --git a/icing/store/usage-store_test.cc b/icing/store/usage-store_test.cc index f7fa778..220c226 100644 --- a/icing/store/usage-store_test.cc +++ b/icing/store/usage-store_test.cc @@ -24,6 +24,7 @@ namespace lib { namespace { using ::testing::Eq; +using ::testing::Gt; using ::testing::Not; class UsageStoreTest : public testing::Test { @@ -560,6 +561,22 @@ TEST_F(UsageStoreTest, StoreShouldBeResetOnHeaderChecksumMismatch) { IsOkAndHolds(UsageStore::UsageScores())); } +TEST_F(UsageStoreTest, GetElementsFileSize) { + ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<UsageStore> usage_store, + UsageStore::Create(&filesystem_, test_dir_)); + + ICING_ASSERT_OK_AND_ASSIGN(int64_t empty_file_size, + usage_store->GetElementsFileSize()); + EXPECT_THAT(empty_file_size, Eq(0)); + + UsageReport usage_report = CreateUsageReport( + "namespace", "uri", /*timestamp_ms=*/1000, UsageReport::USAGE_TYPE1); + usage_store->AddUsageReport(usage_report, /*document_id=*/1); + + EXPECT_THAT(usage_store->GetElementsFileSize(), + IsOkAndHolds(Gt(empty_file_size))); +} + } // namespace } // namespace lib |