about summary refs log tree commit diff
path: root/icing/store
diff options
context:
space:
mode:
author Tim Barron <tjbarron@google.com> 2021-01-14 20:53:07 +0000
committer Tim Barron <tjbarron@google.com> 2021-01-14 20:53:07 +0000
commit a34db390d80f862bfaaa49dea3605c5fec3bca3d (patch)
tree 67a4a87803cf2b31619c3ddff3674967fc1461ce /icing/store
parent 59c2caa38fd8dca3760dad751f4f8e5de8be25f5 (diff)
download icing-a34db390d80f862bfaaa49dea3605c5fec3bca3d.tar.gz
Update Icing from upstream.
Change-Id: I43038a59e7170fb8ecbaf6098a37221b3682ce09
Diffstat (limited to 'icing/store')
-rw-r--r-- icing/store/corpus-associated-scoring-data.h 79
-rw-r--r-- icing/store/corpus-id.h 2
-rw-r--r-- icing/store/document-associated-score-data.h 34
-rw-r--r-- icing/store/document-store.cc 284
-rw-r--r-- icing/store/document-store.h 77
-rw-r--r-- icing/store/document-store_test.cc 396
-rw-r--r-- icing/store/enable-bm25f.h 31
-rw-r--r-- icing/store/usage-store.cc 4
-rw-r--r-- icing/store/usage-store.h 9
-rw-r--r-- icing/store/usage-store_test.cc 17
10 files changed, 732 insertions, 201 deletions
diff --git a/icing/store/corpus-associated-scoring-data.h b/icing/store/corpus-associated-scoring-data.h
new file mode 100644
index 0000000..52be5cd
--- /dev/null
+++ b/icing/store/corpus-associated-scoring-data.h
@@ -0,0 +1,79 @@
+// Copyright (C) 2020 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef ICING_STORE_TYPE_NAMESPACE_ASSOCIATED_SCORING_DATA_H_
+#define ICING_STORE_TYPE_NAMESPACE_ASSOCIATED_SCORING_DATA_H_
+
+#include <cstdint>
+#include <limits>
+#include <type_traits>
+
+#include "icing/legacy/core/icing-packed-pod.h"
+
+namespace icing {
+namespace lib {
+
+// This is the cache entity of corpus-associated scores. The ground-truth data
+// is stored somewhere else. The cache includes:
+// 1. Number of documents contained in the corpus.
+// Positive values are required.
+// 2. The sum of the documents' lengths, in number of tokens.
+class CorpusAssociatedScoreData {
+ public:
+  // Constructs the cache entry. Both values default to zero so a
+  // default-constructed instance represents an empty corpus.
+  explicit CorpusAssociatedScoreData(int num_docs = 0,
+                                     int64_t sum_length_in_tokens = 0)
+      : sum_length_in_tokens_(sum_length_in_tokens), num_docs_(num_docs) {}
+
+  // Two entries are equal iff both the document count and the token-length
+  // sum match.
+  bool operator==(const CorpusAssociatedScoreData& other) const {
+    return num_docs_ == other.num_docs() &&
+           sum_length_in_tokens_ == other.sum_length_in_tokens();
+  }
+
+  // NOTE(review): these accessors expose unsigned types (uint32_t/uint64_t)
+  // while the underlying members are signed int, and the constructor accepts
+  // an int64_t that is narrowed into an int member. Values above INT_MAX (or
+  // negative members) convert modularly on the way in/out — confirm callers
+  // never rely on values outside the non-negative int range.
+  uint32_t num_docs() const { return num_docs_; }
+  void set_num_docs(uint32_t val) { num_docs_ = val; }
+
+  uint64_t sum_length_in_tokens() const { return sum_length_in_tokens_; }
+  void set_sum_length_in_tokens(uint64_t val) { sum_length_in_tokens_ = val; }
+
+  // Average document length in tokens. The denominator is num_docs_ + 1, so
+  // the division is defined (returns 0.0f) for an empty corpus; it also means
+  // the returned average is slightly lower than sum / num_docs_ for
+  // non-empty corpora.
+  float average_doc_length_in_tokens() const {
+    return sum_length_in_tokens_ / (1.0f + num_docs_);
+  }
+
+  // Adds a new document.
+  // Adds the document's length to the total length of the corpus,
+  // sum_length_in_tokens_ — increments the document count, and saturates the
+  // sum at std::numeric_limits<int>::max() instead of overflowing.
+  // NOTE(review): the overflow test mixes int and uint32_t operands, so the
+  // comparison is evaluated in unsigned arithmetic — verify the intended
+  // behavior when doc_length_in_tokens exceeds INT_MAX or when
+  // sum_length_in_tokens_ could be negative.
+  void AddDocument(uint32_t doc_length_in_tokens) {
+    ++num_docs_;
+    sum_length_in_tokens_ =
+        (std::numeric_limits<int>::max() - doc_length_in_tokens <
+         sum_length_in_tokens_)
+            ? std::numeric_limits<int>::max()
+            : sum_length_in_tokens_ + doc_length_in_tokens;
+  }
+
+ private:
+  // The sum total of the length of all documents in the corpus.
+  int sum_length_in_tokens_;
+  // Number of documents contained in the corpus.
+  int num_docs_;
+} __attribute__((packed));
+
+static_assert(sizeof(CorpusAssociatedScoreData) == 8,
+ "Size of CorpusAssociatedScoreData should be 8");
+static_assert(icing_is_packed_pod<CorpusAssociatedScoreData>::value,
+ "go/icing-ubsan");
+
+} // namespace lib
+} // namespace icing
+
+#endif // ICING_STORE_TYPE_NAMESPACE_ASSOCIATED_SCORING_DATA_H_
diff --git a/icing/store/corpus-id.h b/icing/store/corpus-id.h
index a8f21ba..01135b9 100644
--- a/icing/store/corpus-id.h
+++ b/icing/store/corpus-id.h
@@ -24,6 +24,8 @@ namespace lib {
// DocumentProto. Generated in DocumentStore.
using CorpusId = int32_t;
+inline constexpr CorpusId kInvalidCorpusId = -1;
+
} // namespace lib
} // namespace icing
diff --git a/icing/store/document-associated-score-data.h b/icing/store/document-associated-score-data.h
index b9039c5..9a711c8 100644
--- a/icing/store/document-associated-score-data.h
+++ b/icing/store/document-associated-score-data.h
@@ -19,6 +19,7 @@
#include <type_traits>
#include "icing/legacy/core/icing-packed-pod.h"
+#include "icing/store/corpus-id.h"
namespace icing {
namespace lib {
@@ -26,33 +27,46 @@ namespace lib {
// This is the cache entity of document-associated scores. It contains scores
// that are related to the document itself. The ground-truth data is stored
// somewhere else. The cache includes:
-// 1. Document score. It's defined in and passed from DocumentProto.score.
+// 1. Corpus Id.
+// 2. Document score. It's defined in and passed from DocumentProto.score.
// Positive values are required.
-// 2. Document creation timestamp. Unix timestamp of when the document is
+// 3. Document creation timestamp. Unix timestamp of when the document is
// created and inserted into Icing.
+// 4. Document length in number of tokens.
class DocumentAssociatedScoreData {
public:
- explicit DocumentAssociatedScoreData(int document_score,
- int64_t creation_timestamp_ms)
- : document_score_(document_score),
- creation_timestamp_ms_(creation_timestamp_ms) {}
+ explicit DocumentAssociatedScoreData(CorpusId corpus_id, int document_score,
+ int64_t creation_timestamp_ms,
+ int length_in_tokens = 0)
+ : creation_timestamp_ms_(creation_timestamp_ms),
+ corpus_id_(corpus_id),
+ document_score_(document_score),
+ length_in_tokens_(length_in_tokens) {}
bool operator==(const DocumentAssociatedScoreData& other) const {
return document_score_ == other.document_score() &&
- creation_timestamp_ms_ == other.creation_timestamp_ms();
+ creation_timestamp_ms_ == other.creation_timestamp_ms() &&
+ length_in_tokens_ == other.length_in_tokens() &&
+ corpus_id_ == other.corpus_id();
}
+ CorpusId corpus_id() const { return corpus_id_; }
+
int document_score() const { return document_score_; }
int64_t creation_timestamp_ms() const { return creation_timestamp_ms_; }
+ int length_in_tokens() const { return length_in_tokens_; }
+
private:
- int document_score_;
int64_t creation_timestamp_ms_;
+ CorpusId corpus_id_;
+ int document_score_;
+ int length_in_tokens_;
} __attribute__((packed));
-static_assert(sizeof(DocumentAssociatedScoreData) == 12,
- "Size of DocumentAssociatedScoreData should be 12");
+static_assert(sizeof(DocumentAssociatedScoreData) == 20,
+ "Size of DocumentAssociatedScoreData should be 20");
static_assert(icing_is_packed_pod<DocumentAssociatedScoreData>::value,
"go/icing-ubsan");
diff --git a/icing/store/document-store.cc b/icing/store/document-store.cc
index 6a664a3..72bf736 100644
--- a/icing/store/document-store.cc
+++ b/icing/store/document-store.cc
@@ -37,18 +37,20 @@
#include "icing/proto/document_wrapper.pb.h"
#include "icing/proto/logging.pb.h"
#include "icing/schema/schema-store.h"
+#include "icing/store/corpus-associated-scoring-data.h"
#include "icing/store/corpus-id.h"
#include "icing/store/document-associated-score-data.h"
#include "icing/store/document-filter-data.h"
#include "icing/store/document-id.h"
-#include "icing/store/enable-bm25f.h"
#include "icing/store/key-mapper.h"
#include "icing/store/namespace-id.h"
+#include "icing/tokenization/language-segmenter.h"
#include "icing/util/clock.h"
#include "icing/util/crc32.h"
#include "icing/util/data-loss.h"
#include "icing/util/logging.h"
#include "icing/util/status-macros.h"
+#include "icing/util/tokenized-document.h"
namespace icing {
namespace lib {
@@ -61,6 +63,7 @@ constexpr char kDocumentLogFilename[] = "document_log";
constexpr char kDocumentIdMapperFilename[] = "document_id_mapper";
constexpr char kDocumentStoreHeaderFilename[] = "document_store_header";
constexpr char kScoreCacheFilename[] = "score_cache";
+constexpr char kCorpusScoreCache[] = "corpus_score_cache";
constexpr char kFilterCacheFilename[] = "filter_cache";
constexpr char kNamespaceMapperFilename[] = "namespace_mapper";
constexpr char kUsageStoreDirectoryName[] = "usage_store";
@@ -122,6 +125,10 @@ std::string MakeScoreCacheFilename(const std::string& base_dir) {
return absl_ports::StrCat(base_dir, "/", kScoreCacheFilename);
}
+std::string MakeCorpusScoreCache(const std::string& base_dir) {
+ return absl_ports::StrCat(base_dir, "/", kCorpusScoreCache);
+}
+
std::string MakeFilterCacheFilename(const std::string& base_dir) {
return absl_ports::StrCat(base_dir, "/", kFilterCacheFilename);
}
@@ -195,8 +202,16 @@ DocumentStore::DocumentStore(const Filesystem* filesystem,
document_validator_(schema_store) {}
libtextclassifier3::StatusOr<DocumentId> DocumentStore::Put(
- const DocumentProto& document, NativePutDocumentStats* put_document_stats) {
- return Put(DocumentProto(document), put_document_stats);
+ const DocumentProto& document, int32_t num_tokens,
+ NativePutDocumentStats* put_document_stats) {
+ return Put(DocumentProto(document), num_tokens, put_document_stats);
+}
+
+libtextclassifier3::StatusOr<DocumentId> DocumentStore::Put(
+ DocumentProto&& document, int32_t num_tokens,
+ NativePutDocumentStats* put_document_stats) {
+ document.mutable_internal_fields()->set_length_in_tokens(num_tokens);
+ return InternalPut(document, put_document_stats);
}
DocumentStore::~DocumentStore() {
@@ -366,12 +381,15 @@ libtextclassifier3::Status DocumentStore::InitializeDerivedFiles() {
usage_store_,
UsageStore::Create(filesystem_, MakeUsageStoreDirectoryName(base_dir_)));
- if (enableBm25f()) {
- ICING_ASSIGN_OR_RETURN(
- corpus_mapper_, KeyMapper<CorpusId>::Create(
- *filesystem_, MakeCorpusMapperFilename(base_dir_),
- kCorpusMapperMaxSize));
- }
+ ICING_ASSIGN_OR_RETURN(corpus_mapper_,
+ KeyMapper<CorpusId>::Create(
+ *filesystem_, MakeCorpusMapperFilename(base_dir_),
+ kCorpusMapperMaxSize));
+
+ ICING_ASSIGN_OR_RETURN(corpus_score_cache_,
+ FileBackedVector<CorpusAssociatedScoreData>::Create(
+ *filesystem_, MakeCorpusScoreCache(base_dir_),
+ MemoryMappedFile::READ_WRITE_AUTO_SYNC));
// Ensure the usage store is the correct size.
ICING_RETURN_IF_ERROR(
@@ -392,9 +410,8 @@ libtextclassifier3::Status DocumentStore::RegenerateDerivedFiles() {
ICING_RETURN_IF_ERROR(ResetDocumentAssociatedScoreCache());
ICING_RETURN_IF_ERROR(ResetFilterCache());
ICING_RETURN_IF_ERROR(ResetNamespaceMapper());
- if (enableBm25f()) {
- ICING_RETURN_IF_ERROR(ResetCorpusMapper());
- }
+ ICING_RETURN_IF_ERROR(ResetCorpusMapper());
+ ICING_RETURN_IF_ERROR(ResetCorpusAssociatedScoreCache());
// Creates a new UsageStore instance. Note that we don't reset the data in
// usage store here because we're not able to regenerate the usage scores.
@@ -506,12 +523,6 @@ libtextclassifier3::Status DocumentStore::RegenerateDerivedFiles() {
ICING_RETURN_IF_ERROR(
document_id_mapper_->Set(new_document_id, iterator.GetOffset()));
- ICING_RETURN_IF_ERROR(UpdateDocumentAssociatedScoreCache(
- new_document_id,
- DocumentAssociatedScoreData(
- document_wrapper.document().score(),
- document_wrapper.document().creation_timestamp_ms())));
-
SchemaTypeId schema_type_id;
auto schema_type_id_or =
schema_store_->GetSchemaTypeId(document_wrapper.document().schema());
@@ -536,13 +547,30 @@ libtextclassifier3::Status DocumentStore::RegenerateDerivedFiles() {
namespace_mapper_->GetOrPut(document_wrapper.document().namespace_(),
namespace_mapper_->num_keys()));
- if (enableBm25f()) {
- // Update corpus maps
- std::string corpus =
- MakeFingerprint(document_wrapper.document().namespace_(),
- document_wrapper.document().schema());
- corpus_mapper_->GetOrPut(corpus, corpus_mapper_->num_keys());
- }
+ // Update corpus maps
+ std::string corpus =
+ MakeFingerprint(document_wrapper.document().namespace_(),
+ document_wrapper.document().schema());
+ ICING_ASSIGN_OR_RETURN(
+ CorpusId corpusId,
+ corpus_mapper_->GetOrPut(corpus, corpus_mapper_->num_keys()));
+
+ ICING_ASSIGN_OR_RETURN(CorpusAssociatedScoreData scoring_data,
+ GetCorpusAssociatedScoreDataToUpdate(corpusId));
+ scoring_data.AddDocument(
+ document_wrapper.document().internal_fields().length_in_tokens());
+
+ ICING_RETURN_IF_ERROR(
+ UpdateCorpusAssociatedScoreCache(corpusId, scoring_data));
+
+ ICING_RETURN_IF_ERROR(UpdateDocumentAssociatedScoreCache(
+ new_document_id,
+ DocumentAssociatedScoreData(
+ corpusId, document_wrapper.document().score(),
+ document_wrapper.document().creation_timestamp_ms(),
+ document_wrapper.document()
+ .internal_fields()
+ .length_in_tokens())));
int64_t expiration_timestamp_ms = CalculateExpirationTimestampMs(
document_wrapper.document().creation_timestamp_ms(),
@@ -638,6 +666,18 @@ libtextclassifier3::Status DocumentStore::ResetDocumentAssociatedScoreCache() {
return libtextclassifier3::Status::OK;
}
+libtextclassifier3::Status DocumentStore::ResetCorpusAssociatedScoreCache() {
+ // TODO(b/139734457): Replace ptr.reset()->Delete->Create flow with Reset().
+ corpus_score_cache_.reset();
+ ICING_RETURN_IF_ERROR(FileBackedVector<CorpusAssociatedScoreData>::Delete(
+ *filesystem_, MakeCorpusScoreCache(base_dir_)));
+ ICING_ASSIGN_OR_RETURN(corpus_score_cache_,
+ FileBackedVector<CorpusAssociatedScoreData>::Create(
+ *filesystem_, MakeCorpusScoreCache(base_dir_),
+ MemoryMappedFile::READ_WRITE_AUTO_SYNC));
+ return libtextclassifier3::Status::OK;
+}
+
libtextclassifier3::Status DocumentStore::ResetFilterCache() {
// TODO(b/139734457): Replace ptr.reset()->Delete->Create flow with Reset().
filter_cache_.reset();
@@ -671,23 +711,21 @@ libtextclassifier3::Status DocumentStore::ResetNamespaceMapper() {
}
libtextclassifier3::Status DocumentStore::ResetCorpusMapper() {
- if (enableBm25f()) {
- // TODO(b/139734457): Replace ptr.reset()->Delete->Create flow with Reset().
- corpus_mapper_.reset();
- // TODO(b/144458732): Implement a more robust version of TC_RETURN_IF_ERROR
- // that can support error logging.
- libtextclassifier3::Status status = KeyMapper<CorpusId>::Delete(
- *filesystem_, MakeCorpusMapperFilename(base_dir_));
- if (!status.ok()) {
- ICING_LOG(ERROR) << status.error_message()
- << "Failed to delete old corpus_id mapper";
- return status;
- }
- ICING_ASSIGN_OR_RETURN(
- corpus_mapper_, KeyMapper<CorpusId>::Create(
- *filesystem_, MakeCorpusMapperFilename(base_dir_),
- kCorpusMapperMaxSize));
+ // TODO(b/139734457): Replace ptr.reset()->Delete->Create flow with Reset().
+ corpus_mapper_.reset();
+ // TODO(b/144458732): Implement a more robust version of TC_RETURN_IF_ERROR
+ // that can support error logging.
+ libtextclassifier3::Status status = KeyMapper<CorpusId>::Delete(
+ *filesystem_, MakeCorpusMapperFilename(base_dir_));
+ if (!status.ok()) {
+ ICING_LOG(ERROR) << status.error_message()
+ << "Failed to delete old corpus_id mapper";
+ return status;
}
+ ICING_ASSIGN_OR_RETURN(corpus_mapper_,
+ KeyMapper<CorpusId>::Create(
+ *filesystem_, MakeCorpusMapperFilename(base_dir_),
+ kCorpusMapperMaxSize));
return libtextclassifier3::Status::OK;
}
@@ -738,16 +776,26 @@ libtextclassifier3::StatusOr<Crc32> DocumentStore::ComputeChecksum() const {
Crc32 namespace_mapper_checksum = namespace_mapper_->ComputeChecksum();
+ Crc32 corpus_mapper_checksum = corpus_mapper_->ComputeChecksum();
+
+ // TODO(b/144458732): Implement a more robust version of TC_ASSIGN_OR_RETURN
+ // that can support error logging.
+ checksum_or = corpus_score_cache_->ComputeChecksum();
+ if (!checksum_or.ok()) {
+ ICING_LOG(WARNING) << checksum_or.status().error_message()
+ << "Failed to compute checksum of score cache";
+ return checksum_or.status();
+ }
+ Crc32 corpus_score_cache_checksum = std::move(checksum_or).ValueOrDie();
+
total_checksum.Append(std::to_string(document_log_checksum.Get()));
total_checksum.Append(std::to_string(document_key_mapper_checksum.Get()));
total_checksum.Append(std::to_string(document_id_mapper_checksum.Get()));
total_checksum.Append(std::to_string(score_cache_checksum.Get()));
total_checksum.Append(std::to_string(filter_cache_checksum.Get()));
total_checksum.Append(std::to_string(namespace_mapper_checksum.Get()));
- if (enableBm25f()) {
- Crc32 corpus_mapper_checksum = corpus_mapper_->ComputeChecksum();
- total_checksum.Append(std::to_string(corpus_mapper_checksum.Get()));
- }
+ total_checksum.Append(std::to_string(corpus_mapper_checksum.Get()));
+ total_checksum.Append(std::to_string(corpus_score_cache_checksum.Get()));
return total_checksum;
}
@@ -779,8 +827,8 @@ libtextclassifier3::Status DocumentStore::UpdateHeader(const Crc32& checksum) {
return libtextclassifier3::Status::OK;
}
-libtextclassifier3::StatusOr<DocumentId> DocumentStore::Put(
- DocumentProto&& document, NativePutDocumentStats* put_document_stats) {
+libtextclassifier3::StatusOr<DocumentId> DocumentStore::InternalPut(
+ DocumentProto& document, NativePutDocumentStats* put_document_stats) {
std::unique_ptr<Timer> put_timer = clock_.GetNewTimer();
ICING_RETURN_IF_ERROR(document_validator_.Validate(document));
@@ -793,6 +841,7 @@ libtextclassifier3::StatusOr<DocumentId> DocumentStore::Put(
std::string uri = document.uri();
std::string schema = document.schema();
int document_score = document.score();
+ int32_t length_in_tokens = document.internal_fields().length_in_tokens();
int64_t creation_timestamp_ms = document.creation_timestamp_ms();
// Sets the creation timestamp if caller hasn't specified.
@@ -829,20 +878,28 @@ libtextclassifier3::StatusOr<DocumentId> DocumentStore::Put(
MakeFingerprint(name_space, uri), new_document_id));
ICING_RETURN_IF_ERROR(document_id_mapper_->Set(new_document_id, file_offset));
- ICING_RETURN_IF_ERROR(UpdateDocumentAssociatedScoreCache(
- new_document_id,
- DocumentAssociatedScoreData(document_score, creation_timestamp_ms)));
-
// Update namespace maps
ICING_ASSIGN_OR_RETURN(
NamespaceId namespace_id,
namespace_mapper_->GetOrPut(name_space, namespace_mapper_->num_keys()));
- if (enableBm25f()) {
- // Update corpus maps
- ICING_RETURN_IF_ERROR(corpus_mapper_->GetOrPut(
- MakeFingerprint(name_space, schema), corpus_mapper_->num_keys()));
- }
+ // Update corpus maps
+ ICING_ASSIGN_OR_RETURN(
+ CorpusId corpusId,
+ corpus_mapper_->GetOrPut(MakeFingerprint(name_space, schema),
+ corpus_mapper_->num_keys()));
+
+ ICING_ASSIGN_OR_RETURN(CorpusAssociatedScoreData scoring_data,
+ GetCorpusAssociatedScoreDataToUpdate(corpusId));
+ scoring_data.AddDocument(length_in_tokens);
+
+ ICING_RETURN_IF_ERROR(
+ UpdateCorpusAssociatedScoreCache(corpusId, scoring_data));
+
+ ICING_RETURN_IF_ERROR(UpdateDocumentAssociatedScoreCache(
+ new_document_id,
+ DocumentAssociatedScoreData(corpusId, document_score,
+ creation_timestamp_ms, length_in_tokens)));
ICING_ASSIGN_OR_RETURN(SchemaTypeId schema_type_id,
schema_store_->GetSchemaTypeId(schema));
@@ -876,7 +933,8 @@ libtextclassifier3::StatusOr<DocumentId> DocumentStore::Put(
}
libtextclassifier3::StatusOr<DocumentProto> DocumentStore::Get(
- const std::string_view name_space, const std::string_view uri) const {
+ const std::string_view name_space, const std::string_view uri,
+ bool clear_internal_fields) const {
// TODO(b/147231617): Make a better way to replace the error message in an
// existing Status.
auto document_id_or = GetDocumentId(name_space, uri);
@@ -903,7 +961,7 @@ libtextclassifier3::StatusOr<DocumentProto> DocumentStore::Get(
}
libtextclassifier3::StatusOr<DocumentProto> DocumentStore::Get(
- DocumentId document_id) const {
+ DocumentId document_id, bool clear_internal_fields) const {
ICING_ASSIGN_OR_RETURN(int64_t document_log_offset,
DoesDocumentExistAndGetFileOffset(document_id));
@@ -917,6 +975,9 @@ libtextclassifier3::StatusOr<DocumentProto> DocumentStore::Get(
}
DocumentWrapper document_wrapper =
std::move(document_wrapper_or).ValueOrDie();
+ if (clear_internal_fields) {
+ document_wrapper.mutable_document()->clear_internal_fields();
+ }
return std::move(*document_wrapper.mutable_document());
}
@@ -1088,10 +1149,7 @@ libtextclassifier3::StatusOr<NamespaceId> DocumentStore::GetNamespaceId(
libtextclassifier3::StatusOr<CorpusId> DocumentStore::GetCorpusId(
const std::string_view name_space, const std::string_view schema) const {
- if (enableBm25f()) {
- return corpus_mapper_->Get(MakeFingerprint(name_space, schema));
- }
- return absl_ports::NotFoundError("corpus_mapper disabled");
+ return corpus_mapper_->Get(MakeFingerprint(name_space, schema));
}
libtextclassifier3::StatusOr<DocumentAssociatedScoreData>
@@ -1112,6 +1170,34 @@ DocumentStore::GetDocumentAssociatedScoreData(DocumentId document_id) const {
return document_associated_score_data;
}
+libtextclassifier3::StatusOr<CorpusAssociatedScoreData>
+DocumentStore::GetCorpusAssociatedScoreData(CorpusId corpus_id) const {
+ auto score_data_or = corpus_score_cache_->Get(corpus_id);
+ if (!score_data_or.ok()) {
+ return score_data_or.status();
+ }
+
+ CorpusAssociatedScoreData corpus_associated_score_data =
+ *std::move(score_data_or).ValueOrDie();
+ return corpus_associated_score_data;
+}
+
+libtextclassifier3::StatusOr<CorpusAssociatedScoreData>
+DocumentStore::GetCorpusAssociatedScoreDataToUpdate(CorpusId corpus_id) const {
+ auto corpus_scoring_data_or = GetCorpusAssociatedScoreData(corpus_id);
+ if (corpus_scoring_data_or.ok()) {
+ return std::move(corpus_scoring_data_or).ValueOrDie();
+ }
+ CorpusAssociatedScoreData scoringData;
+ // OUT_OF_RANGE is the StatusCode returned when a corpus id is added to
+ // corpus_score_cache_ for the first time.
+ if (corpus_scoring_data_or.status().CanonicalCode() ==
+ libtextclassifier3::StatusCode::OUT_OF_RANGE) {
+ return scoringData;
+ }
+ return corpus_scoring_data_or.status();
+}
+
libtextclassifier3::StatusOr<DocumentFilterData>
DocumentStore::GetDocumentFilterData(DocumentId document_id) const {
auto filter_data_or = filter_cache_->Get(document_id);
@@ -1308,10 +1394,8 @@ libtextclassifier3::Status DocumentStore::PersistToDisk() {
ICING_RETURN_IF_ERROR(filter_cache_->PersistToDisk());
ICING_RETURN_IF_ERROR(namespace_mapper_->PersistToDisk());
ICING_RETURN_IF_ERROR(usage_store_->PersistToDisk());
-
- if (enableBm25f()) {
- ICING_RETURN_IF_ERROR(corpus_mapper_->PersistToDisk());
- }
+ ICING_RETURN_IF_ERROR(corpus_mapper_->PersistToDisk());
+ ICING_RETURN_IF_ERROR(corpus_score_cache_->PersistToDisk());
// Update the combined checksum and write to header file.
ICING_ASSIGN_OR_RETURN(Crc32 checksum, ComputeChecksum());
@@ -1333,16 +1417,16 @@ libtextclassifier3::StatusOr<int64_t> DocumentStore::GetDiskUsage() const {
filter_cache_->GetDiskUsage());
ICING_ASSIGN_OR_RETURN(const int64_t namespace_mapper_disk_usage,
namespace_mapper_->GetDiskUsage());
+ ICING_ASSIGN_OR_RETURN(const int64_t corpus_mapper_disk_usage,
+ corpus_mapper_->GetDiskUsage());
+ ICING_ASSIGN_OR_RETURN(const int64_t corpus_score_cache_disk_usage,
+ corpus_score_cache_->GetDiskUsage());
int64_t disk_usage = document_log_disk_usage +
document_key_mapper_disk_usage +
document_id_mapper_disk_usage + score_cache_disk_usage +
- filter_cache_disk_usage + namespace_mapper_disk_usage;
- if (enableBm25f()) {
- ICING_ASSIGN_OR_RETURN(const int64_t corpus_mapper_disk_usage,
- corpus_mapper_->GetDiskUsage());
- disk_usage += corpus_mapper_disk_usage;
- }
+ filter_cache_disk_usage + namespace_mapper_disk_usage +
+ corpus_mapper_disk_usage + corpus_score_cache_disk_usage;
return disk_usage;
}
@@ -1493,7 +1577,7 @@ libtextclassifier3::Status DocumentStore::Optimize() {
}
libtextclassifier3::Status DocumentStore::OptimizeInto(
- const std::string& new_directory) {
+ const std::string& new_directory, const LanguageSegmenter* lang_segmenter) {
// Validates directory
if (new_directory == base_dir_) {
return absl_ports::InvalidArgumentError(
@@ -1509,7 +1593,7 @@ libtextclassifier3::Status DocumentStore::OptimizeInto(
// Writes all valid docs into new document store (new directory)
int size = document_id_mapper_->num_elements();
for (DocumentId document_id = 0; document_id < size; document_id++) {
- auto document_or = Get(document_id);
+ auto document_or = Get(document_id, /*clear_internal_fields=*/false);
if (absl_ports::IsNotFound(document_or.status())) {
// Skip nonexistent documents
continue;
@@ -1523,9 +1607,26 @@ libtextclassifier3::Status DocumentStore::OptimizeInto(
// Guaranteed to have a document now.
DocumentProto document_to_keep = document_or.ValueOrDie();
- // TODO(b/144458732): Implement a more robust version of TC_ASSIGN_OR_RETURN
- // that can support error logging.
- auto new_document_id_or = new_doc_store->Put(std::move(document_to_keep));
+
+ libtextclassifier3::StatusOr<DocumentId> new_document_id_or;
+ if (document_to_keep.internal_fields().length_in_tokens() == 0) {
+ auto tokenized_document_or = TokenizedDocument::Create(
+ schema_store_, lang_segmenter, document_to_keep);
+ if (!tokenized_document_or.ok()) {
+ return absl_ports::Annotate(
+ tokenized_document_or.status(),
+ IcingStringUtil::StringPrintf(
+ "Failed to tokenize Document for DocumentId %d", document_id));
+ }
+ TokenizedDocument tokenized_document(
+ std::move(tokenized_document_or).ValueOrDie());
+ new_document_id_or =
+ new_doc_store->Put(document_to_keep, tokenized_document.num_tokens());
+ } else {
+ // TODO(b/144458732): Implement a more robust version of
+ // TC_ASSIGN_OR_RETURN that can support error logging.
+ new_document_id_or = new_doc_store->InternalPut(document_to_keep);
+ }
if (!new_document_id_or.ok()) {
ICING_LOG(ERROR) << new_document_id_or.status().error_message()
<< "Failed to write into new document store";
@@ -1577,26 +1678,39 @@ DocumentStore::GetOptimizeInfo() const {
score_cache_->GetElementsFileSize());
ICING_ASSIGN_OR_RETURN(const int64_t filter_cache_file_size,
filter_cache_->GetElementsFileSize());
+ ICING_ASSIGN_OR_RETURN(const int64_t corpus_score_cache_file_size,
+ corpus_score_cache_->GetElementsFileSize());
+
+ // Usage store might be sparse, but we'll still use file size for more
+ // accurate counting.
+ ICING_ASSIGN_OR_RETURN(const int64_t usage_store_file_size,
+ usage_store_->GetElementsFileSize());
// We use a combined disk usage and file size for the KeyMapper because it's
// backed by a trie, which has some sparse property bitmaps.
ICING_ASSIGN_OR_RETURN(const int64_t document_key_mapper_size,
document_key_mapper_->GetElementsSize());
- // We don't include the namespace mapper because it's not clear if we could
- // recover any space even if Optimize were called. Deleting 100s of documents
- // could still leave a few documents of a namespace, and then there would be
- // no change.
+ // We don't include the namespace_mapper or the corpus_mapper because it's not
+ // clear if we could recover any space even if Optimize were called. Deleting
+ // 100s of documents could still leave a few documents of a namespace, and
+ // then there would be no change.
int64_t total_size = document_log_file_size + document_key_mapper_size +
document_id_mapper_file_size + score_cache_file_size +
- filter_cache_file_size;
+ filter_cache_file_size + corpus_score_cache_file_size +
+ usage_store_file_size;
optimize_info.estimated_optimizable_bytes =
total_size * optimize_info.optimizable_docs / optimize_info.total_docs;
return optimize_info;
}
+libtextclassifier3::Status DocumentStore::UpdateCorpusAssociatedScoreCache(
+ CorpusId corpus_id, const CorpusAssociatedScoreData& score_data) {
+ return corpus_score_cache_->Set(corpus_id, score_data);
+}
+
libtextclassifier3::Status DocumentStore::UpdateDocumentAssociatedScoreCache(
DocumentId document_id, const DocumentAssociatedScoreData& score_data) {
return score_cache_->Set(document_id, score_data);
@@ -1617,8 +1731,10 @@ libtextclassifier3::Status DocumentStore::ClearDerivedData(
// Resets the score cache entry
ICING_RETURN_IF_ERROR(UpdateDocumentAssociatedScoreCache(
- document_id, DocumentAssociatedScoreData(/*document_score=*/-1,
- /*creation_timestamp_ms=*/-1)));
+ document_id, DocumentAssociatedScoreData(kInvalidCorpusId,
+ /*document_score=*/-1,
+ /*creation_timestamp_ms=*/-1,
+ /*length_in_tokens=*/0)));
// Resets the filter cache entry
ICING_RETURN_IF_ERROR(UpdateFilterCache(
diff --git a/icing/store/document-store.h b/icing/store/document-store.h
index 78590a5..b2908f0 100644
--- a/icing/store/document-store.h
+++ b/icing/store/document-store.h
@@ -30,6 +30,7 @@
#include "icing/proto/document_wrapper.pb.h"
#include "icing/proto/logging.pb.h"
#include "icing/schema/schema-store.h"
+#include "icing/store/corpus-associated-scoring-data.h"
#include "icing/store/corpus-id.h"
#include "icing/store/document-associated-score-data.h"
#include "icing/store/document-filter-data.h"
@@ -37,6 +38,7 @@
#include "icing/store/key-mapper.h"
#include "icing/store/namespace-id.h"
#include "icing/store/usage-store.h"
+#include "icing/tokenization/language-segmenter.h"
#include "icing/util/clock.h"
#include "icing/util/crc32.h"
#include "icing/util/data-loss.h"
@@ -149,23 +151,27 @@ class DocumentStore {
// exist in schema
// INTERNAL_ERROR on IO error
libtextclassifier3::StatusOr<DocumentId> Put(
- const DocumentProto& document,
+ const DocumentProto& document, int32_t num_tokens = 0,
NativePutDocumentStats* put_document_stats = nullptr);
libtextclassifier3::StatusOr<DocumentId> Put(
- DocumentProto&& document,
+ DocumentProto&& document, int32_t num_tokens = 0,
NativePutDocumentStats* put_document_stats = nullptr);
// Finds and returns the document identified by the given key (namespace +
- // uri)
+ // uri). If 'clear_internal_fields' is true, document level data that's
+ // generated internally by DocumentStore is cleared.
//
// Returns:
// The document found on success
// NOT_FOUND if the key doesn't exist or document has been deleted
// INTERNAL_ERROR on IO error
- libtextclassifier3::StatusOr<DocumentProto> Get(std::string_view name_space,
- std::string_view uri) const;
+ libtextclassifier3::StatusOr<DocumentProto> Get(
+ std::string_view name_space, std::string_view uri,
+ bool clear_internal_fields = true) const;
- // Finds and returns the document identified by the given document id
+ // Finds and returns the document identified by the given document id. If
+ // 'clear_internal_fields' is true, document level data that's generated
+ // internally by DocumentStore is cleared.
//
// Returns:
// The document found on success
@@ -173,7 +179,8 @@ class DocumentStore {
// maximum value
// NOT_FOUND if the document doesn't exist or has been deleted
// INTERNAL_ERROR on IO error
- libtextclassifier3::StatusOr<DocumentProto> Get(DocumentId document_id) const;
+ libtextclassifier3::StatusOr<DocumentProto> Get(
+ DocumentId document_id, bool clear_internal_fields = true) const;
// Returns all namespaces which have at least 1 active document (not deleted
// or expired). Order of namespaces is undefined.
@@ -256,6 +263,20 @@ class DocumentStore {
libtextclassifier3::StatusOr<DocumentAssociatedScoreData>
GetDocumentAssociatedScoreData(DocumentId document_id) const;
+ // Returns the CorpusAssociatedScoreData of the corpus specified by the
+ // corpus_id.
+ //
+ // NOTE: This does not check if the corpus exists and will return the
+ // CorpusAssociatedScoreData of the corpus even if all documents belonging to
+ // that corpus have been deleted.
+ //
+ // Returns:
+ // CorpusAssociatedScoreData on success
+ // OUT_OF_RANGE if corpus_id is negative or exceeds previously seen
+ // CorpusIds
+ libtextclassifier3::StatusOr<CorpusAssociatedScoreData>
+ GetCorpusAssociatedScoreData(CorpusId corpus_id) const;
+
// Returns the DocumentFilterData of the document specified by the DocumentId.
//
// NOTE: This does not check if the document exists and will return the
@@ -394,7 +415,9 @@ class DocumentStore {
// OK on success
// INVALID_ARGUMENT if new_directory is same as current base directory
// INTERNAL_ERROR on IO error
- libtextclassifier3::Status OptimizeInto(const std::string& new_directory);
+ libtextclassifier3::Status OptimizeInto(
+ const std::string& new_directory,
+ const LanguageSegmenter* lang_segmenter);
// Calculates status for a potential Optimize call. Includes how many docs
// there are vs how many would be optimized away. And also includes an
@@ -441,8 +464,10 @@ class DocumentStore {
// A cache of document associated scores. The ground truth of the scores is
// DocumentProto stored in document_log_. This cache contains:
+ // - CorpusId
// - Document score
// - Document creation timestamp in seconds
+ // - Document length in number of tokens
std::unique_ptr<FileBackedVector<DocumentAssociatedScoreData>> score_cache_;
// A cache of data, indexed by DocumentId, used to filter documents. Currently
@@ -452,6 +477,13 @@ class DocumentStore {
// - Expiration timestamp in seconds
std::unique_ptr<FileBackedVector<DocumentFilterData>> filter_cache_;
+ // A cache of corpus associated scores. The ground truth of the scores is
+ // DocumentProto stored in document_log_. This cache contains:
+ // - Number of documents belonging to the corpus
+ // - The sum of the documents' lengths, in number of tokens.
+ std::unique_ptr<FileBackedVector<CorpusAssociatedScoreData>>
+ corpus_score_cache_;
+
// Maps namespaces to a densely-assigned unique id. Namespaces are assigned an
// id when the first document belonging to that namespace is added to the
// DocumentStore. Namespaces may be removed from the mapper during compaction.
@@ -516,6 +548,12 @@ class DocumentStore {
// Returns OK or any IO errors.
libtextclassifier3::Status ResetDocumentAssociatedScoreCache();
+ // Resets the unique_ptr to the corpus_score_cache, deletes the underlying
+ // file, and re-creates a new instance of the corpus_score_cache.
+ //
+ // Returns OK or any IO errors.
+ libtextclassifier3::Status ResetCorpusAssociatedScoreCache();
+
// Resets the unique_ptr to the filter_cache, deletes the underlying file, and
// re-creates a new instance of the filter_cache.
//
@@ -546,6 +584,10 @@ class DocumentStore {
// INTERNAL on I/O error
libtextclassifier3::Status UpdateHeader(const Crc32& checksum);
+ libtextclassifier3::StatusOr<DocumentId> InternalPut(
+ DocumentProto& document,
+ NativePutDocumentStats* put_document_stats = nullptr);
+
// Helper function to do batch deletes. Documents with the given
// "namespace_id" and "schema_type_id" will be deleted. If callers don't need
// to specify the namespace or schema type, pass in kInvalidNamespaceId or
@@ -597,6 +639,21 @@ class DocumentStore {
libtextclassifier3::StatusOr<DocumentId> GetDocumentId(
std::string_view name_space, std::string_view uri) const;
+ // Returns the CorpusAssociatedScoreData of the corpus specified by the
+ // corpus_id.
+ //
+ // If the corpus_id has never been seen before, it returns a
+ // CorpusAssociatedScoreData with properties set to default values.
+ //
+ // NOTE: This does not check if the corpus exists and will return the
+ // CorpusAssociatedScoreData of the corpus even if all documents belonging to
+ // that corpus have been deleted.
+ //
+ // Returns:
+ // CorpusAssociatedScoreData on success
+ libtextclassifier3::StatusOr<CorpusAssociatedScoreData>
+ GetCorpusAssociatedScoreDataToUpdate(CorpusId corpus_id) const;
+
// Helper method to validate the document id and return the file offset of the
// associated document in document_log_.
//
@@ -617,6 +674,10 @@ class DocumentStore {
libtextclassifier3::Status UpdateDocumentAssociatedScoreCache(
DocumentId document_id, const DocumentAssociatedScoreData& score_data);
+ // Updates the entry in the corpus score cache for corpus_id.
+ libtextclassifier3::Status UpdateCorpusAssociatedScoreCache(
+ CorpusId corpus_id, const CorpusAssociatedScoreData& score_data);
+
// Updates the entry in the filter cache for document_id.
libtextclassifier3::Status UpdateFilterCache(
DocumentId document_id, const DocumentFilterData& filter_data);
diff --git a/icing/store/document-store_test.cc b/icing/store/document-store_test.cc
index 29bf8bb..7754373 100644
--- a/icing/store/document-store_test.cc
+++ b/icing/store/document-store_test.cc
@@ -27,20 +27,25 @@
#include "icing/file/filesystem.h"
#include "icing/file/memory-mapped-file.h"
#include "icing/file/mock-filesystem.h"
+#include "icing/helpers/icu/icu-data-file-helper.h"
#include "icing/portable/equals-proto.h"
#include "icing/proto/document.pb.h"
#include "icing/proto/schema.pb.h"
#include "icing/schema/schema-store.h"
+#include "icing/store/corpus-associated-scoring-data.h"
+#include "icing/store/corpus-id.h"
#include "icing/store/document-filter-data.h"
#include "icing/store/document-id.h"
-#include "icing/store/enable-bm25f.h"
#include "icing/store/namespace-id.h"
#include "icing/testing/common-matchers.h"
#include "icing/testing/fake-clock.h"
#include "icing/testing/platform.h"
#include "icing/testing/test-data.h"
#include "icing/testing/tmp-directory.h"
+#include "icing/tokenization/language-segmenter-factory.h"
+#include "icing/tokenization/language-segmenter.h"
#include "icing/util/crc32.h"
+#include "unicode/uloc.h"
namespace icing {
namespace lib {
@@ -101,7 +106,19 @@ class DocumentStoreTest : public ::testing::Test {
}
void SetUp() override {
- setEnableBm25f(true);
+ if (!IsCfStringTokenization() && !IsReverseJniTokenization()) {
+ // If we've specified using the reverse-JNI method for segmentation (i.e.
+ // not ICU), then we won't have the ICU data file included to set up.
+ // Technically, we could choose to use reverse-JNI for segmentation AND
+ // include an ICU data file, but that seems unlikely and our current BUILD
+ // setup doesn't do this.
+ // File generated via icu_data_file rule in //icing/BUILD.
+ std::string icu_data_file_path =
+ GetTestFilePath("icing/icu.dat");
+ ICING_ASSERT_OK(
+ icu_data_file_helper::SetUpICUDataFile(icu_data_file_path));
+ }
+
filesystem_.DeleteDirectoryRecursively(test_dir_.c_str());
filesystem_.CreateDirectoryRecursively(test_dir_.c_str());
filesystem_.CreateDirectoryRecursively(document_store_dir_.c_str());
@@ -133,6 +150,11 @@ class DocumentStoreTest : public ::testing::Test {
schema_store_,
SchemaStore::Create(&filesystem_, schema_store_dir_, &fake_clock_));
ASSERT_THAT(schema_store_->SetSchema(schema), IsOk());
+
+ language_segmenter_factory::SegmenterOptions segmenter_options(ULOC_US);
+ ICING_ASSERT_OK_AND_ASSIGN(
+ lang_segmenter_,
+ language_segmenter_factory::Create(std::move(segmenter_options)));
}
void TearDown() override {
@@ -147,6 +169,7 @@ class DocumentStoreTest : public ::testing::Test {
DocumentProto test_document1_;
DocumentProto test_document2_;
std::unique_ptr<SchemaStore> schema_store_;
+ std::unique_ptr<LanguageSegmenter> lang_segmenter_;
// Document1 values
const int document1_score_ = 1;
@@ -1184,9 +1207,10 @@ TEST_F(DocumentStoreTest, OptimizeInto) {
filesystem_.GetFileSize(original_document_log.c_str());
// Optimizing into the same directory is not allowed
- EXPECT_THAT(doc_store->OptimizeInto(document_store_dir_),
- StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT,
- HasSubstr("directory is the same")));
+ EXPECT_THAT(
+ doc_store->OptimizeInto(document_store_dir_, lang_segmenter_.get()),
+ StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT,
+ HasSubstr("directory is the same")));
std::string optimized_dir = document_store_dir_ + "_optimize";
std::string optimized_document_log = optimized_dir + "/document_log";
@@ -1195,7 +1219,8 @@ TEST_F(DocumentStoreTest, OptimizeInto) {
// deleted
ASSERT_TRUE(filesystem_.DeleteDirectoryRecursively(optimized_dir.c_str()));
ASSERT_TRUE(filesystem_.CreateDirectoryRecursively(optimized_dir.c_str()));
- ICING_ASSERT_OK(doc_store->OptimizeInto(optimized_dir));
+ ICING_ASSERT_OK(
+ doc_store->OptimizeInto(optimized_dir, lang_segmenter_.get()));
int64_t optimized_size1 =
filesystem_.GetFileSize(optimized_document_log.c_str());
EXPECT_EQ(original_size, optimized_size1);
@@ -1205,7 +1230,8 @@ TEST_F(DocumentStoreTest, OptimizeInto) {
ASSERT_TRUE(filesystem_.DeleteDirectoryRecursively(optimized_dir.c_str()));
ASSERT_TRUE(filesystem_.CreateDirectoryRecursively(optimized_dir.c_str()));
ICING_ASSERT_OK(doc_store->Delete("namespace", "uri1"));
- ICING_ASSERT_OK(doc_store->OptimizeInto(optimized_dir));
+ ICING_ASSERT_OK(
+ doc_store->OptimizeInto(optimized_dir, lang_segmenter_.get()));
int64_t optimized_size2 =
filesystem_.GetFileSize(optimized_document_log.c_str());
EXPECT_THAT(original_size, Gt(optimized_size2));
@@ -1218,7 +1244,8 @@ TEST_F(DocumentStoreTest, OptimizeInto) {
// expired
ASSERT_TRUE(filesystem_.DeleteDirectoryRecursively(optimized_dir.c_str()));
ASSERT_TRUE(filesystem_.CreateDirectoryRecursively(optimized_dir.c_str()));
- ICING_ASSERT_OK(doc_store->OptimizeInto(optimized_dir));
+ ICING_ASSERT_OK(
+ doc_store->OptimizeInto(optimized_dir, lang_segmenter_.get()));
int64_t optimized_size3 =
filesystem_.GetFileSize(optimized_document_log.c_str());
EXPECT_THAT(optimized_size2, Gt(optimized_size3));
@@ -1235,14 +1262,32 @@ TEST_F(DocumentStoreTest, ShouldRecoverFromDataLoss) {
std::unique_ptr<DocumentStore> doc_store =
std::move(create_result.document_store);
- ICING_ASSERT_OK_AND_ASSIGN(document_id1,
- doc_store->Put(DocumentProto(test_document1_)));
- ICING_ASSERT_OK_AND_ASSIGN(document_id2,
- doc_store->Put(DocumentProto(test_document2_)));
+ ICING_ASSERT_OK_AND_ASSIGN(
+ document_id1,
+ doc_store->Put(DocumentProto(test_document1_), /*num_tokens=*/4));
+ ICING_ASSERT_OK_AND_ASSIGN(
+ document_id2,
+ doc_store->Put(DocumentProto(test_document2_), /*num_tokens=*/4));
EXPECT_THAT(doc_store->Get(document_id1),
IsOkAndHolds(EqualsProto(test_document1_)));
EXPECT_THAT(doc_store->Get(document_id2),
IsOkAndHolds(EqualsProto(test_document2_)));
+ // Checks derived score cache
+ EXPECT_THAT(
+ doc_store->GetDocumentAssociatedScoreData(document_id1),
+ IsOkAndHolds(DocumentAssociatedScoreData(
+ /*corpus_id=*/0, document1_score_, document1_creation_timestamp_,
+ /*length_in_tokens=*/4)));
+ EXPECT_THAT(
+ doc_store->GetDocumentAssociatedScoreData(document_id2),
+ IsOkAndHolds(DocumentAssociatedScoreData(
+ /*corpus_id=*/0, document2_score_, document2_creation_timestamp_,
+ /*length_in_tokens=*/4)));
+ EXPECT_THAT(doc_store->GetCorpusAssociatedScoreData(/*corpus_id=*/0),
+ IsOkAndHolds(CorpusAssociatedScoreData(
+ /*num_docs=*/2, /*sum_length_in_tokens=*/8)));
+
+ // Delete document 1
EXPECT_THAT(doc_store->Delete("icing", "email/1"), IsOk());
EXPECT_THAT(doc_store->Get(document_id1),
StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
@@ -1281,9 +1326,14 @@ TEST_F(DocumentStoreTest, ShouldRecoverFromDataLoss) {
/*namespace_id=*/0,
/*schema_type_id=*/0, document2_expiration_timestamp_)));
// Checks derived score cache
- EXPECT_THAT(doc_store->GetDocumentAssociatedScoreData(document_id2),
- IsOkAndHolds(DocumentAssociatedScoreData(
- document2_score_, document2_creation_timestamp_)));
+ EXPECT_THAT(
+ doc_store->GetDocumentAssociatedScoreData(document_id2),
+ IsOkAndHolds(DocumentAssociatedScoreData(
+ /*corpus_id=*/0, document2_score_, document2_creation_timestamp_,
+ /*length_in_tokens=*/4)));
+ EXPECT_THAT(doc_store->GetCorpusAssociatedScoreData(/*corpus_id=*/0),
+ IsOkAndHolds(CorpusAssociatedScoreData(
+ /*num_docs=*/1, /*sum_length_in_tokens=*/4)));
}
TEST_F(DocumentStoreTest, ShouldRecoverFromCorruptDerivedFile) {
@@ -1297,14 +1347,31 @@ TEST_F(DocumentStoreTest, ShouldRecoverFromCorruptDerivedFile) {
std::unique_ptr<DocumentStore> doc_store =
std::move(create_result.document_store);
- ICING_ASSERT_OK_AND_ASSIGN(document_id1,
- doc_store->Put(DocumentProto(test_document1_)));
- ICING_ASSERT_OK_AND_ASSIGN(document_id2,
- doc_store->Put(DocumentProto(test_document2_)));
+ ICING_ASSERT_OK_AND_ASSIGN(
+ document_id1,
+ doc_store->Put(DocumentProto(test_document1_), /*num_tokens=*/4));
+ ICING_ASSERT_OK_AND_ASSIGN(
+ document_id2,
+ doc_store->Put(DocumentProto(test_document2_), /*num_tokens=*/4));
EXPECT_THAT(doc_store->Get(document_id1),
IsOkAndHolds(EqualsProto(test_document1_)));
EXPECT_THAT(doc_store->Get(document_id2),
IsOkAndHolds(EqualsProto(test_document2_)));
+ // Checks derived score cache
+ EXPECT_THAT(
+ doc_store->GetDocumentAssociatedScoreData(document_id1),
+ IsOkAndHolds(DocumentAssociatedScoreData(
+ /*corpus_id=*/0, document1_score_, document1_creation_timestamp_,
+ /*length_in_tokens=*/4)));
+ EXPECT_THAT(
+ doc_store->GetDocumentAssociatedScoreData(document_id2),
+ IsOkAndHolds(DocumentAssociatedScoreData(
+ /*corpus_id=*/0, document2_score_, document2_creation_timestamp_,
+ /*length_in_tokens=*/4)));
+ EXPECT_THAT(doc_store->GetCorpusAssociatedScoreData(/*corpus_id=*/0),
+ IsOkAndHolds(CorpusAssociatedScoreData(
+ /*num_docs=*/2, /*sum_length_in_tokens=*/8)));
+ // Delete document 1
EXPECT_THAT(doc_store->Delete("icing", "email/1"), IsOk());
EXPECT_THAT(doc_store->Get(document_id1),
StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
@@ -1328,6 +1395,7 @@ TEST_F(DocumentStoreTest, ShouldRecoverFromCorruptDerivedFile) {
IsOk());
// Successfully recover from a corrupt derived file issue.
+ // NOTE: this doesn't trigger RegenerateDerivedFiles.
ICING_ASSERT_OK_AND_ASSIGN(
DocumentStore::CreateResult create_result,
DocumentStore::Create(&filesystem_, document_store_dir_, &fake_clock_,
@@ -1345,10 +1413,16 @@ TEST_F(DocumentStoreTest, ShouldRecoverFromCorruptDerivedFile) {
IsOkAndHolds(DocumentFilterData(
/*namespace_id=*/0,
/*schema_type_id=*/0, document2_expiration_timestamp_)));
- // Checks derived score cache
- EXPECT_THAT(doc_store->GetDocumentAssociatedScoreData(document_id2),
- IsOkAndHolds(DocumentAssociatedScoreData(
- document2_score_, document2_creation_timestamp_)));
+ // Checks derived score cache - note that they aren't regenerated from
+ // scratch.
+ EXPECT_THAT(
+ doc_store->GetDocumentAssociatedScoreData(document_id2),
+ IsOkAndHolds(DocumentAssociatedScoreData(
+ /*corpus_id=*/0, document2_score_, document2_creation_timestamp_,
+ /*length_in_tokens=*/4)));
+ EXPECT_THAT(doc_store->GetCorpusAssociatedScoreData(/*corpus_id=*/0),
+ IsOkAndHolds(CorpusAssociatedScoreData(
+ /*num_docs=*/2, /*sum_length_in_tokens=*/8)));
}
TEST_F(DocumentStoreTest, ShouldRecoverFromBadChecksum) {
@@ -1362,14 +1436,30 @@ TEST_F(DocumentStoreTest, ShouldRecoverFromBadChecksum) {
std::unique_ptr<DocumentStore> doc_store =
std::move(create_result.document_store);
- ICING_ASSERT_OK_AND_ASSIGN(document_id1,
- doc_store->Put(DocumentProto(test_document1_)));
- ICING_ASSERT_OK_AND_ASSIGN(document_id2,
- doc_store->Put(DocumentProto(test_document2_)));
+ ICING_ASSERT_OK_AND_ASSIGN(
+ document_id1,
+ doc_store->Put(DocumentProto(test_document1_), /*num_tokens=*/4));
+ ICING_ASSERT_OK_AND_ASSIGN(
+ document_id2,
+ doc_store->Put(DocumentProto(test_document2_), /*num_tokens=*/4));
EXPECT_THAT(doc_store->Get(document_id1),
IsOkAndHolds(EqualsProto(test_document1_)));
EXPECT_THAT(doc_store->Get(document_id2),
IsOkAndHolds(EqualsProto(test_document2_)));
+ // Checks derived score cache
+ EXPECT_THAT(
+ doc_store->GetDocumentAssociatedScoreData(document_id1),
+ IsOkAndHolds(DocumentAssociatedScoreData(
+ /*corpus_id=*/0, document1_score_, document1_creation_timestamp_,
+ /*length_in_tokens=*/4)));
+ EXPECT_THAT(
+ doc_store->GetDocumentAssociatedScoreData(document_id2),
+ IsOkAndHolds(DocumentAssociatedScoreData(
+ /*corpus_id=*/0, document2_score_, document2_creation_timestamp_,
+ /*length_in_tokens=*/4)));
+ EXPECT_THAT(doc_store->GetCorpusAssociatedScoreData(/*corpus_id=*/0),
+ IsOkAndHolds(CorpusAssociatedScoreData(
+ /*num_docs=*/2, /*sum_length_in_tokens=*/8)));
EXPECT_THAT(doc_store->Delete("icing", "email/1"), IsOk());
EXPECT_THAT(doc_store->Get(document_id1),
StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
@@ -1407,9 +1497,14 @@ TEST_F(DocumentStoreTest, ShouldRecoverFromBadChecksum) {
/*namespace_id=*/0,
/*schema_type_id=*/0, document2_expiration_timestamp_)));
// Checks derived score cache
- EXPECT_THAT(doc_store->GetDocumentAssociatedScoreData(document_id2),
- IsOkAndHolds(DocumentAssociatedScoreData(
- document2_score_, document2_creation_timestamp_)));
+ EXPECT_THAT(
+ doc_store->GetDocumentAssociatedScoreData(document_id2),
+ IsOkAndHolds(DocumentAssociatedScoreData(
+ /*corpus_id=*/0, document2_score_, document2_creation_timestamp_,
+ /*length_in_tokens=*/4)));
+ EXPECT_THAT(doc_store->GetCorpusAssociatedScoreData(/*corpus_id=*/0),
+ IsOkAndHolds(CorpusAssociatedScoreData(
+ /*num_docs=*/1, /*sum_length_in_tokens=*/4)));
}
TEST_F(DocumentStoreTest, GetDiskUsage) {
@@ -1544,28 +1639,6 @@ TEST_F(DocumentStoreTest, NonexistentNamespaceNotFound) {
StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
}
-TEST_F(DocumentStoreTest, GetCorpusIdReturnsNotFoundWhenFeatureIsDisabled) {
- setEnableBm25f(false);
- ICING_ASSERT_OK_AND_ASSIGN(
- DocumentStore::CreateResult create_result,
- DocumentStore::Create(&filesystem_, document_store_dir_, &fake_clock_,
- schema_store_.get()));
- std::unique_ptr<DocumentStore> doc_store =
- std::move(create_result.document_store);
-
- DocumentProto document1 =
- DocumentBuilder().SetKey("namespace", "1").SetSchema("email").Build();
- DocumentProto document2 =
- DocumentBuilder().SetKey("namespace", "2").SetSchema("email").Build();
-
- ICING_ASSERT_OK(doc_store->Put(document1));
- ICING_ASSERT_OK(doc_store->Put(document2));
-
- EXPECT_THAT(doc_store->GetCorpusId("namespace", "email"),
- StatusIs(libtextclassifier3::StatusCode::NOT_FOUND,
- HasSubstr("corpus_mapper disabled")));
-}
-
TEST_F(DocumentStoreTest, GetCorpusDuplicateCorpusId) {
ICING_ASSERT_OK_AND_ASSIGN(
DocumentStore::CreateResult create_result,
@@ -1582,7 +1655,7 @@ TEST_F(DocumentStoreTest, GetCorpusDuplicateCorpusId) {
ICING_ASSERT_OK(doc_store->Put(document1));
ICING_ASSERT_OK(doc_store->Put(document2));
- // NamespaceId of 0 since it was the first namespace seen by the DocumentStore
+ // CorpusId of 0 since it was the first corpus seen by the DocumentStore
EXPECT_THAT(doc_store->GetCorpusId("namespace", "email"),
IsOkAndHolds(Eq(0)));
}
@@ -1642,6 +1715,183 @@ TEST_F(DocumentStoreTest, NonexistentCorpusNotFound) {
StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
EXPECT_THAT(doc_store->GetCorpusId("namespace1", "nonexistent_schema"),
StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
+ EXPECT_THAT(doc_store->GetCorpusAssociatedScoreData(/*corpus_id=*/1),
+ StatusIs(libtextclassifier3::StatusCode::OUT_OF_RANGE));
+}
+
+TEST_F(DocumentStoreTest, GetCorpusAssociatedScoreDataSameCorpus) {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ DocumentStore::CreateResult create_result,
+ DocumentStore::Create(&filesystem_, document_store_dir_, &fake_clock_,
+ schema_store_.get()));
+ std::unique_ptr<DocumentStore> doc_store =
+ std::move(create_result.document_store);
+
+ DocumentProto document1 =
+ DocumentBuilder().SetKey("namespace", "1").SetSchema("email").Build();
+ DocumentProto document2 =
+ DocumentBuilder().SetKey("namespace", "2").SetSchema("email").Build();
+
+ ICING_ASSERT_OK(doc_store->Put(document1, /*num_tokens=*/5));
+ ICING_ASSERT_OK(doc_store->Put(document2, /*num_tokens=*/7));
+
+ // CorpusId of 0 since it was the first corpus seen by the DocumentStore
+ EXPECT_THAT(doc_store->GetCorpusAssociatedScoreData(/*corpus_id=*/0),
+ IsOkAndHolds(CorpusAssociatedScoreData(
+ /*num_docs=*/2, /*sum_length_in_tokens=*/12)));
+ // Only one corpus exists
+ EXPECT_THAT(doc_store->GetCorpusAssociatedScoreData(/*corpus_id=*/1),
+ StatusIs(libtextclassifier3::StatusCode::OUT_OF_RANGE));
+}
+
+TEST_F(DocumentStoreTest, GetCorpusAssociatedScoreData) {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ DocumentStore::CreateResult create_result,
+ DocumentStore::Create(&filesystem_, document_store_dir_, &fake_clock_,
+ schema_store_.get()));
+ std::unique_ptr<DocumentStore> doc_store =
+ std::move(create_result.document_store);
+
+ DocumentProto document_corpus1 =
+ DocumentBuilder().SetKey("namespace1", "1").SetSchema("email").Build();
+ DocumentProto document_corpus2 =
+ DocumentBuilder().SetKey("namespace2", "2").SetSchema("email").Build();
+
+ ICING_ASSERT_OK(
+ doc_store->Put(DocumentProto(document_corpus1), /*num_tokens=*/5));
+ ICING_ASSERT_OK(
+ doc_store->Put(DocumentProto(document_corpus2), /*num_tokens=*/7));
+
+ // CorpusId of 0 since it was the first corpus seen by the DocumentStore
+ EXPECT_THAT(doc_store->GetCorpusAssociatedScoreData(/*corpus_id=*/0),
+ IsOkAndHolds(CorpusAssociatedScoreData(
+ /*num_docs=*/1, /*sum_length_in_tokens=*/5)));
+
+ // CorpusId of 1 since it was the second corpus seen by the
+ // DocumentStore
+ EXPECT_THAT(doc_store->GetCorpusAssociatedScoreData(/*corpus_id=*/1),
+ IsOkAndHolds(CorpusAssociatedScoreData(
+ /*num_docs=*/1, /*sum_length_in_tokens=*/7)));
+
+ // DELETE namespace1 - document_corpus1 is deleted.
+ ICING_EXPECT_OK(doc_store->DeleteByNamespace("namespace1").status);
+
+ // Corpus score cache doesn't care if the document has been deleted
+ EXPECT_THAT(doc_store->GetCorpusAssociatedScoreData(/*corpus_id=*/0),
+ IsOkAndHolds(CorpusAssociatedScoreData(
+ /*num_docs=*/1, /*sum_length_in_tokens=*/5)));
+}
+
+TEST_F(DocumentStoreTest, NonexistentCorpusAssociatedScoreDataOutOfRange) {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ DocumentStore::CreateResult create_result,
+ DocumentStore::Create(&filesystem_, document_store_dir_, &fake_clock_,
+ schema_store_.get()));
+ std::unique_ptr<DocumentStore> doc_store =
+ std::move(create_result.document_store);
+
+ EXPECT_THAT(doc_store->GetCorpusAssociatedScoreData(/*corpus_id=*/0),
+ StatusIs(libtextclassifier3::StatusCode::OUT_OF_RANGE));
+}
+
+TEST_F(DocumentStoreTest, GetDocumentAssociatedScoreDataSameCorpus) {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ DocumentStore::CreateResult create_result,
+ DocumentStore::Create(&filesystem_, document_store_dir_, &fake_clock_,
+ schema_store_.get()));
+ std::unique_ptr<DocumentStore> doc_store =
+ std::move(create_result.document_store);
+
+ DocumentProto document1 =
+ DocumentBuilder()
+ .SetKey("namespace", "1")
+ .SetSchema("email")
+ .SetScore(document1_score_)
+ .SetCreationTimestampMs(
+ document1_creation_timestamp_) // A random timestamp
+ .Build();
+ DocumentProto document2 =
+ DocumentBuilder()
+ .SetKey("namespace", "2")
+ .SetSchema("email")
+ .SetScore(document2_score_)
+ .SetCreationTimestampMs(
+ document2_creation_timestamp_) // A random timestamp
+ .Build();
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ DocumentId document_id1,
+ doc_store->Put(DocumentProto(document1), /*num_tokens=*/5));
+ ICING_ASSERT_OK_AND_ASSIGN(
+ DocumentId document_id2,
+ doc_store->Put(DocumentProto(document2), /*num_tokens=*/7));
+
+ EXPECT_THAT(
+ doc_store->GetDocumentAssociatedScoreData(document_id1),
+ IsOkAndHolds(DocumentAssociatedScoreData(
+ /*corpus_id=*/0, document1_score_, document1_creation_timestamp_,
+ /*length_in_tokens=*/5)));
+ EXPECT_THAT(
+ doc_store->GetDocumentAssociatedScoreData(document_id2),
+ IsOkAndHolds(DocumentAssociatedScoreData(
+ /*corpus_id=*/0, document2_score_, document2_creation_timestamp_,
+ /*length_in_tokens=*/7)));
+}
+
+TEST_F(DocumentStoreTest, GetCorpusAssociatedScoreDataDifferentCorpus) {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ DocumentStore::CreateResult create_result,
+ DocumentStore::Create(&filesystem_, document_store_dir_, &fake_clock_,
+ schema_store_.get()));
+ std::unique_ptr<DocumentStore> doc_store =
+ std::move(create_result.document_store);
+
+ DocumentProto document1 =
+ DocumentBuilder()
+ .SetKey("namespace1", "1")
+ .SetSchema("email")
+ .SetScore(document1_score_)
+ .SetCreationTimestampMs(
+ document1_creation_timestamp_) // A random timestamp
+ .Build();
+ DocumentProto document2 =
+ DocumentBuilder()
+ .SetKey("namespace2", "2")
+ .SetSchema("email")
+ .SetScore(document2_score_)
+ .SetCreationTimestampMs(
+ document2_creation_timestamp_) // A random timestamp
+ .Build();
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ DocumentId document_id1,
+ doc_store->Put(DocumentProto(document1), /*num_tokens=*/5));
+ ICING_ASSERT_OK_AND_ASSIGN(
+ DocumentId document_id2,
+ doc_store->Put(DocumentProto(document2), /*num_tokens=*/7));
+
+ EXPECT_THAT(
+ doc_store->GetDocumentAssociatedScoreData(document_id1),
+ IsOkAndHolds(DocumentAssociatedScoreData(
+ /*corpus_id=*/0, document1_score_, document1_creation_timestamp_,
+ /*length_in_tokens=*/5)));
+ EXPECT_THAT(
+ doc_store->GetDocumentAssociatedScoreData(document_id2),
+ IsOkAndHolds(DocumentAssociatedScoreData(
+ /*corpus_id=*/1, document2_score_, document2_creation_timestamp_,
+ /*length_in_tokens=*/7)));
+}
+
+TEST_F(DocumentStoreTest, NonexistentDocumentAssociatedScoreDataOutOfRange) {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ DocumentStore::CreateResult create_result,
+ DocumentStore::Create(&filesystem_, document_store_dir_, &fake_clock_,
+ schema_store_.get()));
+ std::unique_ptr<DocumentStore> doc_store =
+ std::move(create_result.document_store);
+
+ EXPECT_THAT(doc_store->GetDocumentAssociatedScoreData(/*document_id=*/0),
+ StatusIs(libtextclassifier3::StatusCode::OUT_OF_RANGE));
}
TEST_F(DocumentStoreTest, SoftDeletionDoesNotClearFilterCache) {
@@ -1700,12 +1950,13 @@ TEST_F(DocumentStoreTest, SoftDeletionDoesNotClearScoreCache) {
std::move(create_result.document_store);
ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id,
- doc_store->Put(test_document1_));
+ doc_store->Put(test_document1_, /*num_tokens=*/4));
EXPECT_THAT(doc_store->GetDocumentAssociatedScoreData(document_id),
IsOkAndHolds(DocumentAssociatedScoreData(
- /*document_score=*/document1_score_,
- /*creation_timestamp_ms=*/document1_creation_timestamp_)));
+ /*corpus_id=*/0, /*document_score=*/document1_score_,
+ /*creation_timestamp_ms=*/document1_creation_timestamp_,
+ /*length_in_tokens=*/4)));
ICING_ASSERT_OK(doc_store->Delete("icing", "email/1", /*soft_delete=*/true));
// Associated entry of the deleted document is removed.
@@ -1722,12 +1973,14 @@ TEST_F(DocumentStoreTest, HardDeleteClearsScoreCache) {
std::move(create_result.document_store);
ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id,
- doc_store->Put(test_document1_));
+ doc_store->Put(test_document1_, /*num_tokens=*/4));
EXPECT_THAT(doc_store->GetDocumentAssociatedScoreData(document_id),
IsOkAndHolds(DocumentAssociatedScoreData(
+ /*corpus_id=*/0,
/*document_score=*/document1_score_,
- /*creation_timestamp_ms=*/document1_creation_timestamp_)));
+ /*creation_timestamp_ms=*/document1_creation_timestamp_,
+ /*length_in_tokens=*/4)));
ICING_ASSERT_OK(doc_store->Delete("icing", "email/1", /*soft_delete=*/false));
// Associated entry of the deleted document is removed.
@@ -1931,11 +2184,15 @@ TEST_F(DocumentStoreTest, ShouldWriteAndReadScoresCorrectly) {
EXPECT_THAT(doc_store->GetDocumentAssociatedScoreData(document_id1),
IsOkAndHolds(DocumentAssociatedScoreData(
- /*document_score=*/0, /*creation_timestamp_ms=*/0)));
+ /*corpus_id=*/0,
+ /*document_score=*/0, /*creation_timestamp_ms=*/0,
+ /*length_in_tokens=*/0)));
EXPECT_THAT(doc_store->GetDocumentAssociatedScoreData(document_id2),
IsOkAndHolds(DocumentAssociatedScoreData(
- /*document_score=*/5, /*creation_timestamp_ms=*/0)));
+ /*corpus_id=*/0,
+ /*document_score=*/5, /*creation_timestamp_ms=*/0,
+ /*length_in_tokens=*/0)));
}
TEST_F(DocumentStoreTest, ComputeChecksumSameBetweenCalls) {
@@ -2636,7 +2893,8 @@ TEST_F(DocumentStoreTest, GetOptimizeInfo) {
std::string optimized_dir = document_store_dir_ + "_optimize";
EXPECT_TRUE(filesystem_.DeleteDirectoryRecursively(optimized_dir.c_str()));
EXPECT_TRUE(filesystem_.CreateDirectoryRecursively(optimized_dir.c_str()));
- ICING_ASSERT_OK(document_store->OptimizeInto(optimized_dir));
+ ICING_ASSERT_OK(
+ document_store->OptimizeInto(optimized_dir, lang_segmenter_.get()));
document_store.reset();
ICING_ASSERT_OK_AND_ASSIGN(
create_result, DocumentStore::Create(&filesystem_, optimized_dir,
@@ -3046,7 +3304,8 @@ TEST_F(DocumentStoreTest, UsageScoresShouldPersistOnOptimize) {
// Run optimize
std::string optimized_dir = document_store_dir_ + "/optimize_test";
filesystem_.CreateDirectoryRecursively(optimized_dir.c_str());
- ICING_ASSERT_OK(document_store->OptimizeInto(optimized_dir));
+ ICING_ASSERT_OK(
+ document_store->OptimizeInto(optimized_dir, lang_segmenter_.get()));
// Get optimized document store
ICING_ASSERT_OK_AND_ASSIGN(
@@ -3149,9 +3408,9 @@ TEST_F(DocumentStoreTest, LoadScoreCacheAndInitializeSuccessfully) {
// the current code is compatible with the format of the v0 scoring_cache,
// then an empty document store should be initialized, but the non-empty
// scoring_cache should be retained.
- // Since the current document-asscoiated-score-data is compatible with the
- // score_cache in testdata/v0/document_store, the document store should be
- // initialized without having to re-generate the derived files.
+ // The current document-associated-score-data has a new field with respect to
+ // the ones stored in testdata/v0, hence the document store's initialization
+ // requires regenerating its derived files.
// Create dst directory
ASSERT_THAT(filesystem_.CreateDirectory(document_store_dir_.c_str()), true);
@@ -3186,9 +3445,10 @@ TEST_F(DocumentStoreTest, LoadScoreCacheAndInitializeSuccessfully) {
schema_store_.get(), &initializeStats));
std::unique_ptr<DocumentStore> doc_store =
std::move(create_result.document_store);
- // Regeneration never happens.
- EXPECT_EQ(initializeStats.document_store_recovery_cause(),
- NativeInitializeStats::NONE);
+ // The score_cache triggers regeneration because its element size is
+ // inconsistent: expected 20 (current new size), actual 12 (as per the v0
+ // score_cache).
+ EXPECT_TRUE(initializeStats.has_document_store_recovery_cause());
}
} // namespace
diff --git a/icing/store/enable-bm25f.h b/icing/store/enable-bm25f.h
deleted file mode 100644
index cee94d1..0000000
--- a/icing/store/enable-bm25f.h
+++ /dev/null
@@ -1,31 +0,0 @@
-// Copyright (C) 2020 Google LLC
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#ifndef ICING_STORE_ENABLE_BM25F_H_
-#define ICING_STORE_ENABLE_BM25F_H_
-
-namespace icing {
-namespace lib {
-
-inline bool enable_bm25f_ = false;
-
-inline bool enableBm25f() { return enable_bm25f_; }
-
-// Setter for testing purposes. It should never be called in production code.
-inline void setEnableBm25f(bool enable_bm25f) { enable_bm25f_ = enable_bm25f; }
-
-} // namespace lib
-} // namespace icing
-
-#endif // ICING_STORE_ENABLE_BM25F_H_
diff --git a/icing/store/usage-store.cc b/icing/store/usage-store.cc
index 7a0af9c..54896dc 100644
--- a/icing/store/usage-store.cc
+++ b/icing/store/usage-store.cc
@@ -214,6 +214,10 @@ libtextclassifier3::StatusOr<Crc32> UsageStore::ComputeChecksum() {
return usage_score_cache_->ComputeChecksum();
}
+libtextclassifier3::StatusOr<int64_t> UsageStore::GetElementsFileSize() const {
+ return usage_score_cache_->GetElementsFileSize();
+}
+
libtextclassifier3::Status UsageStore::TruncateTo(DocumentId num_documents) {
if (num_documents >= usage_score_cache_->num_elements()) {
// No need to truncate
diff --git a/icing/store/usage-store.h b/icing/store/usage-store.h
index 0a622a0..b7de970 100644
--- a/icing/store/usage-store.h
+++ b/icing/store/usage-store.h
@@ -148,6 +148,15 @@ class UsageStore {
// INTERNAL_ERROR if the internal state is inconsistent
libtextclassifier3::StatusOr<Crc32> ComputeChecksum();
+ // Returns the file size of all the elements held in the UsageStore. File
+ // size is in bytes. This excludes the size of any internal metadata, e.g. any
+ // internal headers.
+ //
+ // Returns:
+ // File size on success
+ // INTERNAL_ERROR on IO error
+ libtextclassifier3::StatusOr<int64_t> GetElementsFileSize() const;
+
// Resizes the storage so that only the usage scores of and before
// last_document_id are stored.
//
diff --git a/icing/store/usage-store_test.cc b/icing/store/usage-store_test.cc
index f7fa778..220c226 100644
--- a/icing/store/usage-store_test.cc
+++ b/icing/store/usage-store_test.cc
@@ -24,6 +24,7 @@ namespace lib {
namespace {
using ::testing::Eq;
+using ::testing::Gt;
using ::testing::Not;
class UsageStoreTest : public testing::Test {
@@ -560,6 +561,22 @@ TEST_F(UsageStoreTest, StoreShouldBeResetOnHeaderChecksumMismatch) {
IsOkAndHolds(UsageStore::UsageScores()));
}
+TEST_F(UsageStoreTest, GetElementsFileSize) {
+ ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<UsageStore> usage_store,
+ UsageStore::Create(&filesystem_, test_dir_));
+
+ ICING_ASSERT_OK_AND_ASSIGN(int64_t empty_file_size,
+ usage_store->GetElementsFileSize());
+ EXPECT_THAT(empty_file_size, Eq(0));
+
+ UsageReport usage_report = CreateUsageReport(
+ "namespace", "uri", /*timestamp_ms=*/1000, UsageReport::USAGE_TYPE1);
+ usage_store->AddUsageReport(usage_report, /*document_id=*/1);
+
+ EXPECT_THAT(usage_store->GetElementsFileSize(),
+ IsOkAndHolds(Gt(empty_file_size)));
+}
+
} // namespace
} // namespace lib