-rw-r--r-- build.gradle | 25
-rw-r--r-- icing/absl_ports/annotate.cc | 2
-rw-r--r-- icing/file/file-backed-proto-log.h | 98
-rw-r--r-- icing/file/file-backed-proto-log_test.cc | 185
-rw-r--r-- icing/file/file-backed-vector.h | 2
-rw-r--r-- icing/icing-search-engine.cc | 138
-rw-r--r-- icing/icing-search-engine.h | 18
-rw-r--r-- icing/icing-search-engine_test.cc | 560
-rw-r--r-- icing/index/index-processor_benchmark.cc | 13
-rw-r--r-- icing/index/index-processor_test.cc | 7
-rw-r--r-- icing/index/index.cc | 16
-rw-r--r-- icing/index/index_test.cc | 7
-rw-r--r-- icing/index/iterator/doc-hit-info-iterator-filter.cc | 10
-rw-r--r-- icing/index/iterator/doc-hit-info-iterator-filter.h | 4
-rw-r--r-- icing/index/iterator/doc-hit-info-iterator-filter_test.cc | 27
-rw-r--r-- icing/index/lite/doc-hit-info-iterator-term-lite.cc (renamed from icing/index/iterator/doc-hit-info-iterator-term.cc) | 17
-rw-r--r-- icing/index/lite/doc-hit-info-iterator-term-lite.h (renamed from icing/index/iterator/doc-hit-info-iterator-term.h) | 43
-rw-r--r-- icing/index/lite/lite-index.cc | 50
-rw-r--r-- icing/index/lite/lite-index.h | 126
-rw-r--r-- icing/index/lite/term-id-hit-pair.h | 80
-rw-r--r-- icing/index/main/doc-hit-info-iterator-term-main.cc | 166
-rw-r--r-- icing/index/main/doc-hit-info-iterator-term-main.h | 114
-rw-r--r-- icing/index/main/flash-index-storage-header.h | 122
-rw-r--r-- icing/index/main/flash-index-storage.cc | 511
-rw-r--r-- icing/index/main/flash-index-storage.h | 275
-rw-r--r-- icing/index/main/flash-index-storage_test.cc | 540
-rw-r--r-- icing/index/main/index-block.cc | 7
-rw-r--r-- icing/index/main/index-block.h | 9
-rw-r--r-- icing/index/main/main-index-merger.cc | 225
-rw-r--r-- icing/index/main/main-index-merger.h | 49
-rw-r--r-- icing/index/main/main-index-merger_test.cc | 367
-rw-r--r-- icing/index/main/main-index.cc | 497
-rw-r--r-- icing/index/main/main-index.h | 235
-rw-r--r-- icing/index/main/main-index_test.cc | 536
-rw-r--r-- icing/index/main/posting-list-accessor.cc | 194
-rw-r--r-- icing/index/main/posting-list-accessor.h | 168
-rw-r--r-- icing/index/main/posting-list-accessor_test.cc | 384
-rw-r--r-- icing/index/main/posting-list-identifier.cc | 25
-rw-r--r-- icing/index/main/posting-list-identifier.h | 116
-rw-r--r-- icing/jni/icing-search-engine-jni.cc | 18
-rw-r--r-- icing/legacy/core/icing-string-util.cc | 8
-rw-r--r-- icing/legacy/core/icing-string-util.h | 4
-rw-r--r-- icing/legacy/index/icing-dynamic-trie.cc | 270
-rw-r--r-- icing/legacy/index/icing-dynamic-trie.h | 28
-rw-r--r-- icing/legacy/index/icing-dynamic-trie_test.cc | 216
-rw-r--r-- icing/legacy/index/icing-mock-filesystem.h | 79
-rw-r--r-- icing/query/query-processor_benchmark.cc | 13
-rw-r--r-- icing/result/result-retriever_test.cc | 7
-rw-r--r-- icing/result/snippet-retriever_test.cc | 7
-rw-r--r-- icing/store/document-filter-data.h | 1
-rw-r--r-- icing/store/document-store.cc | 301
-rw-r--r-- icing/store/document-store.h | 103
-rw-r--r-- icing/store/document-store_test.cc | 338
-rw-r--r-- icing/store/key-mapper.h | 8
-rw-r--r-- icing/store/namespace-id.h | 1
-rw-r--r-- icing/store/usage-store.cc | 193
-rw-r--r-- icing/store/usage-store.h | 160
-rw-r--r-- icing/store/usage-store_test.cc | 389
-rw-r--r-- icing/tokenization/icu/icu-language-segmenter-factory.cc | 1
-rw-r--r-- icing/tokenization/icu/icu-language-segmenter_test.cc | 78
-rw-r--r-- icing/tokenization/language-segmenter-factory.h | 11
-rw-r--r-- icing/tokenization/language-segmenter-iterator_test.cc | 54
-rw-r--r-- icing/tokenization/language-segmenter_benchmark.cc | 10
-rw-r--r-- icing/tokenization/plain-tokenizer_test.cc | 55
-rw-r--r-- icing/tokenization/raw-query-tokenizer_test.cc | 49
-rw-r--r-- icing/tokenization/reverse_jni/reverse-jni-language-segmenter-factory.cc | 2
-rw-r--r-- icing/tokenization/reverse_jni/reverse-jni-language-segmenter-test.cc | 83
-rw-r--r-- icing/tokenization/reverse_jni/reverse-jni-language-segmenter.cc | 173
-rw-r--r-- icing/tokenization/simple/space-language-segmenter_test.cc | 43
-rw-r--r-- icing/util/character-iterator.cc | 127
-rw-r--r-- icing/util/character-iterator.h | 70
-rw-r--r-- icing/util/i18n-utils.cc | 17
-rw-r--r-- icing/util/i18n-utils.h | 7
-rw-r--r-- java/src/com/google/android/icing/IcingSearchEngine.java | 23
-rw-r--r-- java/tests/instrumentation/src/com/google/android/icing/IcingSearchEngineTest.java | 52
-rw-r--r-- proto/icing/proto/status.proto | 8
-rw-r--r-- proto/icing/proto/usage.proto | 53
77 files changed, 8363 insertions, 665 deletions
diff --git a/build.gradle b/build.gradle
index 3901078..6d13dc2 100644
--- a/build.gradle
+++ b/build.gradle
@@ -21,7 +21,7 @@ import static androidx.build.dependencies.DependenciesKt.*
buildscript {
dependencies {
- classpath('gradle.plugin.com.google.protobuf:protobuf-gradle-plugin:0.8.8')
+ classpath('gradle.plugin.com.google.protobuf:protobuf-gradle-plugin:0.8.13')
classpath('org.anarres.jarjar:jarjar-gradle:1.0.1')
}
}
@@ -88,7 +88,11 @@ android.libraryVariants.all { variant ->
def suffix = variantName.capitalize()
def jarjarTask = tasks.create("jarjar${suffix}", JarjarTask) {
destinationName "icing-java-${variantName}-jarjar.jar"
- from 'com.google.protobuf:protobuf-javalite:3.10.0'
+
+
+ dependsOn protoLiteJarWithoutProtoFiles
+ from files(protoLiteJarWithoutProtoFiles.archiveFile.get().getAsFile())
+
from files(variant.javaCompileProvider.get().destinationDir)
dependsOn variant.javaCompileProvider.get()
classRename 'com.google.protobuf.**', 'com.google.android.icing.protobuf.@1'
@@ -101,3 +105,20 @@ android.libraryVariants.all { variant ->
builtBy jarjarTask
}
}
+
+// The proto-lite dependency includes .proto files, which are not used by icing. When apps depend on
+// appsearch as well as proto-lite directly, these files conflict since jarjar only renames the java
+// classes. Remove them here since they are unused.
+tasks.register("protoLiteJarWithoutProtoFiles", Jar){
+ // Get proto lite dependency as a jar file:
+ def jarFile = configurations.detachedConfiguration(
+ dependencies.create('com.google.protobuf:protobuf-javalite:3.10.0')).getSingleFile()
+
+ // Expand the jar and remove any .proto files.
+ from(zipTree(jarFile)) {
+ exclude("**/*.proto")
+ }
+
+ into 'icing-proto-lite-dep-stripped'
+}
+
diff --git a/icing/absl_ports/annotate.cc b/icing/absl_ports/annotate.cc
index d283e13..dfe5566 100644
--- a/icing/absl_ports/annotate.cc
+++ b/icing/absl_ports/annotate.cc
@@ -33,7 +33,7 @@ libtextclassifier3::Status Annotate(const libtextclassifier3::Status& s,
std::string new_msg =
(!s.error_message().empty())
- ? absl_ports::StrCat(s.error_message(), kErrorSeparator, msg)
+ ? absl_ports::StrCat(msg, kErrorSeparator, s.error_message())
: std::string(msg);
return libtextclassifier3::Status(s.CanonicalCode(), new_msg);
}
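
A minimal sketch of what this reordering means for nested annotations, using a plain std::string stand-in for the libtextclassifier3::Status plumbing (kErrorSeparator's exact value is an assumption):

```cpp
#include <iostream>
#include <string>

// Hypothetical stand-in for the separator used in annotate.cc.
constexpr char kErrorSeparator[] = ": ";

// Mirrors the new ordering: the fresh annotation is prepended, so the
// outermost (most recent) context reads first.
std::string Annotate(const std::string& existing_msg, const std::string& msg) {
  return existing_msg.empty() ? msg : msg + kErrorSeparator + existing_msg;
}

int main() {
  std::string status_msg = Annotate("", "open failed");
  status_msg = Annotate(status_msg, "Failed to create document store");
  // Before this change: "open failed: Failed to create document store"
  // After this change:  "Failed to create document store: open failed"
  std::cout << status_msg << "\n";
}
```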
diff --git a/icing/file/file-backed-proto-log.h b/icing/file/file-backed-proto-log.h
index 62943b8..95511ac 100644
--- a/icing/file/file-backed-proto-log.h
+++ b/icing/file/file-backed-proto-log.h
@@ -78,6 +78,23 @@
namespace icing {
namespace lib {
+namespace {
+
+bool IsEmptyBuffer(const char* buffer, int size) {
+ return std::all_of(buffer, buffer + size,
+ [](const char byte) { return byte == 0; });
+}
+
+// Helper function to get stored proto size from the metadata.
+// Metadata format: 8 bits magic + 24 bits size
+int GetProtoSize(int metadata) { return metadata & 0x00FFFFFF; }
+
+// Helper function to get stored proto magic from the metadata.
+// Metadata format: 8 bits magic + 24 bits size
+uint8_t GetProtoMagic(int metadata) { return metadata >> 24; }
+
+} // namespace
+
template <typename ProtoT>
class FileBackedProtoLog {
public:
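
As a quick illustration of the metadata layout these helpers decode (top 8 bits magic, low 24 bits proto size), here is a self-contained sketch; the magic value below is an illustrative placeholder, not the real kProtoMagic:

```cpp
// Worked example of the 8-bit-magic + 24-bit-size metadata word.
#include <cassert>
#include <cstdint>

constexpr uint8_t kProtoMagic = 0x5C;  // placeholder value

int PackMetadata(int proto_size) {
  return (static_cast<int>(kProtoMagic) << 24) | (proto_size & 0x00FFFFFF);
}

int GetProtoSize(int metadata) { return metadata & 0x00FFFFFF; }

uint8_t GetProtoMagic(int metadata) {
  return static_cast<uint8_t>(static_cast<uint32_t>(metadata) >> 24);
}

int main() {
  int metadata = PackMetadata(/*proto_size=*/1234);
  assert(GetProtoSize(metadata) == 1234);
  assert(GetProtoMagic(metadata) == kProtoMagic);
}
```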
@@ -206,10 +223,19 @@ class FileBackedProtoLog {
//
// Returns:
// A proto on success
+ // NOT_FOUND if the proto at the given offset has been erased
// OUT_OF_RANGE_ERROR if file_offset exceeds file size
// INTERNAL_ERROR on IO error
libtextclassifier3::StatusOr<ProtoT> ReadProto(int64_t file_offset) const;
+ // Erases the data of a proto located at file_offset from the file.
+ //
+ // Returns:
+ // OK on success
+ // OUT_OF_RANGE_ERROR if file_offset exceeds file size
+ // INTERNAL_ERROR on IO error
+ libtextclassifier3::Status EraseProto(int64_t file_offset);
+
// Calculates and returns the disk usage in bytes. Rounds up to the nearest
// block size.
//
@@ -239,7 +265,7 @@ class FileBackedProtoLog {
Iterator(const Filesystem& filesystem, const std::string& file_path,
int64_t initial_offset);
- // Advances to the position of next proto.
+  // Advances to the position of the next proto, whether it has been erased or not.
//
// Returns:
// OK on success
@@ -716,10 +742,15 @@ libtextclassifier3::StatusOr<ProtoT> FileBackedProtoLog<ProtoT>::ReadProto(
int metadata, ReadProtoMetadata(&mmapped_file, file_offset, file_size));
// Copy out however many bytes it says the proto is
- int stored_size = metadata & 0x00FFFFFF;
+ int stored_size = GetProtoSize(metadata);
ICING_RETURN_IF_ERROR(
mmapped_file.Remap(file_offset + sizeof(metadata), stored_size));
+
+ if (IsEmptyBuffer(mmapped_file.region(), mmapped_file.region_size())) {
+ return absl_ports::NotFoundError("The proto data has been erased.");
+ }
+
google::protobuf::io::ArrayInputStream proto_stream(
mmapped_file.mutable_region(), stored_size);
@@ -736,6 +767,62 @@ libtextclassifier3::StatusOr<ProtoT> FileBackedProtoLog<ProtoT>::ReadProto(
}
template <typename ProtoT>
+libtextclassifier3::Status FileBackedProtoLog<ProtoT>::EraseProto(
+ int64_t file_offset) {
+ int64_t file_size = filesystem_->GetFileSize(fd_.get());
+ if (file_offset >= file_size) {
+    // file_size is the offset of the next byte to be written, so subtract
+    // one to get the inclusive, actual size of the file.
+ return absl_ports::OutOfRangeError(IcingStringUtil::StringPrintf(
+ "Trying to erase data at a location, %lld, "
+ "out of range of the file size, %lld",
+ static_cast<long long>(file_offset),
+ static_cast<long long>(file_size - 1)));
+ }
+
+ MemoryMappedFile mmapped_file(
+ *filesystem_, file_path_,
+ MemoryMappedFile::Strategy::READ_WRITE_AUTO_SYNC);
+
+ // Read out the metadata
+ ICING_ASSIGN_OR_RETURN(
+ int metadata, ReadProtoMetadata(&mmapped_file, file_offset, file_size));
+
+ ICING_RETURN_IF_ERROR(mmapped_file.Remap(file_offset + sizeof(metadata),
+ GetProtoSize(metadata)));
+
+ // We need to update the crc checksum if the erased area is before the rewind
+ // position.
+ if (file_offset + sizeof(metadata) < header_->rewind_offset) {
+ // We need to calculate [original string xor 0s].
+ // The xored string is the same as the original string because 0 xor 0 = 0,
+ // 1 xor 0 = 1.
+ const std::string_view xored_str(mmapped_file.region(),
+ mmapped_file.region_size());
+
+ Crc32 crc(header_->log_checksum);
+ ICING_ASSIGN_OR_RETURN(
+ uint32_t new_crc,
+ crc.UpdateWithXor(
+ xored_str,
+ /*full_data_size=*/header_->rewind_offset - sizeof(Header),
+ /*position=*/file_offset + sizeof(metadata) - sizeof(Header)));
+
+ header_->log_checksum = new_crc;
+ header_->header_checksum = header_->CalculateHeaderChecksum();
+
+ if (!filesystem_->PWrite(fd_.get(), /*offset=*/0, header_.get(),
+ sizeof(Header))) {
+ return absl_ports::InternalError(
+ absl_ports::StrCat("Failed to update header to: ", file_path_));
+ }
+ }
+
+ memset(mmapped_file.mutable_region(), '\0', mmapped_file.region_size());
+ return libtextclassifier3::Status::OK;
+}
+
+template <typename ProtoT>
libtextclassifier3::StatusOr<int64_t> FileBackedProtoLog<ProtoT>::GetDiskUsage()
const {
int64_t size = filesystem_->GetDiskUsage(file_path_.c_str());
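
The checksum patch in EraseProto() works because XORing the newly zeroed bytes against their old values reproduces the old values, which is what Crc32::UpdateWithXor folds into the running log checksum without rescanning the file. A toy sketch with a simple XOR-fold checksum standing in for CRC32 (which additionally needs the position and full-data-size bookkeeping passed above) makes the identity easy to verify:

```cpp
// Simplified stand-in for Crc32::UpdateWithXor, not the real implementation.
#include <cassert>
#include <cstdint>
#include <vector>

uint8_t XorChecksum(const std::vector<uint8_t>& data) {
  uint8_t sum = 0;
  for (uint8_t b : data) sum ^= b;
  return sum;
}

int main() {
  std::vector<uint8_t> log = {0x11, 0x22, 0x33, 0x44};
  uint8_t checksum = XorChecksum(log);

  // Erase bytes [1, 3) by zeroing them. Because old_byte ^ 0 == old_byte,
  // the "xored string" fed to the update is just the original bytes,
  // exactly as the comment in EraseProto() notes.
  uint8_t patch = log[1] ^ log[2];
  log[1] = 0;
  log[2] = 0;

  checksum ^= patch;  // patch the running checksum in place
  assert(checksum == XorChecksum(log));
}
```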
@@ -781,8 +868,7 @@ libtextclassifier3::Status FileBackedProtoLog<ProtoT>::Iterator::Advance() {
ICING_ASSIGN_OR_RETURN(
int metadata,
ReadProtoMetadata(&mmapped_file_, current_offset_, file_size_));
- int proto_size = metadata & 0x00FFFFFF;
- current_offset_ += sizeof(metadata) + proto_size;
+ current_offset_ += sizeof(metadata) + GetProtoSize(metadata);
}
if (current_offset_ < file_size_) {
@@ -829,7 +915,7 @@ libtextclassifier3::StatusOr<int> FileBackedProtoLog<ProtoT>::ReadProtoMetadata(
ICING_RETURN_IF_ERROR(mmapped_file->Remap(file_offset, metadata_size));
memcpy(&metadata, mmapped_file->region(), metadata_size);
// Checks magic number
- uint8_t stored_k_proto_magic = metadata >> 24;
+ uint8_t stored_k_proto_magic = GetProtoMagic(metadata);
if (stored_k_proto_magic != kProtoMagic) {
return absl_ports::InternalError(IcingStringUtil::StringPrintf(
"Failed to read kProtoMagic, expected %d, actual %d", kProtoMagic,
@@ -842,7 +928,7 @@ template <typename ProtoT>
libtextclassifier3::Status FileBackedProtoLog<ProtoT>::PersistToDisk() {
int64_t file_size = filesystem_->GetFileSize(file_path_.c_str());
if (file_size == header_->rewind_offset) {
- // No changes made, don't need to update the checksum.
+ // No new protos appended, don't need to update the checksum.
return libtextclassifier3::Status::OK;
}
diff --git a/icing/file/file-backed-proto-log_test.cc b/icing/file/file-backed-proto-log_test.cc
index 3a9060d..fad5248 100644
--- a/icing/file/file-backed-proto-log_test.cc
+++ b/icing/file/file-backed-proto-log_test.cc
@@ -48,7 +48,10 @@ class FileBackedProtoLogTest : public ::testing::Test {
// https://stackoverflow.com/a/47368753
FileBackedProtoLogTest() {}
- void SetUp() override { file_path_ = GetTestTempDir() + "/proto_log"; }
+ void SetUp() override {
+ file_path_ = GetTestTempDir() + "/proto_log";
+ filesystem_.DeleteFile(file_path_.c_str());
+ }
void TearDown() override { filesystem_.DeleteFile(file_path_.c_str()); }
@@ -93,7 +96,7 @@ TEST_F(FileBackedProtoLogTest, WriteProtoTooLarge) {
FileBackedProtoLog<DocumentProto>::Options(compress_,
max_proto_size)));
auto proto_log = std::move(create_result.proto_log);
- EXPECT_FALSE(create_result.data_loss);
+ ASSERT_FALSE(create_result.data_loss);
DocumentProto document = DocumentBuilder().SetKey("namespace", "uri").Build();
@@ -110,7 +113,7 @@ TEST_F(FileBackedProtoLogTest, ReadProtoWrongKProtoMagic) {
FileBackedProtoLog<DocumentProto>::Options(compress_,
max_proto_size_)));
auto proto_log = std::move(create_result.proto_log);
- EXPECT_FALSE(create_result.data_loss);
+ ASSERT_FALSE(create_result.data_loss);
// Write a proto
DocumentProto document = DocumentBuilder().SetKey("namespace", "uri").Build();
@@ -144,7 +147,7 @@ TEST_F(FileBackedProtoLogTest, ReadWriteUncompressedProto) {
FileBackedProtoLog<DocumentProto>::Options(
/*compress_in=*/false, max_proto_size_)));
auto proto_log = std::move(create_result.proto_log);
- EXPECT_FALSE(create_result.data_loss);
+ ASSERT_FALSE(create_result.data_loss);
// Write the first proto
DocumentProto document1 =
@@ -191,7 +194,7 @@ TEST_F(FileBackedProtoLogTest, ReadWriteUncompressedProto) {
FileBackedProtoLog<DocumentProto>::Options(
/*compress_in=*/false, max_proto_size_)));
auto recreated_proto_log = std::move(create_result.proto_log);
- EXPECT_FALSE(create_result.data_loss);
+ ASSERT_FALSE(create_result.data_loss);
// Write a third proto
DocumentProto document3 =
@@ -213,7 +216,7 @@ TEST_F(FileBackedProtoLogTest, ReadWriteCompressedProto) {
FileBackedProtoLog<DocumentProto>::Options(
/*compress_in=*/true, max_proto_size_)));
auto proto_log = std::move(create_result.proto_log);
- EXPECT_FALSE(create_result.data_loss);
+ ASSERT_FALSE(create_result.data_loss);
// Write the first proto
DocumentProto document1 =
@@ -260,7 +263,7 @@ TEST_F(FileBackedProtoLogTest, ReadWriteCompressedProto) {
FileBackedProtoLog<DocumentProto>::Options(
/*compress_in=*/true, max_proto_size_)));
auto recreated_proto_log = std::move(create_result.proto_log);
- EXPECT_FALSE(create_result.data_loss);
+ ASSERT_FALSE(create_result.data_loss);
// Write a third proto
DocumentProto document3 =
@@ -360,7 +363,7 @@ TEST_F(FileBackedProtoLogTest, PersistToDisk) {
FileBackedProtoLog<DocumentProto>::Options(compress_,
max_proto_size_)));
auto proto_log = std::move(create_result.proto_log);
- EXPECT_FALSE(create_result.data_loss);
+ ASSERT_FALSE(create_result.data_loss);
// Write and persist the first proto
ICING_ASSERT_OK_AND_ASSIGN(document1_offset,
@@ -430,7 +433,7 @@ TEST_F(FileBackedProtoLogTest, Iterator) {
FileBackedProtoLog<DocumentProto>::Options(compress_,
max_proto_size_)));
auto proto_log = std::move(create_result.proto_log);
- EXPECT_FALSE(create_result.data_loss);
+ ASSERT_FALSE(create_result.data_loss);
{
// Empty iterator
@@ -481,7 +484,7 @@ TEST_F(FileBackedProtoLogTest, ComputeChecksum) {
FileBackedProtoLog<DocumentProto>::Options(compress_,
max_proto_size_)));
auto proto_log = std::move(create_result.proto_log);
- EXPECT_FALSE(create_result.data_loss);
+ ASSERT_FALSE(create_result.data_loss);
ICING_EXPECT_OK(proto_log->WriteProto(document));
@@ -499,7 +502,7 @@ TEST_F(FileBackedProtoLogTest, ComputeChecksum) {
FileBackedProtoLog<DocumentProto>::Options(compress_,
max_proto_size_)));
auto proto_log = std::move(create_result.proto_log);
- EXPECT_FALSE(create_result.data_loss);
+ ASSERT_FALSE(create_result.data_loss);
// Checksum should be consistent across instances
EXPECT_THAT(proto_log->ComputeChecksum(), IsOkAndHolds(Eq(checksum)));
@@ -514,6 +517,166 @@ TEST_F(FileBackedProtoLogTest, ComputeChecksum) {
}
}
+TEST_F(FileBackedProtoLogTest, EraseProtoShouldSetZero) {
+ DocumentProto document1 =
+ DocumentBuilder().SetKey("namespace", "uri1").Build();
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ FileBackedProtoLog<DocumentProto>::CreateResult create_result,
+ FileBackedProtoLog<DocumentProto>::Create(
+ &filesystem_, file_path_,
+ FileBackedProtoLog<DocumentProto>::Options(compress_,
+ max_proto_size_)));
+ auto proto_log = std::move(create_result.proto_log);
+ ASSERT_FALSE(create_result.data_loss);
+
+ // Writes and erases proto
+ ICING_ASSERT_OK_AND_ASSIGN(int64_t document1_offset,
+ proto_log->WriteProto(document1));
+ ICING_ASSERT_OK(proto_log->EraseProto(document1_offset));
+
+ // Checks if the erased area is set to 0.
+ int64_t file_size = filesystem_.GetFileSize(file_path_.c_str());
+ MemoryMappedFile mmapped_file(filesystem_, file_path_,
+ MemoryMappedFile::Strategy::READ_ONLY);
+
+ // document1_offset + sizeof(int) is the start byte of the proto where
+ // sizeof(int) is the size of the proto metadata.
+ mmapped_file.Remap(document1_offset + sizeof(int), file_size - 1);
+ for (size_t i = 0; i < mmapped_file.region_size(); ++i) {
+ ASSERT_THAT(mmapped_file.region()[i], Eq(0));
+ }
+}
+
+TEST_F(FileBackedProtoLogTest, EraseProtoShouldReturnNotFound) {
+ DocumentProto document1 =
+ DocumentBuilder().SetKey("namespace", "uri1").Build();
+ DocumentProto document2 =
+ DocumentBuilder().SetKey("namespace", "uri2").Build();
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ FileBackedProtoLog<DocumentProto>::CreateResult create_result,
+ FileBackedProtoLog<DocumentProto>::Create(
+ &filesystem_, file_path_,
+ FileBackedProtoLog<DocumentProto>::Options(compress_,
+ max_proto_size_)));
+ auto proto_log = std::move(create_result.proto_log);
+ ASSERT_FALSE(create_result.data_loss);
+
+ // Writes 2 protos
+ ICING_ASSERT_OK_AND_ASSIGN(int64_t document1_offset,
+ proto_log->WriteProto(document1));
+ ICING_ASSERT_OK_AND_ASSIGN(int64_t document2_offset,
+ proto_log->WriteProto(document2));
+
+ // Erases the first proto
+ ICING_ASSERT_OK(proto_log->EraseProto(document1_offset));
+
+ // The first proto has been erased.
+ ASSERT_THAT(proto_log->ReadProto(document1_offset),
+ StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
+ // The second proto should be returned.
+ ASSERT_THAT(proto_log->ReadProto(document2_offset),
+ IsOkAndHolds(EqualsProto(document2)));
+}
+
+TEST_F(FileBackedProtoLogTest, ChecksumShouldBeCorrectWithErasedProto) {
+ DocumentProto document1 =
+ DocumentBuilder().SetKey("namespace", "uri1").Build();
+ DocumentProto document2 =
+ DocumentBuilder().SetKey("namespace", "uri2").Build();
+ DocumentProto document3 =
+ DocumentBuilder().SetKey("namespace", "uri3").Build();
+ DocumentProto document4 =
+ DocumentBuilder().SetKey("namespace", "uri4").Build();
+
+ int64_t document2_offset;
+ int64_t document3_offset;
+
+ {
+ // Erase data after the rewind position. This won't update the checksum
+ // immediately.
+ ICING_ASSERT_OK_AND_ASSIGN(
+ FileBackedProtoLog<DocumentProto>::CreateResult create_result,
+ FileBackedProtoLog<DocumentProto>::Create(
+ &filesystem_, file_path_,
+ FileBackedProtoLog<DocumentProto>::Options(compress_,
+ max_proto_size_)));
+ auto proto_log = std::move(create_result.proto_log);
+ ASSERT_FALSE(create_result.data_loss);
+
+ // Writes 3 protos
+ ICING_ASSERT_OK_AND_ASSIGN(int64_t document1_offset,
+ proto_log->WriteProto(document1));
+ ICING_ASSERT_OK_AND_ASSIGN(document2_offset,
+ proto_log->WriteProto(document2));
+ ICING_ASSERT_OK_AND_ASSIGN(document3_offset,
+ proto_log->WriteProto(document3));
+
+ // Erases the 1st proto, checksum won't be updated immediately because the
+ // rewind position is 0.
+ ICING_ASSERT_OK(proto_log->EraseProto(document1_offset));
+
+ EXPECT_THAT(proto_log->ComputeChecksum(),
+ IsOkAndHolds(Eq(Crc32(2293202502))));
+ } // New checksum is updated in destructor.
+
+ {
+ // Erase data before the rewind position. This will update the checksum
+ // immediately.
+ ICING_ASSERT_OK_AND_ASSIGN(
+ FileBackedProtoLog<DocumentProto>::CreateResult create_result,
+ FileBackedProtoLog<DocumentProto>::Create(
+ &filesystem_, file_path_,
+ FileBackedProtoLog<DocumentProto>::Options(compress_,
+ max_proto_size_)));
+ auto proto_log = std::move(create_result.proto_log);
+ ASSERT_FALSE(create_result.data_loss);
+
+ // Erases the 2nd proto that is now before the rewind position. Checksum is
+ // updated.
+ ICING_ASSERT_OK(proto_log->EraseProto(document2_offset));
+
+ EXPECT_THAT(proto_log->ComputeChecksum(),
+ IsOkAndHolds(Eq(Crc32(639634028))));
+ }
+
+ {
+ // Append data and erase data before the rewind position. This will update
+ // the checksum twice: in EraseProto() and destructor.
+ ICING_ASSERT_OK_AND_ASSIGN(
+ FileBackedProtoLog<DocumentProto>::CreateResult create_result,
+ FileBackedProtoLog<DocumentProto>::Create(
+ &filesystem_, file_path_,
+ FileBackedProtoLog<DocumentProto>::Options(compress_,
+ max_proto_size_)));
+ auto proto_log = std::move(create_result.proto_log);
+ ASSERT_FALSE(create_result.data_loss);
+
+ // Append a new document which is after the rewind position.
+ ICING_ASSERT_OK(proto_log->WriteProto(document4));
+
+ // Erases the 3rd proto that is now before the rewind position. Checksum is
+ // updated.
+ ICING_ASSERT_OK(proto_log->EraseProto(document3_offset));
+
+ EXPECT_THAT(proto_log->ComputeChecksum(),
+ IsOkAndHolds(Eq(Crc32(1990198693))));
+ } // Checksum is updated with the newly appended document.
+
+ {
+ // A successful creation means that the checksum matches.
+ ICING_ASSERT_OK_AND_ASSIGN(
+ FileBackedProtoLog<DocumentProto>::CreateResult create_result,
+ FileBackedProtoLog<DocumentProto>::Create(
+ &filesystem_, file_path_,
+ FileBackedProtoLog<DocumentProto>::Options(compress_,
+ max_proto_size_)));
+ auto proto_log = std::move(create_result.proto_log);
+ EXPECT_FALSE(create_result.data_loss);
+ }
+}
+
} // namespace
} // namespace lib
} // namespace icing
diff --git a/icing/file/file-backed-vector.h b/icing/file/file-backed-vector.h
index e4ec0cd..eb89db8 100644
--- a/icing/file/file-backed-vector.h
+++ b/icing/file/file-backed-vector.h
@@ -187,7 +187,7 @@ class FileBackedVector {
//
// Returns:
// OUT_OF_RANGE_ERROR if len < 0 or >= num_elements()
- libtextclassifier3::Status TruncateTo(int32_t len);
+ libtextclassifier3::Status TruncateTo(int32_t new_num_elements);
// Flushes content to underlying file.
//
diff --git a/icing/icing-search-engine.cc b/icing/icing-search-engine.cc
index c973885..5e0a46e 100644
--- a/icing/icing-search-engine.cc
+++ b/icing/icing-search-engine.cc
@@ -59,6 +59,7 @@
#include "icing/util/crc32.h"
#include "icing/util/logging.h"
#include "icing/util/status-macros.h"
+#include "unicode/uloc.h"
namespace icing {
namespace lib {
@@ -148,30 +149,31 @@ std::string MakeSchemaDirectoryPath(const std::string& base_dir) {
void TransformStatus(const libtextclassifier3::Status& internal_status,
StatusProto* status_proto) {
+ StatusProto::Code code;
switch (internal_status.CanonicalCode()) {
case libtextclassifier3::StatusCode::OK:
- status_proto->set_code(StatusProto::OK);
+ code = StatusProto::OK;
break;
case libtextclassifier3::StatusCode::DATA_LOSS:
- status_proto->set_code(StatusProto::WARNING_DATA_LOSS);
+ code = StatusProto::WARNING_DATA_LOSS;
break;
case libtextclassifier3::StatusCode::INVALID_ARGUMENT:
- status_proto->set_code(StatusProto::INVALID_ARGUMENT);
+ code = StatusProto::INVALID_ARGUMENT;
break;
case libtextclassifier3::StatusCode::NOT_FOUND:
- status_proto->set_code(StatusProto::NOT_FOUND);
+ code = StatusProto::NOT_FOUND;
break;
case libtextclassifier3::StatusCode::FAILED_PRECONDITION:
- status_proto->set_code(StatusProto::FAILED_PRECONDITION);
+ code = StatusProto::FAILED_PRECONDITION;
break;
case libtextclassifier3::StatusCode::ABORTED:
- status_proto->set_code(StatusProto::ABORTED);
+ code = StatusProto::ABORTED;
break;
case libtextclassifier3::StatusCode::INTERNAL:
// TODO(b/147699081): Cleanup our internal use of INTERNAL since it
// doesn't match with what it *should* indicate as described in
// go/icing-library-apis.
- status_proto->set_code(StatusProto::INTERNAL);
+ code = StatusProto::INTERNAL;
break;
case libtextclassifier3::StatusCode::RESOURCE_EXHAUSTED:
// TODO(b/147699081): Note that we don't detect all cases of OUT_OF_SPACE
@@ -179,17 +181,35 @@ void TransformStatus(const libtextclassifier3::Status& internal_status,
// internally to indicate other resources are exhausted (e.g.
// DocHitInfos) - although none of these are exposed through the API.
// Consider separating the two cases out more clearly.
- status_proto->set_code(StatusProto::OUT_OF_SPACE);
+ code = StatusProto::OUT_OF_SPACE;
break;
- default:
+ case libtextclassifier3::StatusCode::ALREADY_EXISTS:
+ code = StatusProto::ALREADY_EXISTS;
+ break;
+ case libtextclassifier3::StatusCode::CANCELLED:
+ [[fallthrough]];
+ case libtextclassifier3::StatusCode::UNKNOWN:
+ [[fallthrough]];
+ case libtextclassifier3::StatusCode::DEADLINE_EXCEEDED:
+ [[fallthrough]];
+ case libtextclassifier3::StatusCode::PERMISSION_DENIED:
+ [[fallthrough]];
+ case libtextclassifier3::StatusCode::OUT_OF_RANGE:
+ [[fallthrough]];
+ case libtextclassifier3::StatusCode::UNIMPLEMENTED:
+ [[fallthrough]];
+ case libtextclassifier3::StatusCode::UNAVAILABLE:
+ [[fallthrough]];
+ case libtextclassifier3::StatusCode::UNAUTHENTICATED:
// Other internal status codes aren't supported externally yet. If it
// should be supported, add another switch-case above.
- ICING_LOG(FATAL) << IcingStringUtil::StringPrintf(
+ ICING_LOG(ERROR) << IcingStringUtil::StringPrintf(
"Internal status code %d not supported in the external API",
internal_status.error_code());
+ code = StatusProto::UNKNOWN;
break;
}
-
+ status_proto->set_code(code);
status_proto->set_message(internal_status.error_message());
}
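
Two things change in TransformStatus: the code is computed once and set at the end, and the default: arm is replaced by an exhaustive enumeration that degrades unmapped codes to UNKNOWN with an ERROR log instead of crashing with FATAL. A side benefit of dropping default:, sketched below with a stand-in enum, is that -Wswitch can flag newly added status codes at compile time:

```cpp
// Minimal sketch with a hypothetical enum, not the real status codes.
enum class Code { kOk, kNotFound, kInternal };

int ToExternal(Code code) {
  switch (code) {
    case Code::kOk:
      return 0;
    case Code::kNotFound:
      return 1;
    case Code::kInternal:
      return 2;
      // No default: adding Code::kAborted later makes this switch emit a
      // -Wswitch warning until it is handled explicitly.
  }
  return -1;  // unreachable for valid enum values
}
```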
@@ -681,12 +701,14 @@ DeleteByNamespaceResultProto IcingSearchEngine::DeleteByNamespace(
// that can support error logging.
libtextclassifier3::Status status =
document_store_->DeleteByNamespace(name_space);
- TransformStatus(status, result_status);
if (!status.ok()) {
ICING_LOG(ERROR) << status.error_message()
<< "Failed to delete Namespace: " << name_space;
+ TransformStatus(status, result_status);
return delete_result;
}
+
+ result_status->set_code(StatusProto::OK);
return delete_result;
}
@@ -707,15 +729,82 @@ DeleteBySchemaTypeResultProto IcingSearchEngine::DeleteBySchemaType(
// that can support error logging.
libtextclassifier3::Status status =
document_store_->DeleteBySchemaType(schema_type);
- TransformStatus(status, result_status);
if (!status.ok()) {
ICING_LOG(ERROR) << status.error_message()
<< "Failed to delete SchemaType: " << schema_type;
+ TransformStatus(status, result_status);
return delete_result;
}
+
+ result_status->set_code(StatusProto::OK);
return delete_result;
}
+DeleteResultProto IcingSearchEngine::DeleteByQuery(
+ const SearchSpecProto& search_spec) {
+ ICING_VLOG(1) << "Deleting documents for query " << search_spec.query()
+ << " from doc store";
+
+ DeleteResultProto result_proto;
+ StatusProto* result_status = result_proto.mutable_status();
+
+ absl_ports::unique_lock l(&mutex_);
+ if (!initialized_) {
+ result_status->set_code(StatusProto::FAILED_PRECONDITION);
+ result_status->set_message("IcingSearchEngine has not been initialized!");
+ return result_proto;
+ }
+
+ libtextclassifier3::Status status =
+ ValidateSearchSpec(search_spec, performance_configuration_);
+ if (!status.ok()) {
+ TransformStatus(status, result_status);
+ return result_proto;
+ }
+
+ // Gets unordered results from query processor
+ auto query_processor_or = QueryProcessor::Create(
+ index_.get(), language_segmenter_.get(), normalizer_.get(),
+ document_store_.get(), schema_store_.get(), clock_.get());
+ if (!query_processor_or.ok()) {
+ TransformStatus(query_processor_or.status(), result_status);
+ return result_proto;
+ }
+ std::unique_ptr<QueryProcessor> query_processor =
+ std::move(query_processor_or).ValueOrDie();
+
+ auto query_results_or = query_processor->ParseSearch(search_spec);
+ if (!query_results_or.ok()) {
+ TransformStatus(query_results_or.status(), result_status);
+ return result_proto;
+ }
+ QueryProcessor::QueryResults query_results =
+ std::move(query_results_or).ValueOrDie();
+
+ ICING_LOG(ERROR) << "Deleting the docs that matched the query.";
+ bool found_results = false;
+ while (query_results.root_iterator->Advance().ok()) {
+ ICING_LOG(ERROR)
+ << "Deleting doc "
+ << query_results.root_iterator->doc_hit_info().document_id();
+ found_results = true;
+ status = document_store_->Delete(
+ query_results.root_iterator->doc_hit_info().document_id());
+ if (!status.ok()) {
+ TransformStatus(status, result_status);
+ return result_proto;
+ }
+ }
+ if (found_results) {
+ result_proto.mutable_status()->set_code(StatusProto::OK);
+ } else {
+ result_proto.mutable_status()->set_code(StatusProto::NOT_FOUND);
+ result_proto.mutable_status()->set_message(
+ "No documents matched the query to delete by!");
+ }
+ return result_proto;
+}
+
PersistToDiskResultProto IcingSearchEngine::PersistToDisk() {
ICING_VLOG(1) << "Persisting data to disk";
@@ -1147,6 +1236,9 @@ libtextclassifier3::Status IcingSearchEngine::OptimizeDocumentStore() {
// Ensures that current directory is still present.
if (!filesystem_->CreateDirectoryRecursively(
current_document_dir.c_str())) {
+ // Can't even create the old directory. Mark as uninitialized and return
+ // INTERNAL.
+ initialized_ = false;
return absl_ports::InternalError(
"Failed to create file directory for document store");
}
@@ -1159,6 +1251,9 @@ libtextclassifier3::Status IcingSearchEngine::OptimizeDocumentStore() {
// TODO(b/144458732): Implement a more robust version of
// TC_ASSIGN_OR_RETURN that can support error logging.
if (!document_store_or.ok()) {
+ // Unable to create DocumentStore from the old file. Mark as uninitialized
+ // and return INTERNAL.
+ initialized_ = false;
ICING_LOG(ERROR) << "Failed to create document store instance";
return absl_ports::Annotate(
absl_ports::InternalError("Failed to create document store instance"),
@@ -1173,13 +1268,18 @@ libtextclassifier3::Status IcingSearchEngine::OptimizeDocumentStore() {
}
// Recreates the doc store instance
- ICING_ASSIGN_OR_RETURN(
- document_store_,
+ auto document_store_or =
DocumentStore::Create(filesystem_.get(), current_document_dir,
- clock_.get(), schema_store_.get()),
- absl_ports::InternalError(
- "Document store has been optimized, but a valid document store "
- "instance can't be created"));
+ clock_.get(), schema_store_.get());
+ if (!document_store_or.ok()) {
+ // Unable to create DocumentStore from the new file. Mark as uninitialized
+ // and return INTERNAL.
+ initialized_ = false;
+ return absl_ports::InternalError(
+ "Document store has been optimized, but a valid document store "
+ "instance can't be created");
+ }
+ document_store_ = std::move(document_store_or).ValueOrDie();
// Deletes tmp directory
if (!filesystem_->DeleteDirectoryRecursively(
diff --git a/icing/icing-search-engine.h b/icing/icing-search-engine.h
index 6ae76d7..55d6b2f 100644
--- a/icing/icing-search-engine.h
+++ b/icing/icing-search-engine.h
@@ -128,6 +128,9 @@ class IcingSearchEngine {
//
// Returns:
// OK on success
+ // ALREADY_EXISTS if 'new_schema' contains multiple definitions of the same
+ // type or contains a type that has multiple properties with the same
+ // name.
// INVALID_ARGUMENT if 'new_schema' is invalid
// FAILED_PRECONDITION if 'new_schema' is incompatible, or IcingSearchEngine
// has not been initialized yet.
@@ -256,6 +259,21 @@ class IcingSearchEngine {
DeleteBySchemaTypeResultProto DeleteBySchemaType(std::string_view schema_type)
ICING_LOCKS_EXCLUDED(mutex_);
+  // Deletes all Documents that match the query specified in search_spec.
+  // Delete changes are automatically applied to disk; callers can also call
+  // PersistToDisk() to flush changes immediately.
+ //
+ // NOTE: Space is not reclaimed for deleted documents until Optimize() is
+ // called.
+ //
+ // Returns:
+ // OK on success
+ // NOT_FOUND if the query doesn't match any documents
+ // FAILED_PRECONDITION IcingSearchEngine has not been initialized yet
+ // INTERNAL_ERROR on IO error
+ DeleteResultProto DeleteByQuery(const SearchSpecProto& search_spec)
+ ICING_LOCKS_EXCLUDED(mutex_);
+
// Retrieves, scores, ranks, and returns the results according to the specs.
  // Results can be empty. If there are multiple pages of results,
// SearchResultProto.next_page_token will be populated and that can be used to
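
A caller-side sketch of the new DeleteByQuery() declared above; the include paths and surrounding setup are assumed, and the query fields mirror the tests in this change:

```cpp
#include "icing/icing-search-engine.h"
#include "icing/proto/search.pb.h"

// Deletes every document whose indexed content matches "body1".
// Note: space is only reclaimed later, by Optimize().
void DeleteMatchingDocs(IcingSearchEngine& icing) {
  SearchSpecProto search_spec;
  search_spec.set_query("body1");
  search_spec.set_term_match_type(TermMatchType::EXACT_ONLY);

  DeleteResultProto result = icing.DeleteByQuery(search_spec);
  if (result.status().code() == StatusProto::NOT_FOUND) {
    // Nothing matched the query, so nothing was deleted.
  }
}
```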
diff --git a/icing/icing-search-engine_test.cc b/icing/icing-search-engine_test.cc
index b0946c9..5a8bb80 100644
--- a/icing/icing-search-engine_test.cc
+++ b/icing/icing-search-engine_test.cc
@@ -55,6 +55,7 @@ using ::testing::HasSubstr;
using ::testing::IsEmpty;
using ::testing::Lt;
using ::testing::Matcher;
+using ::testing::Ne;
using ::testing::Return;
using ::testing::SizeIs;
using ::testing::StrEq;
@@ -470,6 +471,163 @@ TEST_F(IcingSearchEngineTest, FailToWriteSchema) {
HasSubstr("Unable to open file for write"));
}
+TEST_F(IcingSearchEngineTest, SetSchemaDelete2) {
+ {
+ IcingSearchEngine icing(GetDefaultIcingOptions());
+ ASSERT_THAT(icing.Initialize().status().code(), Eq(StatusProto::OK));
+
+ // 1. Create a schema with an Email type with properties { "title", "body"}
+ SchemaProto schema;
+ SchemaTypeConfigProto* type = schema.add_types();
+ type->set_schema_type("Email");
+ PropertyConfigProto* property = type->add_properties();
+ property->set_property_name("title");
+ property->set_data_type(PropertyConfigProto::DataType::STRING);
+ property->set_cardinality(PropertyConfigProto::Cardinality::OPTIONAL);
+ property = type->add_properties();
+ property->set_property_name("body");
+ property->set_data_type(PropertyConfigProto::DataType::STRING);
+ property->set_cardinality(PropertyConfigProto::Cardinality::OPTIONAL);
+
+ EXPECT_THAT(icing.SetSchema(schema).status().code(), Eq(StatusProto::OK));
+
+ // 2. Add an email document
+ DocumentProto doc = DocumentBuilder()
+ .SetKey("emails", "email#1")
+ .SetSchema("Email")
+ .AddStringProperty("title", "Hello world.")
+ .AddStringProperty("body", "Goodnight Moon.")
+ .Build();
+ EXPECT_THAT(icing.Put(std::move(doc)).status().code(), Eq(StatusProto::OK));
+ }
+
+ {
+ IcingSearchEngine icing(GetDefaultIcingOptions());
+ ASSERT_THAT(icing.Initialize().status().code(), Eq(StatusProto::OK));
+
+ // 3. Set a schema that deletes email. This should fail.
+ SchemaProto schema;
+ SchemaTypeConfigProto* type = schema.add_types();
+ type->set_schema_type("Message");
+ PropertyConfigProto* property = type->add_properties();
+ property->set_property_name("body");
+ property->set_data_type(PropertyConfigProto::DataType::STRING);
+ property->set_cardinality(PropertyConfigProto::Cardinality::OPTIONAL);
+
+ EXPECT_THAT(icing.SetSchema(schema, false).status().code(),
+ Eq(StatusProto::FAILED_PRECONDITION));
+
+ // 4. Try to delete by email type.
+ EXPECT_THAT(icing.DeleteBySchemaType("Email").status().code(),
+ Eq(StatusProto::OK));
+ }
+}
+
+TEST_F(IcingSearchEngineTest, SetSchemaDelete) {
+ {
+ IcingSearchEngine icing(GetDefaultIcingOptions());
+ ASSERT_THAT(icing.Initialize().status().code(), Eq(StatusProto::OK));
+
+ // 1. Create a schema with an Email type with properties { "title", "body"}
+ SchemaProto schema;
+ SchemaTypeConfigProto* type = schema.add_types();
+ type->set_schema_type("Email");
+ PropertyConfigProto* property = type->add_properties();
+ property->set_property_name("title");
+ property->set_data_type(PropertyConfigProto::DataType::STRING);
+ property->set_cardinality(PropertyConfigProto::Cardinality::OPTIONAL);
+ property = type->add_properties();
+ property->set_property_name("body");
+ property->set_data_type(PropertyConfigProto::DataType::STRING);
+ property->set_cardinality(PropertyConfigProto::Cardinality::OPTIONAL);
+
+ EXPECT_THAT(icing.SetSchema(schema).status().code(), Eq(StatusProto::OK));
+
+ // 2. Add an email document
+ DocumentProto doc = DocumentBuilder()
+ .SetKey("emails", "email#1")
+ .SetSchema("Email")
+ .AddStringProperty("title", "Hello world.")
+ .AddStringProperty("body", "Goodnight Moon.")
+ .Build();
+ EXPECT_THAT(icing.Put(std::move(doc)).status().code(), Eq(StatusProto::OK));
+ }
+
+ {
+ IcingSearchEngine icing(GetDefaultIcingOptions());
+ ASSERT_THAT(icing.Initialize().status().code(), Eq(StatusProto::OK));
+
+    // 3. Set a schema that deletes email. Document deletion is allowed here,
+    //    so this should succeed and drop the email documents.
+ SchemaProto schema;
+ SchemaTypeConfigProto* type = schema.add_types();
+ type->set_schema_type("Message");
+ PropertyConfigProto* property = type->add_properties();
+ property->set_property_name("body");
+ property->set_data_type(PropertyConfigProto::DataType::STRING);
+ property->set_cardinality(PropertyConfigProto::Cardinality::OPTIONAL);
+
+ EXPECT_THAT(icing.SetSchema(schema, true).status().code(),
+ Eq(StatusProto::OK));
+
+    // 4. Try to delete by email type. The email documents were already
+    //    removed along with the schema type, so nothing is found.
+ EXPECT_THAT(icing.DeleteBySchemaType("Email").status().code(),
+ Eq(StatusProto::NOT_FOUND));
+ }
+}
+
+TEST_F(IcingSearchEngineTest, SetSchemaDuplicateTypesReturnsAlreadyExists) {
+ IcingSearchEngine icing(GetDefaultIcingOptions());
+ ASSERT_THAT(icing.Initialize().status().code(), Eq(StatusProto::OK));
+
+ // Create a schema with types { "Email", "Message" and "Email" }
+ SchemaProto schema;
+ SchemaTypeConfigProto* type = schema.add_types();
+ type->set_schema_type("Email");
+ PropertyConfigProto* property = type->add_properties();
+ property->set_property_name("title");
+ property->set_data_type(PropertyConfigProto::DataType::STRING);
+ property->set_cardinality(PropertyConfigProto::Cardinality::OPTIONAL);
+
+ type = schema.add_types();
+ type->set_schema_type("Message");
+ property = type->add_properties();
+ property->set_property_name("body");
+ property->set_data_type(PropertyConfigProto::DataType::STRING);
+ property->set_cardinality(PropertyConfigProto::Cardinality::OPTIONAL);
+
+ *schema.add_types() = schema.types(0);
+
+ EXPECT_THAT(icing.SetSchema(schema).status().code(),
+ Eq(StatusProto::ALREADY_EXISTS));
+}
+
+TEST_F(IcingSearchEngineTest,
+ SetSchemaDuplicatePropertiesReturnsAlreadyExists) {
+ IcingSearchEngine icing(GetDefaultIcingOptions());
+ ASSERT_THAT(icing.Initialize().status().code(), Eq(StatusProto::OK));
+
+ // Create a schema with an Email type with properties { "title", "body" and
+ // "title" }
+ SchemaProto schema;
+ SchemaTypeConfigProto* type = schema.add_types();
+ type->set_schema_type("Email");
+ PropertyConfigProto* property = type->add_properties();
+ property->set_property_name("title");
+ property->set_data_type(PropertyConfigProto::DataType::STRING);
+ property->set_cardinality(PropertyConfigProto::Cardinality::OPTIONAL);
+ property = type->add_properties();
+ property->set_property_name("body");
+ property->set_data_type(PropertyConfigProto::DataType::STRING);
+ property->set_cardinality(PropertyConfigProto::Cardinality::OPTIONAL);
+ property = type->add_properties();
+ property->set_property_name("title");
+ property->set_data_type(PropertyConfigProto::DataType::STRING);
+ property->set_cardinality(PropertyConfigProto::Cardinality::OPTIONAL);
+
+ EXPECT_THAT(icing.SetSchema(schema).status().code(),
+ Eq(StatusProto::ALREADY_EXISTS));
+}
+
TEST_F(IcingSearchEngineTest, SetSchema) {
IcingSearchEngine icing(GetDefaultIcingOptions());
ASSERT_THAT(icing.Initialize().status().code(), Eq(StatusProto::OK));
@@ -1519,6 +1677,82 @@ TEST_F(IcingSearchEngineTest, DeleteShouldWorkAfterOptimization) {
EqualsProto(expected_get_result_proto));
}
+TEST_F(IcingSearchEngineTest, OptimizationFailureUninitializesIcing) {
+ // Setup filesystem to fail
+ auto mock_filesystem = std::make_unique<MockFilesystem>();
+ bool just_swapped_files = false;
+ auto create_dir_lambda = [this, &just_swapped_files](const char* dir_name) {
+ if (just_swapped_files) {
+ // We should fail the first call immediately after swapping files.
+ just_swapped_files = false;
+ return false;
+ }
+ return filesystem()->CreateDirectoryRecursively(dir_name);
+ };
+ ON_CALL(*mock_filesystem, CreateDirectoryRecursively)
+ .WillByDefault(create_dir_lambda);
+ auto swap_lambda = [&just_swapped_files](const char* first_dir,
+ const char* second_dir) {
+ just_swapped_files = true;
+ return false;
+ };
+ ON_CALL(*mock_filesystem, SwapFiles).WillByDefault(swap_lambda);
+ TestIcingSearchEngine icing(GetDefaultIcingOptions(),
+ std::move(mock_filesystem),
+ std::make_unique<FakeClock>());
+ ASSERT_THAT(icing.Initialize().status().code(), Eq(StatusProto::OK));
+
+ // The mocks should cause an unrecoverable error during Optimize - returning
+ // INTERNAL.
+ ASSERT_THAT(icing.Optimize().status().code(), Eq(StatusProto::INTERNAL));
+
+ // Ordinary operations should fail safely.
+ SchemaProto simple_schema;
+ auto type = simple_schema.add_types();
+ type->set_schema_type("type0");
+ auto property = type->add_properties();
+ property->set_property_name("prop0");
+ property->set_data_type(PropertyConfigProto::DataType::STRING);
+ property->set_cardinality(PropertyConfigProto::Cardinality::OPTIONAL);
+
+ DocumentProto simple_doc = DocumentBuilder()
+ .SetKey("namespace0", "uri0")
+ .SetSchema("type0")
+ .AddStringProperty("prop0", "foo")
+ .Build();
+
+ SearchSpecProto search_spec;
+ search_spec.set_query("foo");
+ search_spec.set_term_match_type(TermMatchType::EXACT_ONLY);
+ ResultSpecProto result_spec;
+ ScoringSpecProto scoring_spec;
+ scoring_spec.set_rank_by(
+ ScoringSpecProto::RankingStrategy::CREATION_TIMESTAMP);
+
+ EXPECT_THAT(icing.SetSchema(simple_schema).status().code(),
+ Eq(StatusProto::FAILED_PRECONDITION));
+ EXPECT_THAT(icing.Put(simple_doc).status().code(),
+ Eq(StatusProto::FAILED_PRECONDITION));
+ EXPECT_THAT(
+ icing.Get(simple_doc.namespace_(), simple_doc.uri()).status().code(),
+ Eq(StatusProto::FAILED_PRECONDITION));
+ EXPECT_THAT(
+ icing.Search(search_spec, scoring_spec, result_spec).status().code(),
+ Eq(StatusProto::FAILED_PRECONDITION));
+
+ // Reset should get icing back to a safe (empty) and working state.
+ EXPECT_THAT(icing.Reset().status().code(), Eq(StatusProto::OK));
+ EXPECT_THAT(icing.SetSchema(simple_schema).status().code(),
+ Eq(StatusProto::OK));
+ EXPECT_THAT(icing.Put(simple_doc).status().code(), Eq(StatusProto::OK));
+ EXPECT_THAT(
+ icing.Get(simple_doc.namespace_(), simple_doc.uri()).status().code(),
+ Eq(StatusProto::OK));
+ EXPECT_THAT(
+ icing.Search(search_spec, scoring_spec, result_spec).status().code(),
+ Eq(StatusProto::OK));
+}
+
TEST_F(IcingSearchEngineTest, DeleteBySchemaType) {
SchemaProto schema;
// Add an email type
@@ -1528,6 +1762,10 @@ TEST_F(IcingSearchEngineTest, DeleteBySchemaType) {
property->set_property_name("subject");
property->set_data_type(PropertyConfigProto::DataType::STRING);
property->set_cardinality(PropertyConfigProto::Cardinality::OPTIONAL);
+ property->mutable_indexing_config()->set_term_match_type(
+ TermMatchType::EXACT_ONLY);
+ property->mutable_indexing_config()->set_tokenizer_type(
+ IndexingConfig::TokenizerType::PLAIN);
  // Add a message type
type = schema.add_types();
type->set_schema_type("message");
@@ -1535,6 +1773,10 @@ TEST_F(IcingSearchEngineTest, DeleteBySchemaType) {
property->set_property_name("body");
property->set_data_type(PropertyConfigProto::DataType::STRING);
property->set_cardinality(PropertyConfigProto::Cardinality::OPTIONAL);
+ property->mutable_indexing_config()->set_term_match_type(
+ TermMatchType::EXACT_ONLY);
+ property->mutable_indexing_config()->set_tokenizer_type(
+ IndexingConfig::TokenizerType::PLAIN);
DocumentProto document1 =
DocumentBuilder()
.SetKey("namespace1", "uri1")
@@ -1550,6 +1792,74 @@ TEST_F(IcingSearchEngineTest, DeleteBySchemaType) {
.SetCreationTimestampMs(kDefaultCreationTimestampMs)
.Build();
IcingSearchEngine icing(GetDefaultIcingOptions());
+ ASSERT_THAT(icing.Initialize().status().code(), Eq(StatusProto::OK));
+ ASSERT_THAT(icing.SetSchema(schema).status().code(), Eq(StatusProto::OK));
+ ASSERT_THAT(icing.Put(document1).status().code(), Eq(StatusProto::OK));
+ ASSERT_THAT(icing.Put(document2).status().code(), Eq(StatusProto::OK));
+
+ GetResultProto expected_get_result_proto;
+ expected_get_result_proto.mutable_status()->set_code(StatusProto::OK);
+ *expected_get_result_proto.mutable_document() = document1;
+ EXPECT_THAT(icing.Get("namespace1", "uri1"),
+ EqualsProto(expected_get_result_proto));
+
+ *expected_get_result_proto.mutable_document() = document2;
+ EXPECT_THAT(icing.Get("namespace2", "uri2"),
+ EqualsProto(expected_get_result_proto));
+
+ // Delete the first type. The first doc should be irretrievable. The
+ // second should still be present.
+ EXPECT_THAT(icing.DeleteBySchemaType("message").status().code(),
+ Eq(StatusProto::OK));
+
+ expected_get_result_proto.mutable_status()->set_code(StatusProto::NOT_FOUND);
+ expected_get_result_proto.mutable_status()->set_message(
+ "Document (namespace1, uri1) not found.");
+ expected_get_result_proto.clear_document();
+ EXPECT_THAT(icing.Get("namespace1", "uri1"),
+ EqualsProto(expected_get_result_proto));
+
+ expected_get_result_proto.mutable_status()->set_code(StatusProto::OK);
+ expected_get_result_proto.mutable_status()->clear_message();
+ *expected_get_result_proto.mutable_document() = document2;
+ EXPECT_THAT(icing.Get("namespace2", "uri2"),
+ EqualsProto(expected_get_result_proto));
+
+ // Search for "message", only document2 should show up.
+ SearchResultProto expected_search_result_proto;
+ expected_search_result_proto.mutable_status()->set_code(StatusProto::OK);
+ *expected_search_result_proto.mutable_results()->Add()->mutable_document() =
+ document2;
+ SearchSpecProto search_spec;
+ search_spec.set_term_match_type(TermMatchType::EXACT_ONLY);
+ search_spec.set_query("message");
+ EXPECT_THAT(icing.Search(search_spec, GetDefaultScoringSpec(),
+ ResultSpecProto::default_instance()),
+ EqualsProto(expected_search_result_proto));
+}
+
+TEST_F(IcingSearchEngineTest, DeleteSchemaTypeByQuery) {
+ SchemaProto schema = CreateMessageSchema();
+ // Add an email type
+ SchemaProto tmp = CreateEmailSchema();
+ *schema.add_types() = tmp.types(0);
+
+ DocumentProto document1 =
+ DocumentBuilder()
+ .SetKey("namespace1", "uri1")
+ .SetSchema(schema.types(0).schema_type())
+ .AddStringProperty("body", "message body1")
+ .SetCreationTimestampMs(kDefaultCreationTimestampMs)
+ .Build();
+ DocumentProto document2 =
+ DocumentBuilder()
+ .SetKey("namespace2", "uri2")
+ .SetSchema(schema.types(1).schema_type())
+ .AddStringProperty("subject", "subject subject2")
+ .AddStringProperty("body", "message body2")
+ .SetCreationTimestampMs(kDefaultCreationTimestampMs)
+ .Build();
+ IcingSearchEngine icing(GetDefaultIcingOptions());
EXPECT_THAT(icing.Initialize().status().code(), Eq(StatusProto::OK));
EXPECT_THAT(icing.SetSchema(schema).status().code(), Eq(StatusProto::OK));
EXPECT_THAT(icing.Put(document1).status().code(), Eq(StatusProto::OK));
@@ -1567,7 +1877,9 @@ TEST_F(IcingSearchEngineTest, DeleteBySchemaType) {
// Delete the first type. The first doc should be irretrievable. The
// second should still be present.
- EXPECT_THAT(icing.DeleteBySchemaType("message").status().code(),
+ SearchSpecProto search_spec;
+ search_spec.add_schema_type_filters(schema.types(0).schema_type());
+ EXPECT_THAT(icing.DeleteByQuery(search_spec).status().code(),
Eq(StatusProto::OK));
expected_get_result_proto.mutable_status()->set_code(StatusProto::NOT_FOUND);
@@ -1582,6 +1894,18 @@ TEST_F(IcingSearchEngineTest, DeleteBySchemaType) {
*expected_get_result_proto.mutable_document() = document2;
EXPECT_THAT(icing.Get("namespace2", "uri2"),
EqualsProto(expected_get_result_proto));
+
+ search_spec = SearchSpecProto::default_instance();
+ search_spec.set_query("message");
+ search_spec.set_term_match_type(TermMatchType::EXACT_ONLY);
+
+ SearchResultProto expected_search_result_proto;
+ expected_search_result_proto.mutable_status()->set_code(StatusProto::OK);
+ *expected_search_result_proto.mutable_results()->Add()->mutable_document() =
+ document2;
+ EXPECT_THAT(icing.Search(search_spec, GetDefaultScoringSpec(),
+ ResultSpecProto::default_instance()),
+ EqualsProto(expected_search_result_proto));
}
TEST_F(IcingSearchEngineTest, DeleteByNamespace) {
@@ -1594,6 +1918,89 @@ TEST_F(IcingSearchEngineTest, DeleteByNamespace) {
.Build();
DocumentProto document2 =
DocumentBuilder()
+ .SetKey("namespace1", "uri2")
+ .SetSchema("Message")
+ .AddStringProperty("body", "message body2")
+ .SetCreationTimestampMs(kDefaultCreationTimestampMs)
+ .Build();
+ DocumentProto document3 =
+ DocumentBuilder()
+ .SetKey("namespace3", "uri3")
+ .SetSchema("Message")
+ .AddStringProperty("body", "message body2")
+ .SetCreationTimestampMs(kDefaultCreationTimestampMs)
+ .Build();
+
+ IcingSearchEngine icing(GetDefaultIcingOptions());
+ ASSERT_THAT(icing.Initialize().status().code(), Eq(StatusProto::OK));
+ ASSERT_THAT(icing.SetSchema(CreateMessageSchema()).status().code(),
+ Eq(StatusProto::OK));
+ ASSERT_THAT(icing.Put(document1).status().code(), Eq(StatusProto::OK));
+ ASSERT_THAT(icing.Put(document2).status().code(), Eq(StatusProto::OK));
+ ASSERT_THAT(icing.Put(document3).status().code(), Eq(StatusProto::OK));
+
+ GetResultProto expected_get_result_proto;
+ expected_get_result_proto.mutable_status()->set_code(StatusProto::OK);
+ *expected_get_result_proto.mutable_document() = document1;
+ EXPECT_THAT(icing.Get("namespace1", "uri1"),
+ EqualsProto(expected_get_result_proto));
+
+ *expected_get_result_proto.mutable_document() = document2;
+ EXPECT_THAT(icing.Get("namespace1", "uri2"),
+ EqualsProto(expected_get_result_proto));
+
+ *expected_get_result_proto.mutable_document() = document3;
+ EXPECT_THAT(icing.Get("namespace3", "uri3"),
+ EqualsProto(expected_get_result_proto));
+
+ // Delete namespace1. Document1 and document2 should be irretrievable.
+ // Document3 should still be present.
+ EXPECT_THAT(icing.DeleteByNamespace("namespace1").status().code(),
+ Eq(StatusProto::OK));
+
+ expected_get_result_proto.mutable_status()->set_code(StatusProto::NOT_FOUND);
+ expected_get_result_proto.mutable_status()->set_message(
+ "Document (namespace1, uri1) not found.");
+ expected_get_result_proto.clear_document();
+ EXPECT_THAT(icing.Get("namespace1", "uri1"),
+ EqualsProto(expected_get_result_proto));
+
+ expected_get_result_proto.mutable_status()->set_code(StatusProto::NOT_FOUND);
+ expected_get_result_proto.mutable_status()->set_message(
+ "Document (namespace1, uri2) not found.");
+ expected_get_result_proto.clear_document();
+ EXPECT_THAT(icing.Get("namespace1", "uri2"),
+ EqualsProto(expected_get_result_proto));
+
+ expected_get_result_proto.mutable_status()->set_code(StatusProto::OK);
+ expected_get_result_proto.mutable_status()->clear_message();
+ *expected_get_result_proto.mutable_document() = document3;
+ EXPECT_THAT(icing.Get("namespace3", "uri3"),
+ EqualsProto(expected_get_result_proto));
+
+ // Search for "message", only document3 should show up.
+ SearchResultProto expected_search_result_proto;
+ expected_search_result_proto.mutable_status()->set_code(StatusProto::OK);
+ *expected_search_result_proto.mutable_results()->Add()->mutable_document() =
+ document3;
+ SearchSpecProto search_spec;
+ search_spec.set_term_match_type(TermMatchType::EXACT_ONLY);
+ search_spec.set_query("message");
+ EXPECT_THAT(icing.Search(search_spec, GetDefaultScoringSpec(),
+ ResultSpecProto::default_instance()),
+ EqualsProto(expected_search_result_proto));
+}
+
+TEST_F(IcingSearchEngineTest, DeleteNamespaceByQuery) {
+ DocumentProto document1 =
+ DocumentBuilder()
+ .SetKey("namespace1", "uri1")
+ .SetSchema("Message")
+ .AddStringProperty("body", "message body1")
+ .SetCreationTimestampMs(kDefaultCreationTimestampMs)
+ .Build();
+ DocumentProto document2 =
+ DocumentBuilder()
.SetKey("namespace2", "uri2")
.SetSchema("Message")
.AddStringProperty("body", "message body2")
@@ -1619,7 +2026,76 @@ TEST_F(IcingSearchEngineTest, DeleteByNamespace) {
// Delete the first namespace. The first doc should be irretrievable. The
// second should still be present.
- EXPECT_THAT(icing.DeleteByNamespace("namespace1").status().code(),
+ SearchSpecProto search_spec;
+ search_spec.add_namespace_filters("namespace1");
+ EXPECT_THAT(icing.DeleteByQuery(search_spec).status().code(),
+ Eq(StatusProto::OK));
+
+ expected_get_result_proto.mutable_status()->set_code(StatusProto::NOT_FOUND);
+ expected_get_result_proto.mutable_status()->set_message(
+ "Document (namespace1, uri1) not found.");
+ expected_get_result_proto.clear_document();
+ EXPECT_THAT(icing.Get("namespace1", "uri1"),
+ EqualsProto(expected_get_result_proto));
+
+ expected_get_result_proto.mutable_status()->set_code(StatusProto::OK);
+ expected_get_result_proto.mutable_status()->clear_message();
+ *expected_get_result_proto.mutable_document() = document2;
+ EXPECT_THAT(icing.Get("namespace2", "uri2"),
+ EqualsProto(expected_get_result_proto));
+
+ search_spec = SearchSpecProto::default_instance();
+ search_spec.set_query("message");
+ search_spec.set_term_match_type(TermMatchType::EXACT_ONLY);
+
+ SearchResultProto expected_search_result_proto;
+ expected_search_result_proto.mutable_status()->set_code(StatusProto::OK);
+ *expected_search_result_proto.mutable_results()->Add()->mutable_document() =
+ document2;
+ EXPECT_THAT(icing.Search(search_spec, GetDefaultScoringSpec(),
+ ResultSpecProto::default_instance()),
+ EqualsProto(expected_search_result_proto));
+}
+
+TEST_F(IcingSearchEngineTest, DeleteByQuery) {
+ DocumentProto document1 =
+ DocumentBuilder()
+ .SetKey("namespace1", "uri1")
+ .SetSchema("Message")
+ .AddStringProperty("body", "message body1")
+ .SetCreationTimestampMs(kDefaultCreationTimestampMs)
+ .Build();
+ DocumentProto document2 =
+ DocumentBuilder()
+ .SetKey("namespace2", "uri2")
+ .SetSchema("Message")
+ .AddStringProperty("body", "message body2")
+ .SetCreationTimestampMs(kDefaultCreationTimestampMs)
+ .Build();
+
+ IcingSearchEngine icing(GetDefaultIcingOptions());
+ EXPECT_THAT(icing.Initialize().status().code(), Eq(StatusProto::OK));
+ EXPECT_THAT(icing.SetSchema(CreateMessageSchema()).status().code(),
+ Eq(StatusProto::OK));
+ EXPECT_THAT(icing.Put(document1).status().code(), Eq(StatusProto::OK));
+ EXPECT_THAT(icing.Put(document2).status().code(), Eq(StatusProto::OK));
+
+ GetResultProto expected_get_result_proto;
+ expected_get_result_proto.mutable_status()->set_code(StatusProto::OK);
+ *expected_get_result_proto.mutable_document() = document1;
+ EXPECT_THAT(icing.Get("namespace1", "uri1"),
+ EqualsProto(expected_get_result_proto));
+
+ *expected_get_result_proto.mutable_document() = document2;
+ EXPECT_THAT(icing.Get("namespace2", "uri2"),
+ EqualsProto(expected_get_result_proto));
+
+ // Delete all docs containing 'body1'. The first doc should be irretrievable.
+ // The second should still be present.
+ SearchSpecProto search_spec;
+ search_spec.set_query("body1");
+ search_spec.set_term_match_type(TermMatchType::EXACT_ONLY);
+ EXPECT_THAT(icing.DeleteByQuery(search_spec).status().code(),
Eq(StatusProto::OK));
expected_get_result_proto.mutable_status()->set_code(StatusProto::NOT_FOUND);
@@ -1634,6 +2110,86 @@ TEST_F(IcingSearchEngineTest, DeleteByNamespace) {
*expected_get_result_proto.mutable_document() = document2;
EXPECT_THAT(icing.Get("namespace2", "uri2"),
EqualsProto(expected_get_result_proto));
+
+ search_spec = SearchSpecProto::default_instance();
+ search_spec.set_query("message");
+ search_spec.set_term_match_type(TermMatchType::EXACT_ONLY);
+
+ SearchResultProto expected_search_result_proto;
+ expected_search_result_proto.mutable_status()->set_code(StatusProto::OK);
+ *expected_search_result_proto.mutable_results()->Add()->mutable_document() =
+ document2;
+ EXPECT_THAT(icing.Search(search_spec, GetDefaultScoringSpec(),
+ ResultSpecProto::default_instance()),
+ EqualsProto(expected_search_result_proto));
+}
+
+TEST_F(IcingSearchEngineTest, DeleteByQueryNotFound) {
+ DocumentProto document1 =
+ DocumentBuilder()
+ .SetKey("namespace1", "uri1")
+ .SetSchema("Message")
+ .AddStringProperty("body", "message body1")
+ .SetCreationTimestampMs(kDefaultCreationTimestampMs)
+ .Build();
+ DocumentProto document2 =
+ DocumentBuilder()
+ .SetKey("namespace2", "uri2")
+ .SetSchema("Message")
+ .AddStringProperty("body", "message body2")
+ .SetCreationTimestampMs(kDefaultCreationTimestampMs)
+ .Build();
+
+ IcingSearchEngine icing(GetDefaultIcingOptions());
+ EXPECT_THAT(icing.Initialize().status().code(), Eq(StatusProto::OK));
+ EXPECT_THAT(icing.SetSchema(CreateMessageSchema()).status().code(),
+ Eq(StatusProto::OK));
+ EXPECT_THAT(icing.Put(document1).status().code(), Eq(StatusProto::OK));
+ EXPECT_THAT(icing.Put(document2).status().code(), Eq(StatusProto::OK));
+
+ GetResultProto expected_get_result_proto;
+ expected_get_result_proto.mutable_status()->set_code(StatusProto::OK);
+ *expected_get_result_proto.mutable_document() = document1;
+ EXPECT_THAT(icing.Get("namespace1", "uri1"),
+ EqualsProto(expected_get_result_proto));
+
+ *expected_get_result_proto.mutable_document() = document2;
+ EXPECT_THAT(icing.Get("namespace2", "uri2"),
+ EqualsProto(expected_get_result_proto));
+
+ // Delete all docs containing 'foo', which should be none of them. Both docs
+ // should still be present.
+ SearchSpecProto search_spec;
+ search_spec.set_query("foo");
+ search_spec.set_term_match_type(TermMatchType::EXACT_ONLY);
+ EXPECT_THAT(icing.DeleteByQuery(search_spec).status().code(),
+ Eq(StatusProto::NOT_FOUND));
+
+ expected_get_result_proto.mutable_status()->set_code(StatusProto::OK);
+ expected_get_result_proto.mutable_status()->clear_message();
+ *expected_get_result_proto.mutable_document() = document1;
+ EXPECT_THAT(icing.Get("namespace1", "uri1"),
+ EqualsProto(expected_get_result_proto));
+
+ expected_get_result_proto.mutable_status()->set_code(StatusProto::OK);
+ expected_get_result_proto.mutable_status()->clear_message();
+ *expected_get_result_proto.mutable_document() = document2;
+ EXPECT_THAT(icing.Get("namespace2", "uri2"),
+ EqualsProto(expected_get_result_proto));
+
+ search_spec = SearchSpecProto::default_instance();
+ search_spec.set_query("message");
+ search_spec.set_term_match_type(TermMatchType::EXACT_ONLY);
+
+ SearchResultProto expected_search_result_proto;
+ expected_search_result_proto.mutable_status()->set_code(StatusProto::OK);
+ *expected_search_result_proto.mutable_results()->Add()->mutable_document() =
+ document2;
+ *expected_search_result_proto.mutable_results()->Add()->mutable_document() =
+ document1;
+ EXPECT_THAT(icing.Search(search_spec, GetDefaultScoringSpec(),
+ ResultSpecProto::default_instance()),
+ EqualsProto(expected_search_result_proto));
}
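
Taken together, these two tests pin down the DeleteByQuery contract: a query that matches at least one document removes the matches and returns OK, while a query that matches nothing leaves every document in place and returns NOT_FOUND. A minimal caller-side sketch of handling both outcomes (the icing instance and schema setup are assumed to be as in the fixtures above):

// Hypothetical caller: remove every document matching "body1" and
// treat "nothing matched" as a distinct, non-fatal outcome.
SearchSpecProto delete_spec;
delete_spec.set_query("body1");
delete_spec.set_term_match_type(TermMatchType::EXACT_ONLY);
auto delete_result = icing.DeleteByQuery(delete_spec);
if (delete_result.status().code() == StatusProto::OK) {
  // At least one matching document was removed.
} else if (delete_result.status().code() == StatusProto::NOT_FOUND) {
  // No document matched; the store is unchanged.
}
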
TEST_F(IcingSearchEngineTest, SetSchemaShouldWorkAfterOptimization) {
diff --git a/icing/index/index-processor_benchmark.cc b/icing/index/index-processor_benchmark.cc
index 00d116f..eb01731 100644
--- a/icing/index/index-processor_benchmark.cc
+++ b/icing/index/index-processor_benchmark.cc
@@ -31,6 +31,7 @@
#include "icing/transform/normalizer-factory.h"
#include "icing/transform/normalizer.h"
#include "icing/util/logging.h"
+#include "unicode/uloc.h"
// Run on a Linux workstation:
// $ blaze build -c opt --dynamic_mode=off --copt=-gmlt
@@ -192,8 +193,9 @@ void BM_IndexDocumentWithOneProperty(benchmark::State& state) {
CleanUp(filesystem, index_dir);
std::unique_ptr<Index> index = CreateIndex(filesystem, index_dir);
+ language_segmenter_factory::SegmenterOptions options(ULOC_US);
std::unique_ptr<LanguageSegmenter> language_segmenter =
- language_segmenter_factory::Create().ValueOrDie();
+ language_segmenter_factory::Create(std::move(options)).ValueOrDie();
std::unique_ptr<Normalizer> normalizer = CreateNormalizer();
std::unique_ptr<SchemaStore> schema_store = CreateSchemaStore();
std::unique_ptr<IndexProcessor> index_processor =
@@ -239,8 +241,9 @@ void BM_IndexDocumentWithTenProperties(benchmark::State& state) {
CleanUp(filesystem, index_dir);
std::unique_ptr<Index> index = CreateIndex(filesystem, index_dir);
+ language_segmenter_factory::SegmenterOptions options(ULOC_US);
std::unique_ptr<LanguageSegmenter> language_segmenter =
- language_segmenter_factory::Create().ValueOrDie();
+ language_segmenter_factory::Create(std::move(options)).ValueOrDie();
std::unique_ptr<Normalizer> normalizer = CreateNormalizer();
std::unique_ptr<SchemaStore> schema_store = CreateSchemaStore();
std::unique_ptr<IndexProcessor> index_processor =
@@ -287,8 +290,9 @@ void BM_IndexDocumentWithDiacriticLetters(benchmark::State& state) {
CleanUp(filesystem, index_dir);
std::unique_ptr<Index> index = CreateIndex(filesystem, index_dir);
+ language_segmenter_factory::SegmenterOptions options(ULOC_US);
std::unique_ptr<LanguageSegmenter> language_segmenter =
- language_segmenter_factory::Create().ValueOrDie();
+ language_segmenter_factory::Create(std::move(options)).ValueOrDie();
std::unique_ptr<Normalizer> normalizer = CreateNormalizer();
std::unique_ptr<SchemaStore> schema_store = CreateSchemaStore();
std::unique_ptr<IndexProcessor> index_processor =
@@ -335,8 +339,9 @@ void BM_IndexDocumentWithHiragana(benchmark::State& state) {
CleanUp(filesystem, index_dir);
std::unique_ptr<Index> index = CreateIndex(filesystem, index_dir);
+ language_segmenter_factory::SegmenterOptions options(ULOC_US);
std::unique_ptr<LanguageSegmenter> language_segmenter =
- language_segmenter_factory::Create().ValueOrDie();
+ language_segmenter_factory::Create(std::move(options)).ValueOrDie();
std::unique_ptr<Normalizer> normalizer = CreateNormalizer();
std::unique_ptr<SchemaStore> schema_store = CreateSchemaStore();
std::unique_ptr<IndexProcessor> index_processor =
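
Each benchmark gets the same mechanical update: language_segmenter_factory::Create() no longer takes zero arguments but a SegmenterOptions carrying the locale, which is why "unicode/uloc.h" is now included for ULOC_US. The new call shape, in isolation:

// Before: language_segmenter_factory::Create().ValueOrDie();
// After: the locale travels explicitly through SegmenterOptions.
language_segmenter_factory::SegmenterOptions options(ULOC_US);
std::unique_ptr<LanguageSegmenter> segmenter =
    language_segmenter_factory::Create(std::move(options)).ValueOrDie();
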
diff --git a/icing/index/index-processor_test.cc b/icing/index/index-processor_test.cc
index 8dfb9c2..824c440 100644
--- a/icing/index/index-processor_test.cc
+++ b/icing/index/index-processor_test.cc
@@ -47,6 +47,7 @@
#include "icing/tokenization/language-segmenter.h"
#include "icing/transform/normalizer-factory.h"
#include "icing/transform/normalizer.h"
+#include "unicode/uloc.h"
namespace icing {
namespace lib {
@@ -91,8 +92,10 @@ class IndexProcessorTest : public Test {
ICING_ASSERT_OK_AND_ASSIGN(index_,
Index::Create(options, &icing_filesystem_));
- ICING_ASSERT_OK_AND_ASSIGN(lang_segmenter_,
- language_segmenter_factory::Create());
+ language_segmenter_factory::SegmenterOptions segmenter_options(ULOC_US);
+ ICING_ASSERT_OK_AND_ASSIGN(
+ lang_segmenter_,
+ language_segmenter_factory::Create(std::move(segmenter_options)));
ICING_ASSERT_OK_AND_ASSIGN(
normalizer_,
diff --git a/icing/index/index.cc b/icing/index/index.cc
index d4a2508..0b014d9 100644
--- a/icing/index/index.cc
+++ b/icing/index/index.cc
@@ -24,8 +24,8 @@
#include "icing/absl_ports/canonical_errors.h"
#include "icing/absl_ports/str_cat.h"
#include "icing/index/hit/hit.h"
-#include "icing/index/iterator/doc-hit-info-iterator-term.h"
#include "icing/index/iterator/doc-hit-info-iterator.h"
+#include "icing/index/lite/doc-hit-info-iterator-term-lite.h"
#include "icing/index/lite/lite-index.h"
#include "icing/index/term-id-codec.h"
#include "icing/index/term-property-id.h"
@@ -102,10 +102,10 @@ Index::GetIterator(const std::string& term, SectionIdMask section_id_mask,
TermMatchType::Code term_match_type) {
switch (term_match_type) {
case TermMatchType::EXACT_ONLY:
- return std::make_unique<DocHitInfoIteratorTermExact>(
+ return std::make_unique<DocHitInfoIteratorTermLiteExact>(
term_id_codec_.get(), lite_index_.get(), term, section_id_mask);
case TermMatchType::PREFIX:
- return std::make_unique<DocHitInfoIteratorTermPrefix>(
+ return std::make_unique<DocHitInfoIteratorTermLitePrefix>(
term_id_codec_.get(), lite_index_.get(), term, section_id_mask);
default:
return absl_ports::InvalidArgumentError(
@@ -159,13 +159,18 @@ libtextclassifier3::Status Index::Editor::AddHit(const char* term,
Hit::Score score) {
// Step 1: See if this term is already in the lexicon
uint32_t tvi;
- auto tvi_or = lite_index_->FindTerm(term);
+ auto tvi_or = lite_index_->GetTermId(term);
// Step 2: Update the lexicon, either add the term or update its properties
if (tvi_or.ok()) {
+ tvi = tvi_or.ValueOrDie();
+ if (seen_tokens_.find(tvi) != seen_tokens_.end()) {
+ ICING_VLOG(1) << "A hit for term " << term
+ << " has already been added. Skipping.";
+ return libtextclassifier3::Status::OK;
+ }
ICING_VLOG(1) << "Term " << term
<< " is already present in lexicon. Updating.";
- tvi = tvi_or.ValueOrDie();
// Already in the lexicon. Just update the properties.
ICING_RETURN_IF_ERROR(lite_index_->UpdateTermProperties(
tvi, term_match_type_ == TermMatchType::PREFIX, namespace_id_));
@@ -175,6 +180,7 @@ libtextclassifier3::Status Index::Editor::AddHit(const char* term,
ICING_ASSIGN_OR_RETURN(
tvi, lite_index_->InsertTerm(term, term_match_type_, namespace_id_));
}
+ seen_tokens_.insert(tvi);
// Step 3: Add the hit itself
Hit hit(section_id_, document_id_, score,
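
The AddHit() change makes indexing idempotent per term within one Editor: the term's tvi is remembered in seen_tokens_, and any later hit for the same term in the same document edit returns OK without touching the hit buffer (the checksum test below relies on exactly this). A reduced sketch of the pattern, with names invented for illustration:

// Illustrative only: skip duplicate tokens while indexing one document.
std::unordered_set<uint32_t> seen_tokens;
bool FirstSighting(uint32_t tvi, std::unordered_set<uint32_t>* seen) {
  // insert().second is false when tvi was already present, i.e. a hit
  // for this term has already been added for this document.
  return seen->insert(tvi).second;
}
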
diff --git a/icing/index/index_test.cc b/icing/index/index_test.cc
index 070e82a..f7ca285 100644
--- a/icing/index/index_test.cc
+++ b/icing/index/index_test.cc
@@ -37,6 +37,7 @@
#include "icing/testing/common-matchers.h"
#include "icing/testing/random-string.h"
#include "icing/testing/tmp-directory.h"
+#include "icing/util/crc32.h"
namespace icing {
namespace lib {
@@ -48,6 +49,7 @@ using ::testing::Eq;
using ::testing::Gt;
using ::testing::IsEmpty;
using ::testing::IsTrue;
+using ::testing::Ne;
using ::testing::NiceMock;
using ::testing::Not;
using ::testing::SizeIs;
@@ -255,11 +257,16 @@ TEST_F(IndexTest, MultiHitSectionRestrict) {
}
TEST_F(IndexTest, SingleHitDedupeIndex) {
+ Crc32 empty_crc = index_->ComputeChecksum();
// Act
Index::Editor edit = index_->Edit(
kDocumentId0, kSectionId2, TermMatchType::EXACT_ONLY, /*namespace_id=*/0);
EXPECT_THAT(edit.AddHit("foo"), IsOk());
+ Crc32 first_hit_crc = index_->ComputeChecksum();
+ EXPECT_THAT(first_hit_crc.Get(), Ne(empty_crc.Get()));
EXPECT_THAT(edit.AddHit("foo"), IsOk());
+ Crc32 second_hit_crc = index_->ComputeChecksum();
+ EXPECT_THAT(second_hit_crc.Get(), Eq(first_hit_crc.Get()));
// Assert
ICING_ASSERT_OK_AND_ASSIGN(
diff --git a/icing/index/iterator/doc-hit-info-iterator-filter.cc b/icing/index/iterator/doc-hit-info-iterator-filter.cc
index 482a5ab..c6cb86d 100644
--- a/icing/index/iterator/doc-hit-info-iterator-filter.cc
+++ b/icing/index/iterator/doc-hit-info-iterator-filter.cc
@@ -82,12 +82,10 @@ libtextclassifier3::Status DocHitInfoIteratorFilter::Advance() {
"Couldn't get current time. Try again in a bit");
}
- if (options_.filter_deleted) {
- if (!document_store_.DoesDocumentExist(
- delegate_->doc_hit_info().document_id())) {
- // Document doesn't exist, keep searching
- return Advance();
- }
+ if (!document_store_.DoesDocumentExist(
+ delegate_->doc_hit_info().document_id())) {
+ // Document doesn't exist, keep searching
+ return Advance();
}
// Try to get the DocumentFilterData
diff --git a/icing/index/iterator/doc-hit-info-iterator-filter.h b/icing/index/iterator/doc-hit-info-iterator-filter.h
index bf027e4..9119610 100644
--- a/icing/index/iterator/doc-hit-info-iterator-filter.h
+++ b/icing/index/iterator/doc-hit-info-iterator-filter.h
@@ -37,10 +37,6 @@ namespace lib {
class DocHitInfoIteratorFilter : public DocHitInfoIterator {
public:
struct Options {
- // Filter out/don't return DocHitInfos that are associated with nonexistent
- // Documents.
- bool filter_deleted = true;
-
// List of namespaces that documents must have. An empty vector means that
// all namespaces are valid, and no documents will be filtered out.
//
diff --git a/icing/index/iterator/doc-hit-info-iterator-filter_test.cc b/icing/index/iterator/doc-hit-info-iterator-filter_test.cc
index e769013..9eb147a 100644
--- a/icing/index/iterator/doc-hit-info-iterator-filter_test.cc
+++ b/icing/index/iterator/doc-hit-info-iterator-filter_test.cc
@@ -105,33 +105,6 @@ TEST_F(DocHitInfoIteratorDeletedFilterTest, EmptyOriginalIterator) {
EXPECT_THAT(GetDocumentIds(&filtered_iterator), IsEmpty());
}
-TEST_F(DocHitInfoIteratorDeletedFilterTest, TurnOffDeletedFilterOk) {
- ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id1,
- document_store_->Put(test_document1_));
- ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id2,
- document_store_->Put(test_document2_));
- ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id3,
- document_store_->Put(test_document3_));
-
- // Deletes test document 2
- ICING_ASSERT_OK(document_store_->Delete(test_document2_.namespace_(),
- test_document2_.uri()));
-
- std::vector<DocHitInfo> doc_hit_infos = {DocHitInfo(document_id1),
- DocHitInfo(document_id2),
- DocHitInfo(document_id3)};
- std::unique_ptr<DocHitInfoIterator> original_iterator =
- std::make_unique<DocHitInfoIteratorDummy>(doc_hit_infos);
-
- options_.filter_deleted = false;
- DocHitInfoIteratorFilter filtered_iterator(
- std::move(original_iterator), document_store_.get(), schema_store_.get(),
- &fake_clock_, options_);
-
- EXPECT_THAT(GetDocumentIds(&filtered_iterator),
- ElementsAre(document_id1, document_id2, document_id3));
-}
-
TEST_F(DocHitInfoIteratorDeletedFilterTest, DeletedDocumentsAreFiltered) {
ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id1,
document_store_->Put(test_document1_));
diff --git a/icing/index/iterator/doc-hit-info-iterator-term.cc b/icing/index/lite/doc-hit-info-iterator-term-lite.cc
index 97ca3c4..1f1c296 100644
--- a/icing/index/iterator/doc-hit-info-iterator-term.cc
+++ b/icing/index/lite/doc-hit-info-iterator-term-lite.cc
@@ -12,7 +12,7 @@
// See the License for the specific language governing permissions and
// limitations under the License.
-#include "icing/index/iterator/doc-hit-info-iterator-term.h"
+#include "icing/index/lite/doc-hit-info-iterator-term-lite.h"
#include <cstdint>
@@ -40,7 +40,7 @@ std::string SectionIdMaskToString(SectionIdMask section_id_mask) {
} // namespace
-libtextclassifier3::Status DocHitInfoIteratorTerm::Advance() {
+libtextclassifier3::Status DocHitInfoIteratorTermLite::Advance() {
if (cached_hits_idx_ == -1) {
ICING_RETURN_IF_ERROR(RetrieveMoreHits());
} else {
@@ -59,9 +59,9 @@ libtextclassifier3::Status DocHitInfoIteratorTerm::Advance() {
return libtextclassifier3::Status::OK;
}
-libtextclassifier3::Status DocHitInfoIteratorTermExact::RetrieveMoreHits() {
+libtextclassifier3::Status DocHitInfoIteratorTermLiteExact::RetrieveMoreHits() {
// Exact match only. All hits in lite lexicon are exact.
- ICING_ASSIGN_OR_RETURN(uint32_t tvi, lite_index_->FindTerm(term_));
+ ICING_ASSIGN_OR_RETURN(uint32_t tvi, lite_index_->GetTermId(term_));
ICING_ASSIGN_OR_RETURN(uint32_t term_id,
term_id_codec_->EncodeTvi(tvi, TviType::LITE));
lite_index_->AppendHits(term_id, section_restrict_mask_,
@@ -70,12 +70,13 @@ libtextclassifier3::Status DocHitInfoIteratorTermExact::RetrieveMoreHits() {
return libtextclassifier3::Status::OK;
}
-std::string DocHitInfoIteratorTermExact::ToString() const {
+std::string DocHitInfoIteratorTermLiteExact::ToString() const {
return absl_ports::StrCat(SectionIdMaskToString(section_restrict_mask_), ":",
term_);
}
-libtextclassifier3::Status DocHitInfoIteratorTermPrefix::RetrieveMoreHits() {
+libtextclassifier3::Status
+DocHitInfoIteratorTermLitePrefix::RetrieveMoreHits() {
// Take union of lite terms.
int term_len = term_.length();
int terms_matched = 0;
@@ -97,7 +98,7 @@ libtextclassifier3::Status DocHitInfoIteratorTermPrefix::RetrieveMoreHits() {
return libtextclassifier3::Status::OK;
}
-void DocHitInfoIteratorTermPrefix::SortAndDedupeDocumentIds() {
+void DocHitInfoIteratorTermLitePrefix::SortAndDedupeDocumentIds() {
// Re-sort cached document_ids and merge sections.
sort(cached_hits_.begin(), cached_hits_.end());
@@ -116,7 +117,7 @@ void DocHitInfoIteratorTermPrefix::SortAndDedupeDocumentIds() {
cached_hits_.resize(idx + 1);
}
-std::string DocHitInfoIteratorTermPrefix::ToString() const {
+std::string DocHitInfoIteratorTermLitePrefix::ToString() const {
return absl_ports::StrCat(SectionIdMaskToString(section_restrict_mask_), ":",
term_, "*");
}
diff --git a/icing/index/iterator/doc-hit-info-iterator-term.h b/icing/index/lite/doc-hit-info-iterator-term-lite.h
index 21d1dd6..bd2de6d 100644
--- a/icing/index/iterator/doc-hit-info-iterator-term.h
+++ b/icing/index/lite/doc-hit-info-iterator-term-lite.h
@@ -12,8 +12,8 @@
// See the License for the specific language governing permissions and
// limitations under the License.
-#ifndef ICING_INDEX_ITERATOR_DOC_HIT_INFO_ITERATOR_TERM_H_
-#define ICING_INDEX_ITERATOR_DOC_HIT_INFO_ITERATOR_TERM_H_
+#ifndef ICING_INDEX_ITERATOR_DOC_HIT_INFO_ITERATOR_TERM_LITE_H_
+#define ICING_INDEX_ITERATOR_DOC_HIT_INFO_ITERATOR_TERM_LITE_H_
#include <cstdint>
#include <vector>
@@ -28,11 +28,12 @@
namespace icing {
namespace lib {
-class DocHitInfoIteratorTerm : public DocHitInfoIterator {
+class DocHitInfoIteratorTermLite : public DocHitInfoIterator {
public:
- explicit DocHitInfoIteratorTerm(const TermIdCodec* term_id_codec,
- LiteIndex* lite_index, const std::string term,
- SectionIdMask section_restrict_mask)
+ explicit DocHitInfoIteratorTermLite(const TermIdCodec* term_id_codec,
+ LiteIndex* lite_index,
+ const std::string& term,
+ SectionIdMask section_restrict_mask)
: term_(term),
lite_index_(lite_index),
cached_hits_idx_(-1),
@@ -66,14 +67,14 @@ class DocHitInfoIteratorTerm : public DocHitInfoIterator {
const SectionIdMask section_restrict_mask_;
};
-class DocHitInfoIteratorTermExact : public DocHitInfoIteratorTerm {
+class DocHitInfoIteratorTermLiteExact : public DocHitInfoIteratorTermLite {
public:
- explicit DocHitInfoIteratorTermExact(const TermIdCodec* term_id_codec,
- LiteIndex* lite_index,
- const std::string& term,
- SectionIdMask section_id_mask)
- : DocHitInfoIteratorTerm(term_id_codec, lite_index, term,
- section_id_mask) {}
+ explicit DocHitInfoIteratorTermLiteExact(const TermIdCodec* term_id_codec,
+ LiteIndex* lite_index,
+ const std::string& term,
+ SectionIdMask section_id_mask)
+ : DocHitInfoIteratorTermLite(term_id_codec, lite_index, term,
+ section_id_mask) {}
std::string ToString() const override;
@@ -81,14 +82,14 @@ class DocHitInfoIteratorTermExact : public DocHitInfoIteratorTerm {
libtextclassifier3::Status RetrieveMoreHits() override;
};
-class DocHitInfoIteratorTermPrefix : public DocHitInfoIteratorTerm {
+class DocHitInfoIteratorTermLitePrefix : public DocHitInfoIteratorTermLite {
public:
- explicit DocHitInfoIteratorTermPrefix(const TermIdCodec* term_id_codec,
- LiteIndex* lite_index,
- const std::string& term,
- SectionIdMask section_id_mask)
- : DocHitInfoIteratorTerm(term_id_codec, lite_index, term,
- section_id_mask) {}
+ explicit DocHitInfoIteratorTermLitePrefix(const TermIdCodec* term_id_codec,
+ LiteIndex* lite_index,
+ const std::string& term,
+ SectionIdMask section_id_mask)
+ : DocHitInfoIteratorTermLite(term_id_codec, lite_index, term,
+ section_id_mask) {}
std::string ToString() const override;
@@ -105,4 +106,4 @@ class DocHitInfoIteratorTermPrefix : public DocHitInfoIteratorTerm {
} // namespace lib
} // namespace icing
-#endif // ICING_INDEX_ITERATOR_DOC_HIT_INFO_ITERATOR_TERM_H_
+#endif // ICING_INDEX_ITERATOR_DOC_HIT_INFO_ITERATOR_TERM_LITE_H_
diff --git a/icing/index/lite/lite-index.cc b/icing/index/lite/lite-index.cc
index a72402e..89240ee 100644
--- a/icing/index/lite/lite-index.cc
+++ b/icing/index/lite/lite-index.cc
@@ -65,8 +65,8 @@ size_t header_size() { return sizeof(IcingLiteIndex_HeaderImpl::HeaderData); }
} // namespace
-const LiteIndex::Element::Value LiteIndex::Element::kInvalidValue =
- LiteIndex::Element(0, Hit()).value();
+const TermIdHitPair::Value TermIdHitPair::kInvalidValue =
+ TermIdHitPair(0, Hit()).value();
libtextclassifier3::StatusOr<std::unique_ptr<LiteIndex>> LiteIndex::Create(
const LiteIndex::Options& options, const IcingFilesystem* filesystem) {
@@ -163,7 +163,7 @@ libtextclassifier3::Status LiteIndex::Initialize() {
header_->Reset();
if (!hit_buffer_.Init(hit_buffer_fd_.get(), header_padded_size, true,
- sizeof(Element::Value), header_->cur_size(),
+ sizeof(TermIdHitPair::Value), header_->cur_size(),
options_.hit_buffer_size, &hit_buffer_crc_, true)) {
status = absl_ports::InternalError("Failed to initialize new hit buffer");
goto error;
@@ -177,7 +177,7 @@ libtextclassifier3::Status LiteIndex::Initialize() {
header_mmap_.address()));
if (!hit_buffer_.Init(hit_buffer_fd_.get(), header_padded_size, true,
- sizeof(Element::Value), header_->cur_size(),
+ sizeof(TermIdHitPair::Value), header_->cur_size(),
options_.hit_buffer_size, &hit_buffer_crc_, true)) {
status = absl_ports::InternalError(
"Failed to re-initialize existing hit buffer");
@@ -312,20 +312,21 @@ libtextclassifier3::Status LiteIndex::AddHit(uint32_t term_id, const Hit& hit) {
header_->set_last_added_docid(hit.document_id());
- Element elt(term_id, hit);
+ TermIdHitPair term_id_hit_pair(term_id, hit);
uint32_t cur_size = header_->cur_size();
- Element::Value* valp = hit_buffer_.GetMutableMem<Element::Value>(cur_size, 1);
+ TermIdHitPair::Value* valp =
+ hit_buffer_.GetMutableMem<TermIdHitPair::Value>(cur_size, 1);
if (valp == nullptr) {
return absl_ports::ResourceExhaustedError(
"Allocating more space in hit buffer failed!");
}
- *valp = elt.value();
+ *valp = term_id_hit_pair.value();
header_->set_cur_size(cur_size + 1);
return libtextclassifier3::Status::OK;
}
-libtextclassifier3::StatusOr<uint32_t> LiteIndex::FindTerm(
+libtextclassifier3::StatusOr<uint32_t> LiteIndex::GetTermId(
const std::string& term) const {
char dummy;
uint32_t tvi;
@@ -336,16 +337,17 @@ libtextclassifier3::StatusOr<uint32_t> LiteIndex::FindTerm(
return tvi;
}
-uint32_t LiteIndex::AppendHits(uint32_t term_id, SectionIdMask section_id_mask,
- bool only_from_prefix_sections,
- std::vector<DocHitInfo>* hits_out) {
- uint32_t count = 0;
+int LiteIndex::AppendHits(uint32_t term_id, SectionIdMask section_id_mask,
+ bool only_from_prefix_sections,
+ std::vector<DocHitInfo>* hits_out) {
+ int count = 0;
DocumentId last_document_id = kInvalidDocumentId;
for (uint32_t idx = Seek(term_id); idx < header_->cur_size(); idx++) {
- Element elt(hit_buffer_.array_cast<Element>()[idx]);
- if (elt.term_id() != term_id) break;
+ TermIdHitPair term_id_hit_pair(
+ hit_buffer_.array_cast<TermIdHitPair>()[idx]);
+ if (term_id_hit_pair.term_id() != term_id) break;
- const Hit& hit = elt.hit();
+ const Hit& hit = term_id_hit_pair.hit();
// Check sections.
if (((1u << hit.section_id()) & section_id_mask) == 0) {
continue;
@@ -356,7 +358,7 @@ uint32_t LiteIndex::AppendHits(uint32_t term_id, SectionIdMask section_id_mask,
}
DocumentId document_id = hit.document_id();
if (document_id != last_document_id) {
- count++;
+ ++count;
if (hits_out != nullptr) {
hits_out->push_back(DocHitInfo(document_id));
}
@@ -369,7 +371,7 @@ uint32_t LiteIndex::AppendHits(uint32_t term_id, SectionIdMask section_id_mask,
return count;
}
-uint32_t LiteIndex::CountHits(uint32_t term_id) {
+int LiteIndex::CountHits(uint32_t term_id) {
return AppendHits(term_id, kSectionIdMaskAll,
/*only_from_prefix_sections=*/false,
/*hits_out=*/nullptr);
@@ -421,8 +423,8 @@ uint32_t LiteIndex::Seek(uint32_t term_id) {
IcingTimer timer;
auto* array_start =
- hit_buffer_.GetMutableMem<Element::Value>(0, header_->cur_size());
- Element::Value* sort_start = array_start + header_->searchable_end();
+ hit_buffer_.GetMutableMem<TermIdHitPair::Value>(0, header_->cur_size());
+ TermIdHitPair::Value* sort_start = array_start + header_->searchable_end();
std::sort(sort_start, array_start + header_->cur_size());
// Now merge with previous region. Since the previous region is already
@@ -445,11 +447,13 @@ uint32_t LiteIndex::Seek(uint32_t term_id) {
// Binary search for our term_id. Make sure we get the first
// element. Using kBeginSortValue ensures this for the hit value.
- Element elt(term_id, Hit(Hit::kMaxDocumentIdSortValue, Hit::kMaxHitScore));
+ TermIdHitPair term_id_hit_pair(
+ term_id, Hit(Hit::kMaxDocumentIdSortValue, Hit::kMaxHitScore));
- const Element::Value* array = hit_buffer_.array_cast<Element::Value>();
- const Element::Value* ptr =
- std::lower_bound(array, array + header_->cur_size(), elt.value());
+ const TermIdHitPair::Value* array =
+ hit_buffer_.array_cast<TermIdHitPair::Value>();
+ const TermIdHitPair::Value* ptr = std::lower_bound(
+ array, array + header_->cur_size(), term_id_hit_pair.value());
return ptr - array;
}
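
Seek() works because TermIdHitPair keeps the term id in the most significant bits: sorting the hit buffer by raw Value groups all pairs for one term contiguously, and std::lower_bound with a sentinel pair built from kMaxDocumentIdSortValue lands on the first of them. A reduced sketch of that search over an already-sorted buffer:

// Illustrative: first index holding hits for term_id in a sorted buffer.
uint32_t SeekFirst(const std::vector<TermIdHitPair::Value>& sorted_values,
                   uint32_t term_id) {
  TermIdHitPair sentinel(
      term_id, Hit(Hit::kMaxDocumentIdSortValue, Hit::kMaxHitScore));
  auto it = std::lower_bound(sorted_values.begin(), sorted_values.end(),
                             sentinel.value());
  return it - sorted_values.begin();
}
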
diff --git a/icing/index/lite/lite-index.h b/icing/index/lite/lite-index.h
index b60a947..27ccf33 100644
--- a/icing/index/lite/lite-index.h
+++ b/icing/index/lite/lite-index.h
@@ -30,6 +30,7 @@
#include "icing/file/filesystem.h"
#include "icing/index/hit/doc-hit-info.h"
#include "icing/index/hit/hit.h"
+#include "icing/index/lite/term-id-hit-pair.h"
#include "icing/legacy/index/icing-array-storage.h"
#include "icing/legacy/index/icing-dynamic-trie.h"
#include "icing/legacy/index/icing-filesystem.h"
@@ -49,49 +50,6 @@ namespace lib {
class LiteIndex {
public:
// An entry in the hit buffer.
- class Element {
- public:
- // Layout bits: 24 termid + 32 hit value + 8 hit score.
- using Value = uint64_t;
-
- static constexpr int kTermIdBits = 24;
- static constexpr int kHitValueBits = sizeof(Hit::Value) * 8;
- static constexpr int kHitScoreBits = sizeof(Hit::Score) * 8;
-
- static const Value kInvalidValue;
-
- explicit Element(Value v = kInvalidValue) : value_(v) {}
-
- Element(uint32_t term_id, const Hit& hit) {
- static_assert(
- kTermIdBits + kHitValueBits + kHitScoreBits <= sizeof(Value) * 8,
- "LiteIndexElementTooBig");
-
- value_ = 0;
- // Term id goes into the most significant bits because it takes
- // precedent in sorts.
- bit_util::BitfieldSet(term_id, kHitValueBits + kHitScoreBits, kTermIdBits,
- &value_);
- bit_util::BitfieldSet(hit.value(), kHitScoreBits, kHitValueBits, &value_);
- bit_util::BitfieldSet(hit.score(), 0, kHitScoreBits, &value_);
- }
-
- uint32_t term_id() const {
- return bit_util::BitfieldGet(value_, kHitValueBits + kHitScoreBits,
- kTermIdBits);
- }
-
- Hit hit() const {
- return Hit(bit_util::BitfieldGet(value_, kHitScoreBits, kHitValueBits),
- bit_util::BitfieldGet(value_, 0, kHitScoreBits));
- }
-
- Value value() const { return value_; }
-
- private:
- Value value_;
- };
-
using Options = IcingLiteIndexOptions;
// Updates checksum of subcomponents.
@@ -126,7 +84,7 @@ class LiteIndex {
Crc32 ComputeChecksum();
// Returns term_id if term found, NOT_FOUND otherwise.
- libtextclassifier3::StatusOr<uint32_t> FindTerm(
+ libtextclassifier3::StatusOr<uint32_t> GetTermId(
const std::string& term) const;
// Returns an iterator for all terms for which 'prefix' is a prefix.
@@ -170,25 +128,89 @@ class LiteIndex {
NamespaceId namespace_id);
// Append hit to buffer. term_id must be encoded using the same term_id_codec
- // supplied to the index constructor. Returns non-OK if hit cannot be added
- // (either due to hit buffer or file system capacity reached).
+ // supplied to the index constructor.
+ // RETURNS:
+ // - OK if hit was successfully added
+ // - RESOURCE_EXHAUSTED if hit could not be added (either due to hit buffer
+ // or file system capacity reached).
libtextclassifier3::Status AddHit(uint32_t term_id, const Hit& hit);
// Add all hits with term_id from the sections specified in section_id_mask,
// skipping hits in non-prefix sections if only_from_prefix_sections is true,
- // to hits_out.
- uint32_t AppendHits(uint32_t term_id, SectionIdMask section_id_mask,
- bool only_from_prefix_sections,
- std::vector<DocHitInfo>* hits_out);
+ // to hits_out. If hits_out is nullptr, no hits will be added.
+ //
+ // Returns the number of hits that would be added to hits_out.
+ int AppendHits(uint32_t term_id, SectionIdMask section_id_mask,
+ bool only_from_prefix_sections,
+ std::vector<DocHitInfo>* hits_out);
// Returns the hit count of the term.
- uint32_t CountHits(uint32_t term_id);
+ int CountHits(uint32_t term_id);
// Check if buffer has reached its capacity.
bool is_full() const;
+ bool empty() const { return size() == 0; }
+
+ uint32_t size() const { return header_->cur_size(); }
+
+ class const_iterator {
+ friend class LiteIndex;
+
+ public:
+ using iterator_category = std::forward_iterator_tag;
+ using value_type = TermIdHitPair;
+ using reference = const value_type&;
+ using pointer = const value_type*;
+
+ const_iterator() : const_iterator(nullptr, -1, -1) {}
+
+ reference operator*() const { return start_[position_]; }
+
+ pointer operator->() const { return start_ + position_; }
+
+ const_iterator& operator++() {
+ if (++position_ >= end_position_) {
+ start_ = nullptr;
+ position_ = -1;
+ end_position_ = -1;
+ }
+ return *this;
+ }
+
+ const_iterator operator++(int) {
+ auto tmp = *this;
+ ++*this;
+ return tmp;
+ }
+
+ bool operator!=(const const_iterator& rhs) { return !(*this == rhs); }
+
+ bool operator==(const const_iterator& rhs) {
+ return start_ == rhs.start_ && position_ == rhs.position_;
+ }
+
+ private:
+ explicit const_iterator(const TermIdHitPair* start, int position,
+ int end_position)
+ : start_(start), position_(position), end_position_(end_position) {}
+
+ const TermIdHitPair* start_;
+ int position_;
+ int end_position_;
+ };
+
+ const_iterator begin() const {
+ // If the LiteIndex is empty, just return end().
+ return empty() ? end()
+ : const_iterator(hit_buffer_.array_cast<TermIdHitPair>(), 0,
+ header_->cur_size());
+ }
+
+ const_iterator end() const { return const_iterator(); }
+
constexpr static uint32_t max_hit_buffer_size() {
- return std::numeric_limits<uint32_t>::max() / sizeof(LiteIndex::Element);
+ return std::numeric_limits<uint32_t>::max() / sizeof(TermIdHitPair);
}
// We keep track of the last added document_id. This is always the largest
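
The new begin()/end() pair exposes the hit buffer as a forward range of TermIdHitPair, so callers (such as the main-index merger added elsewhere in this change) can walk every buffered hit without reaching into LiteIndex internals. A usage sketch:

// Illustrative: tally buffered hits per term id.
std::unordered_map<uint32_t, int> hits_per_term;
for (LiteIndex::const_iterator it = lite_index->begin();
     it != lite_index->end(); ++it) {
  ++hits_per_term[it->term_id()];
}
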
diff --git a/icing/index/lite/term-id-hit-pair.h b/icing/index/lite/term-id-hit-pair.h
new file mode 100644
index 0000000..191f766
--- /dev/null
+++ b/icing/index/lite/term-id-hit-pair.h
@@ -0,0 +1,80 @@
+// Copyright (C) 2019 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef ICING_INDEX_TERM_ID_HIT_PAIR_H_
+#define ICING_INDEX_TERM_ID_HIT_PAIR_H_
+
+#include <cstdint>
+#include <limits>
+#include <memory>
+#include <string>
+#include <vector>
+
+#include "icing/index/hit/hit.h"
+#include "icing/util/bit-util.h"
+
+namespace icing {
+namespace lib {
+
+class TermIdHitPair {
+ public:
+ // Layout bits: 24 termid + 32 hit value + 8 hit score.
+ using Value = uint64_t;
+
+ static constexpr int kTermIdBits = 24;
+ static constexpr int kHitValueBits = sizeof(Hit::Value) * 8;
+ static constexpr int kHitScoreBits = sizeof(Hit::Score) * 8;
+
+ static const Value kInvalidValue;
+
+ explicit TermIdHitPair(Value v = kInvalidValue) : value_(v) {}
+
+ TermIdHitPair(uint32_t term_id, const Hit& hit) {
+ static_assert(
+ kTermIdBits + kHitValueBits + kHitScoreBits <= sizeof(Value) * 8,
+ "TermIdHitPairTooBig");
+
+ value_ = 0;
+ // Term id goes into the most significant bits because it takes
+ // precedence in sorts.
+ bit_util::BitfieldSet(term_id, kHitValueBits + kHitScoreBits, kTermIdBits,
+ &value_);
+ bit_util::BitfieldSet(hit.value(), kHitScoreBits, kHitValueBits, &value_);
+ bit_util::BitfieldSet(hit.score(), 0, kHitScoreBits, &value_);
+ }
+
+ uint32_t term_id() const {
+ return bit_util::BitfieldGet(value_, kHitValueBits + kHitScoreBits,
+ kTermIdBits);
+ }
+
+ Hit hit() const {
+ return Hit(bit_util::BitfieldGet(value_, kHitScoreBits, kHitValueBits),
+ bit_util::BitfieldGet(value_, 0, kHitScoreBits));
+ }
+
+ Value value() const { return value_; }
+
+ bool operator==(const TermIdHitPair& rhs) const {
+ return value_ == rhs.value_;
+ }
+
+ private:
+ Value value_;
+};
+
+} // namespace lib
+} // namespace icing
+
+#endif // ICING_INDEX_TERM_ID_HIT_PAIR_H_
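
TermIdHitPair is the former LiteIndex::Element hoisted into its own header, unchanged in layout: 24 bits of term id above 32 bits of hit value above 8 bits of score, so ordering by the raw 64-bit Value orders by term id first. A round-trip sketch using only the API shown above:

// Illustrative round-trip through the 24/32/8 packing.
Hit hit(Hit::kMaxDocumentIdSortValue, Hit::kMaxHitScore);
TermIdHitPair pair(/*term_id=*/42, hit);
TermIdHitPair decoded(pair.value());
assert(decoded.term_id() == 42);               // top 24 bits
assert(decoded.hit().value() == hit.value());  // middle 32 bits
assert(decoded == pair);                       // same packed Value
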
diff --git a/icing/index/main/doc-hit-info-iterator-term-main.cc b/icing/index/main/doc-hit-info-iterator-term-main.cc
new file mode 100644
index 0000000..0640135
--- /dev/null
+++ b/icing/index/main/doc-hit-info-iterator-term-main.cc
@@ -0,0 +1,166 @@
+// Copyright (C) 2019 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "icing/index/main/doc-hit-info-iterator-term-main.h"
+
+#include <cstdint>
+#include <memory>
+
+#include "icing/text_classifier/lib3/utils/base/status.h"
+#include "icing/absl_ports/canonical_errors.h"
+#include "icing/absl_ports/str_cat.h"
+#include "icing/index/hit/doc-hit-info.h"
+#include "icing/index/main/posting-list-accessor.h"
+#include "icing/index/main/posting-list-identifier.h"
+#include "icing/legacy/core/icing-string-util.h"
+#include "icing/schema/section.h"
+#include "icing/store/document-id.h"
+#include "icing/util/status-macros.h"
+
+namespace icing {
+namespace lib {
+
+namespace {
+
+std::string SectionIdMaskToString(SectionIdMask section_id_mask) {
+ std::string mask(kMaxSectionId + 1, '0');
+ for (SectionId i = kMaxSectionId; i >= 0; --i) {
+ if (section_id_mask & (1U << i)) {
+ mask[kMaxSectionId - i] = '1';
+ }
+ }
+ return mask;
+}
+
+} // namespace
+
+libtextclassifier3::Status DocHitInfoIteratorTermMain::Advance() {
+ if (posting_list_accessor_ == nullptr ||
+ cached_doc_hit_infos_idx_ == (cached_doc_hit_infos_.size() - 2)) {
+ // If we haven't retrieved any hits before or we've already returned all but
+ // the last cached hit, then go get some more!
+ // We hold back the last cached hit because its document could have more
+ // hits on the next posting list in the chain.
+ ICING_RETURN_IF_ERROR(RetrieveMoreHits());
+ } else {
+ ++cached_doc_hit_infos_idx_;
+ }
+ if (cached_doc_hit_infos_idx_ == -1 ||
+ cached_doc_hit_infos_idx_ >= cached_doc_hit_infos_.size()) {
+ // Nothing more for the iterator to return. Set these members to invalid
+ // values.
+ doc_hit_info_ = DocHitInfo();
+ hit_intersect_section_ids_mask_ = kSectionIdMaskNone;
+ return absl_ports::ResourceExhaustedError(
+ "No more DocHitInfos in iterator");
+ }
+ doc_hit_info_ = cached_doc_hit_infos_.at(cached_doc_hit_infos_idx_);
+ hit_intersect_section_ids_mask_ = doc_hit_info_.hit_section_ids_mask();
+ return libtextclassifier3::Status::OK;
+}
+
+libtextclassifier3::Status DocHitInfoIteratorTermMainExact::RetrieveMoreHits() {
+ DocHitInfo last_doc_hit_info;
+ if (!cached_doc_hit_infos_.empty()) {
+ last_doc_hit_info = cached_doc_hit_infos_.back();
+ }
+ cached_doc_hit_infos_idx_ = 0;
+ cached_doc_hit_infos_.clear();
+ if (last_doc_hit_info.document_id() != kInvalidDocumentId) {
+ // Carry over the last hit. It might need to be merged with the first hit
+ // of the next posting list in the chain.
+ cached_doc_hit_infos_.push_back(last_doc_hit_info);
+ }
+ if (posting_list_accessor_ == nullptr) {
+ ICING_ASSIGN_OR_RETURN(posting_list_accessor_,
+ main_index_->GetAccessorForExactTerm(term_));
+ }
+
+ ICING_ASSIGN_OR_RETURN(std::vector<Hit> hits,
+ posting_list_accessor_->GetNextHitsBatch());
+ ++num_blocks_inspected_;
+ cached_doc_hit_infos_.reserve(hits.size() + 1);
+ for (const Hit& hit : hits) {
+ // Check sections.
+ if (((1u << hit.section_id()) & section_restrict_mask_) == 0) {
+ continue;
+ }
+ // We want exact hits; skip prefix-only hits.
+ if (hit.is_prefix_hit()) {
+ continue;
+ }
+ if (cached_doc_hit_infos_.empty() ||
+ hit.document_id() != cached_doc_hit_infos_.back().document_id()) {
+ cached_doc_hit_infos_.push_back(DocHitInfo(hit.document_id()));
+ }
+ cached_doc_hit_infos_.back().UpdateSection(hit.section_id(), hit.score());
+ }
+ return libtextclassifier3::Status::OK;
+}
+
+std::string DocHitInfoIteratorTermMainExact::ToString() const {
+ return absl_ports::StrCat(SectionIdMaskToString(section_restrict_mask_), ":",
+ term_);
+}
+
+libtextclassifier3::Status
+DocHitInfoIteratorTermMainPrefix::RetrieveMoreHits() {
+ DocHitInfo last_doc_hit_info;
+ if (!cached_doc_hit_infos_.empty()) {
+ last_doc_hit_info = cached_doc_hit_infos_.back();
+ }
+ cached_doc_hit_infos_idx_ = 0;
+ cached_doc_hit_infos_.clear();
+ if (last_doc_hit_info.document_id() != kInvalidDocumentId) {
+ // Carry over the last hit. It might need to be merged with the first hit
+ // of the next posting list in the chain.
+ cached_doc_hit_infos_.push_back(last_doc_hit_info);
+ }
+
+ ++num_blocks_inspected_;
+ if (posting_list_accessor_ == nullptr) {
+ ICING_ASSIGN_OR_RETURN(
+ MainIndex::GetPrefixAccessorResult result,
+ main_index_->GetAccessorForPrefixTerm(term_));
+ posting_list_accessor_ = std::move(result.accessor);
+ exact_ = result.exact;
+ }
+ ICING_ASSIGN_OR_RETURN(std::vector<Hit> hits,
+ posting_list_accessor_->GetNextHitsBatch());
+ cached_doc_hit_infos_.reserve(hits.size());
+ for (const Hit& hit : hits) {
+ // Check sections.
+ if (((1u << hit.section_id()) & section_restrict_mask_) == 0) {
+ continue;
+ }
+ // Unless the match is exact, we only want hits from prefix sections.
+ if (!exact_ && !hit.is_in_prefix_section()) {
+ continue;
+ }
+ if (cached_doc_hit_infos_.empty() ||
+ hit.document_id() != cached_doc_hit_infos_.back().document_id()) {
+ cached_doc_hit_infos_.push_back(DocHitInfo(hit.document_id()));
+ }
+ cached_doc_hit_infos_.back().UpdateSection(hit.section_id(), hit.score());
+ }
+ return libtextclassifier3::Status::OK;
+}
+
+std::string DocHitInfoIteratorTermMainPrefix::ToString() const {
+ return absl_ports::StrCat(SectionIdMaskToString(section_restrict_mask_), ":",
+ term_, "*");
+}
+
+} // namespace lib
+} // namespace icing
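
Both RetrieveMoreHits() overrides share the same merge discipline: hits arrive docid-ordered from the posting list chain, consecutive hits for one document are folded into a single DocHitInfo via UpdateSection(), and the last cached DocHitInfo is carried over between batches because the next block may begin with more hits for the same document. The fold step, reduced to a standalone sketch:

// Illustrative: fold a docid-ordered hit batch into DocHitInfos.
void FoldHits(const std::vector<Hit>& hits, std::vector<DocHitInfo>* cached) {
  for (const Hit& hit : hits) {
    if (cached->empty() ||
        hit.document_id() != cached->back().document_id()) {
      cached->push_back(DocHitInfo(hit.document_id()));
    }
    cached->back().UpdateSection(hit.section_id(), hit.score());
  }
}
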
diff --git a/icing/index/main/doc-hit-info-iterator-term-main.h b/icing/index/main/doc-hit-info-iterator-term-main.h
new file mode 100644
index 0000000..1f77226
--- /dev/null
+++ b/icing/index/main/doc-hit-info-iterator-term-main.h
@@ -0,0 +1,114 @@
+// Copyright (C) 2019 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef ICING_INDEX_ITERATOR_DOC_HIT_INFO_ITERATOR_TERM_MAIN_H_
+#define ICING_INDEX_ITERATOR_DOC_HIT_INFO_ITERATOR_TERM_MAIN_H_
+
+#include <cstdint>
+#include <memory>
+#include <vector>
+
+#include "icing/text_classifier/lib3/utils/base/status.h"
+#include "icing/index/hit/doc-hit-info.h"
+#include "icing/index/iterator/doc-hit-info-iterator.h"
+#include "icing/index/main/main-index.h"
+#include "icing/index/main/posting-list-accessor.h"
+#include "icing/schema/section.h"
+
+namespace icing {
+namespace lib {
+
+class DocHitInfoIteratorTermMain : public DocHitInfoIterator {
+ public:
+ explicit DocHitInfoIteratorTermMain(MainIndex* main_index,
+ const std::string& term,
+ SectionIdMask section_restrict_mask)
+ : term_(term),
+ main_index_(main_index),
+ cached_doc_hit_infos_idx_(-1),
+ num_advance_calls_(0),
+ num_blocks_inspected_(0),
+ next_posting_list_id_(PostingListIdentifier::kInvalid),
+ section_restrict_mask_(section_restrict_mask) {}
+
+ libtextclassifier3::Status Advance() override;
+
+ int32_t GetNumBlocksInspected() const override {
+ return num_blocks_inspected_;
+ }
+ int32_t GetNumLeafAdvanceCalls() const override { return num_advance_calls_; }
+
+ protected:
+ // Add DocHitInfos corresponding to term_ to cached_doc_hit_infos_.
+ virtual libtextclassifier3::Status RetrieveMoreHits() = 0;
+
+ const std::string term_;
+ // The accessor of the posting list chain for the requested term.
+ std::unique_ptr<PostingListAccessor> posting_list_accessor_;
+
+ MainIndex* main_index_;
+ // Stores hits retrieved from the index. This may only be a subset of the
+ // hits present in the index. The value currently pointed to by the
+ // iterator is tracked by cached_doc_hit_infos_idx_.
+ std::vector<DocHitInfo> cached_doc_hit_infos_;
+ int cached_doc_hit_infos_idx_;
+ int num_advance_calls_;
+ int num_blocks_inspected_;
+ PostingListIdentifier next_posting_list_id_;
+ // Mask indicating which sections hits should be considered for.
+ // Ex. 0000 0000 0000 0010 means that only hits from section 1 are desired.
+ const SectionIdMask section_restrict_mask_;
+};
+
+class DocHitInfoIteratorTermMainExact : public DocHitInfoIteratorTermMain {
+ public:
+ explicit DocHitInfoIteratorTermMainExact(MainIndex* main_index,
+ const std::string& term,
+ SectionIdMask section_restrict_mask)
+ : DocHitInfoIteratorTermMain(main_index, term, section_restrict_mask) {}
+
+ std::string ToString() const override;
+
+ protected:
+ libtextclassifier3::Status RetrieveMoreHits() override;
+};
+
+class DocHitInfoIteratorTermMainPrefix : public DocHitInfoIteratorTermMain {
+ public:
+ explicit DocHitInfoIteratorTermMainPrefix(MainIndex* main_index,
+ const std::string& term,
+ SectionIdMask section_restrict_mask)
+ : DocHitInfoIteratorTermMain(main_index, term, section_restrict_mask) {}
+
+ std::string ToString() const override;
+
+ protected:
+ libtextclassifier3::Status RetrieveMoreHits() override;
+
+ private:
+ // After retrieving DocHitInfos from the index, the cache may hold both a
+ // DocHitInfo for docid 1 and "foo" and a DocHitInfo for docid 1 and
+ // "fool". These DocHitInfos should be merged.
+ void SortAndDedupeDocumentIds();
+ // Whether or not posting_list_accessor_ holds a posting list chain for
+ // 'term' or for a term for which 'term' is a prefix. This is necessary to
+ // determine whether to return hits that are not from a prefix section (hits
+ // not from a prefix section should only be returned if exact_ is true).
+ bool exact_;
+};
+
+} // namespace lib
+} // namespace icing
+
+#endif // ICING_INDEX_ITERATOR_DOC_HIT_INFO_ITERATOR_TERM_MAIN_H_
diff --git a/icing/index/main/flash-index-storage-header.h b/icing/index/main/flash-index-storage-header.h
new file mode 100644
index 0000000..f81e99e
--- /dev/null
+++ b/icing/index/main/flash-index-storage-header.h
@@ -0,0 +1,122 @@
+// Copyright (C) 2019 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef ICING_INDEX_MAIN_FLASH_INDEX_STORAGE_HEADER_H_
+#define ICING_INDEX_MAIN_FLASH_INDEX_STORAGE_HEADER_H_
+
+#include <cstdint>
+#include <memory>
+
+#include "icing/text_classifier/lib3/utils/base/statusor.h"
+#include "icing/absl_ports/canonical_errors.h"
+#include "icing/file/filesystem.h"
+
+namespace icing {
+namespace lib {
+
+// The class used to manage the flash block that contains the header for
+// FlashIndexStorage. This contains information about the index blocks that
+// store the posting lists.
+class HeaderBlock {
+ public:
+ // The class used to access the actual header.
+ struct Header {
+ // A magic used to mark the beginning of a valid header.
+ static constexpr int kMagic = 0x6dfba6ae;
+ int magic;
+ int block_size;
+ int last_indexed_docid;
+ // The size of the index_block_infos array.
+ int num_index_block_infos;
+
+ struct IndexBlockInfo {
+ // The size of the posting lists that fit on all the index blocks in this
+ // chain. Each block in this chain holds posting lists of size
+ // posting_list_bytes.
+ int posting_list_bytes;
+ // The block index of the first block in the free list chain.
+ int free_list_block_index;
+ };
+ // Variable-size array, num_index_block_infos long. Can have a max length
+ // of log(block_size). This array is used to maintain a free list for the
+ // available blocks.
+ IndexBlockInfo index_block_infos[0];
+ };
+
+ // Read HeaderBlock from the specified fd.
+ //
+ // RETURNS:
+ // - HeaderBlock, on success
+ // - INTERNAL if unable to read block_size bytes from fd.
+ static libtextclassifier3::StatusOr<HeaderBlock> Read(
+ const Filesystem* filesystem, int fd, int block_size) {
+ std::unique_ptr<uint8_t[]> buffer = std::make_unique<uint8_t[]>(block_size);
+ if (!filesystem->PRead(fd, buffer.get(), block_size, 0)) {
+ return absl_ports::InternalError("Unable to reader header block!");
+ }
+ return HeaderBlock(filesystem, std::move(buffer), block_size);
+ }
+
+ // Make a new HeaderBlock with the specified size.
+ explicit HeaderBlock(const Filesystem* filesystem, int block_size)
+ : HeaderBlock(filesystem, std::make_unique<uint8_t[]>(block_size),
+ block_size) {
+ std::memset(header_buffer_.get(), 0, block_size);
+ }
+
+ Header* header() const {
+ return reinterpret_cast<Header*>(header_buffer_.get());
+ }
+
+ // Add another entry to the index_block_infos array and return a pointer to
+ // that entry. Returns a nullptr if the index_block_infos array is already
+ // at a max size.
+ Header::IndexBlockInfo* AddIndexBlockInfo() {
+ if (size() + sizeof(Header::IndexBlockInfo) > block_size_) {
+ return nullptr;
+ }
+ ++header()->num_index_block_infos;
+ return header()->index_block_infos + (header()->num_index_block_infos - 1);
+ }
+
+ // Returns the size of the header block currently in use.
+ int size() const {
+ return sizeof(Header) +
+ header()->num_index_block_infos * sizeof(Header::IndexBlockInfo);
+ }
+
+ // Writes the header to fd. Returns true on success.
+ bool Write(int fd) {
+ return filesystem_->PWrite(fd, 0, header_buffer_.get(), block_size_);
+ }
+
+ private:
+ explicit HeaderBlock(const Filesystem* filesystem,
+ std::unique_ptr<uint8_t[]> buffer, int block_size)
+ : filesystem_(filesystem),
+ header_buffer_(std::move(buffer)),
+ block_size_(block_size) {}
+
+ const Filesystem* filesystem_; // does NOT own!
+ std::unique_ptr<uint8_t[]> header_buffer_;
+ int block_size_;
+};
+static_assert(16 == sizeof(HeaderBlock::Header),
+ "Header has changed size. Consider how this change might affect "
+ "pre-existing indices.");
+
+} // namespace lib
+} // namespace icing
+
+#endif // ICING_INDEX_MAIN_FLASH_INDEX_STORAGE_HEADER_H_
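
Header ends in a zero-length flexible array, so sizeof(Header) covers only the 16-byte fixed prefix pinned by the static_assert, and the in-use size grows by one 8-byte IndexBlockInfo per free list, exactly as size() computes. The arithmetic, spelled out under this sketch's assumption of 4-byte ints:

// Illustrative sizing: 16-byte fixed prefix + 8 bytes per entry.
//   entries   size() = 16 + n * 8
//   0         16
//   3         40
// AddIndexBlockInfo() refuses a new entry once size() + 8 would exceed
// block_size_, keeping the whole header inside one flash block.
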
diff --git a/icing/index/main/flash-index-storage.cc b/icing/index/main/flash-index-storage.cc
new file mode 100644
index 0000000..b88d7fe
--- /dev/null
+++ b/icing/index/main/flash-index-storage.cc
@@ -0,0 +1,511 @@
+// Copyright (C) 2019 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "icing/index/main/flash-index-storage.h"
+
+#include <errno.h>
+#include <inttypes.h>
+#include <sys/types.h>
+
+#include <algorithm>
+#include <cstdint>
+#include <memory>
+#include <unordered_set>
+
+#include "icing/absl_ports/canonical_errors.h"
+#include "icing/file/memory-mapped-file.h"
+#include "icing/index/main/index-block.h"
+#include "icing/index/main/posting-list-free.h"
+#include "icing/index/main/posting-list-utils.h"
+#include "icing/legacy/core/icing-string-util.h"
+#include "icing/util/logging.h"
+#include "icing/util/math-util.h"
+#include "icing/util/status-macros.h"
+
+namespace icing {
+namespace lib {
+
+namespace {
+
+uint32_t SelectBlockSize() {
+ // This should be close to the flash page size.
+ static constexpr uint32_t kMinBlockSize = 4096;
+
+ // Determine a good block size.
+ uint32_t page_size = getpagesize();
+ uint32_t block_size = std::max(kMinBlockSize, page_size);
+
+ // Align up to the nearest page size.
+ return math_util::RoundUpTo(block_size, page_size);
+}
+
+} // namespace
+
+libtextclassifier3::StatusOr<FlashIndexStorage> FlashIndexStorage::Create(
+ const std::string& index_filename, const Filesystem* filesystem,
+ bool in_memory) {
+ ICING_RETURN_ERROR_IF_NULL(filesystem);
+ FlashIndexStorage storage(index_filename, filesystem, in_memory);
+ if (!storage.Init()) {
+ return absl_ports::InternalError(
+ "Unable to successfully read header block!");
+ }
+ return storage;
+}
+
+FlashIndexStorage::FlashIndexStorage(const std::string& index_filename,
+ const Filesystem* filesystem,
+ bool has_in_memory_freelists)
+ : index_filename_(index_filename),
+ num_blocks_(0),
+ filesystem_(filesystem),
+ has_in_memory_freelists_(has_in_memory_freelists) {}
+
+FlashIndexStorage::~FlashIndexStorage() {
+ if (header_block_ != nullptr) {
+ FlushInMemoryFreeList();
+ PersistToDisk();
+ }
+}
+
+bool FlashIndexStorage::Init() {
+ block_fd_ = ScopedFd(filesystem_->OpenForWrite(index_filename_.c_str()));
+ if (!block_fd_.is_valid()) {
+ return false;
+ }
+
+ // Read in or create the header.
+ return InitHeader();
+}
+
+bool FlashIndexStorage::InitHeader() {
+ // Look for an existing file size.
+ int64_t file_size = filesystem_->GetFileSize(block_fd_.get());
+ if (file_size == Filesystem::kBadFileSize) {
+ ICING_LOG(ERROR) << "Could not initialize main index. Bad file size.";
+ return false;
+ }
+
+ if (file_size == 0) {
+ if (!CreateHeader()) {
+ ICING_LOG(ERROR)
+ << "Could not initialize main index. Unable to create header.";
+ return false;
+ }
+ } else {
+ if (!OpenHeader(file_size)) {
+ ICING_LOG(ERROR)
+ << "Could not initialize main index. Unable to open header.";
+ return false;
+ }
+ }
+ in_memory_freelists_.resize(header_block_->header()->num_index_block_infos);
+
+ return true;
+}
+
+bool FlashIndexStorage::CreateHeader() {
+ uint32_t block_size = SelectBlockSize();
+ header_block_ = std::make_unique<HeaderBlock>(filesystem_, block_size);
+ // Initialize.
+ header_block_->header()->magic = HeaderBlock::Header::kMagic;
+ header_block_->header()->block_size = block_size;
+ header_block_->header()->last_indexed_docid = kInvalidDocumentId;
+
+ // Work down from the largest posting list that fits in
+ // block_size. We don't care about locality of blocks because this
+ // is a flash index.
+ for (uint32_t posting_list_bytes =
+ IndexBlock::CalculateMaxPostingListBytes(block_size);
+ posting_list_bytes >= posting_list_utils::min_posting_list_size();
+ posting_list_bytes /= 2) {
+ uint32_t aligned_posting_list_bytes =
+ (posting_list_bytes / sizeof(Hit) * sizeof(Hit));
+ ICING_VLOG(1) << IcingStringUtil::StringPrintf(
+ "Block size %u: %u", header_block_->header()->num_index_block_infos,
+ aligned_posting_list_bytes);
+
+ // Initialize free list to empty.
+ HeaderBlock::Header::IndexBlockInfo* block_info =
+ header_block_->AddIndexBlockInfo();
+ if (block_info == nullptr) {
+ // This should never happen anyway. Min block size is 4k, so adding these
+ // IndexBlockInfos should never exceed the block size.
+ return false;
+ }
+ block_info->posting_list_bytes = aligned_posting_list_bytes;
+ block_info->free_list_block_index = kInvalidBlockIndex;
+ }
+
+ // Write the header.
+ if (!header_block_->Write(block_fd_.get())) {
+ filesystem_->Truncate(block_fd_.get(), 0);
+ return false;
+ }
+ num_blocks_ = 1;
+ return true;
+}
+
+bool FlashIndexStorage::OpenHeader(int64_t file_size) {
+ uint32_t block_size = SelectBlockSize();
+ // Read and validate header.
+ ICING_ASSIGN_OR_RETURN(
+ HeaderBlock read_header,
+ HeaderBlock::Read(filesystem_, block_fd_.get(), block_size), false);
+ if (read_header.header()->magic != HeaderBlock::Header::kMagic) {
+ ICING_LOG(ERROR) << "Index header block wrong magic";
+ return false;
+ }
+ if (file_size % read_header.header()->block_size != 0) {
+ ICING_LOG(ERROR) << IcingStringUtil::StringPrintf(
+ "Index size %" PRIu64 " not a multiple of block size %u", file_size,
+ read_header.header()->block_size);
+ return false;
+ }
+
+ if (file_size < static_cast<int64_t>(read_header.header()->block_size)) {
+ ICING_LOG(ERROR) << IcingStringUtil::StringPrintf(
+ "Index size %" PRIu64 " shorter than block size %u", file_size,
+ read_header.header()->block_size);
+ return false;
+ }
+
+ if (read_header.header()->block_size % getpagesize() != 0) {
+ ICING_LOG(ERROR) << IcingStringUtil::StringPrintf(
+ "Block size %u is not a multiple of page size %d",
+ read_header.header()->block_size, getpagesize());
+ return false;
+ }
+ num_blocks_ = file_size / read_header.header()->block_size;
+ if (block_size != read_header.header()->block_size) {
+ // The block_size changed? That's weird. But the old block_size is still
+ // valid (it must be some multiple of the new block_size). So reinitialize
+ // with that old block size. Using the old block size means that we can
+ // still use the main index, but reads/writes won't be as efficient in terms
+ // of flash IO because the 'blocks' that we're reading are actually multiple
+ // pages long.
+ ICING_LOG(ERROR) << "Block size of existing header ("
+ << read_header.header()->block_size
+ << ") does not match the requested block size ("
+ << block_size << "). Defaulting to existing block size "
+ << read_header.header()->block_size;
+ ICING_ASSIGN_OR_RETURN(HeaderBlock read_header,
+ HeaderBlock::Read(filesystem_, block_fd_.get(),
+ read_header.header()->block_size),
+ false);
+ }
+ header_block_ = std::make_unique<HeaderBlock>(std::move(read_header));
+
+ // Check for memory alignment on posting_list_bytes. See b/29983315.
+ // The issue of potential corruption to the header could also be handled by
+ // checksumming the header block.
+ for (int i = 0; i < header_block_->header()->num_index_block_infos; ++i) {
+ int posting_list_bytes =
+ header_block_->header()->index_block_infos[i].posting_list_bytes;
+ if (posting_list_bytes % sizeof(Hit) != 0) {
+ ICING_LOG(ERROR) << IcingStringUtil::StringPrintf(
+ "Posting list size misaligned, index %u, size %u, hit %zu, "
+ "file_size %" PRIu64,
+ i, header_block_->header()->index_block_infos[i].posting_list_bytes,
+ sizeof(Hit), file_size);
+ return false;
+ }
+ }
+ return true;
+}
+
+bool FlashIndexStorage::PersistToDisk() {
+ // First, write header.
+ if (!header_block_->Write(block_fd_.get())) {
+ ICING_LOG(ERROR) << IcingStringUtil::StringPrintf(
+ "Write index header failed: %s", strerror(errno));
+ return false;
+ }
+
+ // Then sync.
+ return filesystem_->DataSync(block_fd_.get());
+}
+
+libtextclassifier3::StatusOr<PostingListHolder>
+FlashIndexStorage::GetPostingList(PostingListIdentifier id) const {
+ ICING_ASSIGN_OR_RETURN(IndexBlock block, GetIndexBlock(id.block_index()));
+ ICING_ASSIGN_OR_RETURN(
+ PostingListUsed posting_list,
+ block.GetAllocatedPostingList(id.posting_list_index()));
+ PostingListHolder holder = {std::move(posting_list), std::move(block), id};
+ return holder;
+}
+
+libtextclassifier3::StatusOr<IndexBlock> FlashIndexStorage::GetIndexBlock(
+ int block_index) const {
+ if (block_index >= num_blocks_) {
+ return absl_ports::OutOfRangeError(IcingStringUtil::StringPrintf(
+ "Unable to create an index block at index %d when only %d blocks have "
+ "been allocated.",
+ block_index, num_blocks_));
+ }
+ off_t offset = static_cast<off_t>(block_index) * block_size();
+ return IndexBlock::CreateFromPreexistingIndexBlockRegion(
+ *filesystem_, index_filename_, offset, block_size());
+}
+
+libtextclassifier3::StatusOr<IndexBlock> FlashIndexStorage::CreateIndexBlock(
+ int block_index, uint32_t posting_list_size) const {
+ if (block_index >= num_blocks_) {
+ return absl_ports::OutOfRangeError(IcingStringUtil::StringPrintf(
+ "Unable to create an index block at index %d when only %d blocks have "
+ "been allocated.",
+ block_index, num_blocks_));
+ }
+ off_t offset = static_cast<off_t>(block_index) * block_size();
+ return IndexBlock::CreateFromUninitializedRegion(
+ *filesystem_, index_filename_, offset, block_size(), posting_list_size);
+}
+
+int FlashIndexStorage::FindBestIndexBlockInfo(
+ uint32_t posting_list_bytes) const {
+ int i = header_block_->header()->num_index_block_infos - 1;
+ for (; i >= 0; i--) {
+ if (header_block_->header()->index_block_infos[i].posting_list_bytes >=
+ posting_list_bytes) {
+ return i;
+ }
+ }
+ return i;
+}
+
+libtextclassifier3::StatusOr<PostingListHolder>
+FlashIndexStorage::GetPostingListFromInMemoryFreeList(int block_info_index) {
+ // Get something from in memory free list.
+ ICING_ASSIGN_OR_RETURN(PostingListIdentifier posting_list_id,
+ in_memory_freelists_[block_info_index].TryPop());
+ // Remember, posting lists stored on the in-memory free list were never
+ // actually freed. So it will still contain a valid PostingListUsed. First, we
+ // need to free this posting list.
+ ICING_ASSIGN_OR_RETURN(IndexBlock block,
+ GetIndexBlock(posting_list_id.block_index()));
+ block.FreePostingList(posting_list_id.posting_list_index());
+
+ // Now, we can allocate a posting list from the same index block. It may not
+ // be the same posting list that was just freed, but that's okay.
+ ICING_ASSIGN_OR_RETURN(PostingListIndex posting_list_index,
+ block.AllocatePostingList());
+ posting_list_id =
+ PostingListIdentifier(posting_list_id.block_index(), posting_list_index,
+ posting_list_id.posting_list_index_bits());
+ ICING_ASSIGN_OR_RETURN(
+ PostingListUsed posting_list,
+ block.GetAllocatedPostingList(posting_list_id.posting_list_index()));
+ PostingListHolder holder = {std::move(posting_list), std::move(block),
+ posting_list_id};
+ return holder;
+}
+
+libtextclassifier3::StatusOr<PostingListHolder>
+FlashIndexStorage::GetPostingListFromOnDiskFreeList(int block_info_index) {
+ // Get something from the free list.
+ uint32_t block_index = header_block_->header()
+ ->index_block_infos[block_info_index]
+ .free_list_block_index;
+ if (block_index == kInvalidBlockIndex) {
+ return absl_ports::NotFoundError("No available entry in free list.");
+ }
+
+ // Get the index block
+ ICING_ASSIGN_OR_RETURN(IndexBlock block, GetIndexBlock(block_index));
+ ICING_ASSIGN_OR_RETURN(PostingListIndex posting_list_index,
+ block.AllocatePostingList());
+ PostingListIdentifier posting_list_id = PostingListIdentifier(
+ block_index, posting_list_index, block.posting_list_index_bits());
+ ICING_ASSIGN_OR_RETURN(
+ PostingListUsed posting_list,
+ block.GetAllocatedPostingList(posting_list_id.posting_list_index()));
+ if (!block.has_free_posting_lists()) {
+ RemoveFromOnDiskFreeList(block_index, block_info_index, &block);
+ }
+ PostingListHolder holder = {std::move(posting_list), std::move(block),
+ posting_list_id};
+ return holder;
+}
+
+libtextclassifier3::StatusOr<PostingListHolder>
+FlashIndexStorage::AllocateNewPostingList(int block_info_index) {
+ uint32_t block_index = GrowIndex();
+ if (block_index == kInvalidBlockIndex) {
+ return absl_ports::ResourceExhaustedError(
+ "Unable to grow the index further!");
+ }
+ ICING_ASSIGN_OR_RETURN(
+ IndexBlock block,
+ CreateIndexBlock(block_index, header_block_->header()
+ ->index_block_infos[block_info_index]
+ .posting_list_bytes));
+ ICING_ASSIGN_OR_RETURN(PostingListIndex posting_list_index,
+ block.AllocatePostingList());
+ PostingListIdentifier posting_list_id = PostingListIdentifier(
+ block_index, posting_list_index, block.posting_list_index_bits());
+ ICING_ASSIGN_OR_RETURN(
+ PostingListUsed posting_list,
+ block.GetAllocatedPostingList(posting_list_id.posting_list_index()));
+ if (block.has_free_posting_lists()) {
+ AddToOnDiskFreeList(block_index, block_info_index, &block);
+ }
+ PostingListHolder holder = {std::move(posting_list), std::move(block),
+ posting_list_id};
+ return holder;
+}
+
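+// Allocation strategy: first try the in-memory free list (reusing from it
+// costs no flash write), then the on-disk free list stored in the header
+// block, and only grow the index file with a brand-new block as a last
+// resort.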
+libtextclassifier3::StatusOr<PostingListHolder>
+FlashIndexStorage::AllocatePostingList(uint32_t min_posting_list_bytes) {
+ uint32_t max_posting_list_bytes =
+ IndexBlock::CalculateMaxPostingListBytes(block_size());
+ if (min_posting_list_bytes > max_posting_list_bytes) {
+ return absl_ports::InvalidArgumentError(IcingStringUtil::StringPrintf(
+ "Requested posting list size %u exceeds max posting list size %u",
+ min_posting_list_bytes, max_posting_list_bytes));
+ }
+ int best_block_info_index = FindBestIndexBlockInfo(min_posting_list_bytes);
+
+ auto holder_or = GetPostingListFromInMemoryFreeList(best_block_info_index);
+ if (holder_or.ok()) {
+ return std::move(holder_or).ValueOrDie();
+ }
+
+ // Nothing in memory. Look for something in the block file.
+ holder_or = GetPostingListFromOnDiskFreeList(best_block_info_index);
+ if (holder_or.ok()) {
+ return std::move(holder_or).ValueOrDie();
+ }
+
+ return AllocateNewPostingList(best_block_info_index);
+}
+
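+// The on-disk free list is a singly-linked chain threaded through the block
+// headers: index_block_infos[block_info_index].free_list_block_index names the
+// head block, and each block's next_block_index names the next block in the
+// chain with free posting lists of the same size.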
+void FlashIndexStorage::AddToOnDiskFreeList(uint32_t block_index,
+ int block_info_index,
+ IndexBlock* index_block) {
+ index_block->set_next_block_index(header_block_->header()
+ ->index_block_infos[block_info_index]
+ .free_list_block_index);
+ header_block_->header()
+ ->index_block_infos[block_info_index]
+ .free_list_block_index = block_index;
+}
+
+void FlashIndexStorage::RemoveFromOnDiskFreeList(uint32_t block_index,
+ int block_info_index,
+ IndexBlock* index_block) {
+ // Cannot be used anymore. Move free ptr to the next block.
+ header_block_->header()
+ ->index_block_infos[block_info_index]
+ .free_list_block_index = index_block->next_block_index();
+ index_block->set_next_block_index(kInvalidBlockIndex);
+}
+
+void FlashIndexStorage::FreePostingList(PostingListHolder holder) {
+ uint32_t posting_list_bytes = holder.block.get_posting_list_bytes();
+ int best_block_info_index = FindBestIndexBlockInfo(posting_list_bytes);
+
+ // It *should* be guaranteed elsewhere that FindBestIndexBlockInfo will not
+ // return a value >= in_memory_freelists_.size(), but check regardless. If
+ // the posting list doesn't fit in an in-memory free list for some reason,
+ // put it on the Header free list instead.
+ if (has_in_memory_freelists_ && best_block_info_index >= 0 &&
+ best_block_info_index < static_cast<int>(in_memory_freelists_.size())) {
+ in_memory_freelists_[best_block_info_index].Push(holder.id);
+ } else {
+ bool was_full = !holder.block.has_free_posting_lists();
+ holder.block.FreePostingList(holder.id.posting_list_index());
+ // If this block was not already full, then it is already in the free list.
+ if (was_full) {
+ AddToOnDiskFreeList(holder.id.block_index(), best_block_info_index,
+ &holder.block);
+ }
+ }
+}
+
+int FlashIndexStorage::GrowIndex() {
+ if (num_blocks_ >= kMaxBlockIndex) {
+ ICING_VLOG(1) << IcingStringUtil::StringPrintf("Reached max block index %u",
+ kMaxBlockIndex);
+ return kInvalidBlockIndex;
+ }
+
+ // Grow the index file.
+ if (!filesystem_->Grow(
+ block_fd_.get(),
+ static_cast<uint64_t>(num_blocks_ + 1) * block_size())) {
+ ICING_VLOG(1) << IcingStringUtil::StringPrintf(
+ "Error growing index file: %s", strerror(errno));
+ return kInvalidBlockIndex;
+ }
+
+ return num_blocks_++;
+}
+
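+// Empties the in-memory free list by actually freeing each provisionally-freed
+// posting list and, where a block regains a free posting list, pushing that
+// block onto the on-disk free list. This runs, for example, when the
+// FlashIndexStorage is destroyed, so that provisionally-freed posting lists
+// are not leaked.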
+void FlashIndexStorage::FlushInMemoryFreeList() {
+ for (int i = 0; i < in_memory_freelists_.size(); ++i) {
+ FreeList& freelist = in_memory_freelists_.at(i);
+ auto freelist_elt_or = freelist.TryPop();
+ while (freelist_elt_or.ok()) {
+ PostingListIdentifier freelist_elt = freelist_elt_or.ValueOrDie();
+ // Remember, posting lists stored on the in-memory free list were never
+ // actually freed. So it will still contain a valid PostingListUsed.
+ // First, we need to free this posting list.
+ auto block_or = GetIndexBlock(freelist_elt.block_index());
+ if (!block_or.ok()) {
+ // Can't read the block. Nothing to do here. This posting list will have
+ // to leak. Just proceed to the next freelist element.
+ freelist_elt_or = freelist.TryPop();
+ continue;
+ }
+ IndexBlock block = std::move(block_or).ValueOrDie();
+ bool was_full = !block.has_free_posting_lists();
+ block.FreePostingList(freelist_elt.posting_list_index());
+ // If this block was not already full, then it is already in the free
+ // list.
+ if (was_full) {
+ AddToOnDiskFreeList(freelist_elt.block_index(), /*block_info_index=*/i,
+ &block);
+ }
+ freelist_elt_or = freelist.TryPop();
+ }
+ }
+}
+
+// FreeList.
+void FlashIndexStorage::FreeList::Push(PostingListIdentifier id) {
+ if (free_list_.size() >= kMaxSize) {
+ ICING_LOG(WARNING)
+ << "Freelist for posting lists of size (block_size / "
+ << (1u << id.posting_list_index_bits())
+ << ") has reached max size. Dropping freed posting list [block_index:"
+ << id.block_index()
+ << ", posting_list_index:" << id.posting_list_index() << "]";
+ return;
+ }
+
+ free_list_.push_back(id);
+}
+
+libtextclassifier3::StatusOr<PostingListIdentifier>
+FlashIndexStorage::FreeList::TryPop() {
+ if (free_list_.empty()) {
+ return absl_ports::NotFoundError("No available entry in free list.");
+ }
+
+ PostingListIdentifier id = free_list_.back();
+ free_list_.pop_back();
+ return id;
+}
+
+} // namespace lib
+} // namespace icing
diff --git a/icing/index/main/flash-index-storage.h b/icing/index/main/flash-index-storage.h
new file mode 100644
index 0000000..958f131
--- /dev/null
+++ b/icing/index/main/flash-index-storage.h
@@ -0,0 +1,275 @@
+// Copyright (C) 2019 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef ICING_INDEX_FLASH_INDEX_STORAGE_H_
+#define ICING_INDEX_FLASH_INDEX_STORAGE_H_
+
+#include <cstdint>
+#include <memory>
+#include <string>
+#include <vector>
+
+#include "icing/text_classifier/lib3/utils/base/statusor.h"
+#include "icing/absl_ports/canonical_errors.h"
+#include "icing/file/filesystem.h"
+#include "icing/index/main/flash-index-storage-header.h"
+#include "icing/index/main/index-block.h"
+#include "icing/index/main/posting-list-free.h"
+#include "icing/index/main/posting-list-identifier.h"
+#include "icing/index/main/posting-list-used.h"
+#include "icing/legacy/core/icing-packed-pod.h"
+#include "icing/store/document-id.h"
+
+namespace icing {
+namespace lib {
+
+// The PostingListHolder struct exists to group together a PostingListUsed and
+// the IndexBlock that backs it, along with their id.
+struct PostingListHolder {
+ // PostingListUseds interpret data that they themselves do NOT own. The data
+ // being interpreted is stored on a flash block and its memory mapping is
+ // owned by the IndexBlock. As such, the lifecycle of the PostingListUsed must
+ // NOT exceed the lifecycle of the IndexBlock.
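+ // For example, keeping a copy of posting_list alive after the holder (and
+ // its IndexBlock) has been destroyed would leave it reading unmapped memory.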
+ PostingListUsed posting_list;
+ IndexBlock block;
+ // The PostingListIdentifier, which identifies both the IndexBlock and the
+ // PostingListUsed, is also returned for convenience.
+ PostingListIdentifier id;
+};
+
+// The FlashIndexStorage class manages the actual file that makes up the index.
+// It allocates IndexBlocks as needed and maintains freelists to prevent
+// excessive block fragmentation.
+//
+// It maintains two types of free lists:
+// 1. On-disk, Header free list - This free list is stored in the Header
+// block. There is a free list for every possible posting list size. Each
+// entry for a posting list size contains the block_index of the
+// IndexBlock that starts the free list chain. Each IndexBlock in the free
+// list chain stores the index of the next IndexBlock in the chain.
+// 2. In-memory free list - Like the Header free list, there is a free list
+// for every possible posting list size. This free list contains not just the
+// block_index of the available IndexBlock, but also the posting_list_index
+// of the available PostingListUsed within the IndexBlock. This is because,
+// unlike the Header free list, PostingListUseds are not actually freed
+// when added to this free list.
+//
+// Whether or not the in-memory free list is used can be chosen via the
+// in_memory param to the Create factory function.
+//
+// The advantage of using the in-memory free list is that it reduces the
+// number of flash writes made while editing the index (because actually
+// freeing the PostingLists would require writing to that flash block). The
+// disadvantage is that it introduces code complexity and potentially leaks
+// blocks if power is lost or if FlashIndexStorage is destroyed before
+// emptying the free list.
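+//
+// Example usage (a minimal sketch; error handling elided, and the file name
+// and sizes are illustrative only):
+//
+// Filesystem filesystem;
+// ICING_ASSIGN_OR_RETURN(
+// FlashIndexStorage storage,
+// FlashIndexStorage::Create("/tmp/test.idx.index", &filesystem));
+// ICING_ASSIGN_OR_RETURN(
+// PostingListHolder holder,
+// storage.AllocatePostingList(/*min_posting_list_bytes=*/64));
+// // ... add hits via holder.posting_list ...
+// storage.FreePostingList(std::move(holder));
+// storage.PersistToDisk();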
+class FlashIndexStorage {
+ public:
+ // Creates a FlashIndexStorage at index_filename. in_memory determines whether
+ // or not the FlashIndexStorage maintains an in-memory freelist in order to
+ // avoid writes to the on-disk freelist.
+ //
+ // RETURNS:
+ // - On success, a valid instance of FlashIndexStorage
+ // - INTERNAL error if unable to create a new header or read the existing
+ // one from disk.
+ static libtextclassifier3::StatusOr<FlashIndexStorage> Create(
+ const std::string& index_filename, const Filesystem* filesystem,
+ bool in_memory = true);
+
+ // Retrieve the PostingList referred to by PostingListIdentifier. This posting
+ // list must have been previously allocated by a prior call to
+ // AllocatePostingList.
+ //
+ // RETURNS:
+ // - On success, a valid instance of PostingListHolder containing the
+ // requested PostingListUsed.
+ // - INVALID_ARGUMENT if id.posting_list_index() is out of bounds in the
+ // IndexBlock referred to by id.block_index()
+ // - INTERNAL_ERROR if unable to access the region in file.
+ libtextclassifier3::StatusOr<PostingListHolder> GetPostingList(
+ PostingListIdentifier id) const;
+
+ // Allocates and returns a PostingListHolder containing a PostingListUsed that
+ // can fit min_posting_list_bytes.
+ //
+ // RETURNS:
+ // - On success, a valid instance of PostingListHolder containing the
+ // requested PostingListUsed.
+ // - INVALID_ARGUMENT if min_posting_list_bytes exceeds the maximum posting
+ // list size allowed by the block size.
+ // - RESOURCE_EXHAUSTED error if unable to grow the index to create a
+ // PostingListUsed of the requested size.
+ libtextclassifier3::StatusOr<PostingListHolder> AllocatePostingList(
+ uint32_t min_posting_list_bytes);
+
+ ~FlashIndexStorage();
+ FlashIndexStorage(FlashIndexStorage&&) = default;
+ FlashIndexStorage(const FlashIndexStorage&) = delete;
+ FlashIndexStorage& operator=(FlashIndexStorage&&) = default;
+ FlashIndexStorage& operator=(const FlashIndexStorage&) = delete;
+
+ // Free the PostingListUsed that this holder holds.
+ void FreePostingList(PostingListHolder holder);
+
+ // Used to track the largest docid indexed in the index.
+ DocumentId get_last_indexed_docid() const {
+ return header_block_->header()->last_indexed_docid;
+ }
+ void set_last_indexed_docid(DocumentId docid) {
+ header_block_->header()->last_indexed_docid = docid;
+ }
+
+ // Updates the header and persists all changes to the index to disk. Returns
+ // true on success.
+ bool PersistToDisk();
+
+ // Returns the size of the index file in bytes.
+ int64_t GetDiskUsage() const {
+ return filesystem_->GetDiskUsage(block_fd_.get());
+ }
+
+ int num_blocks() const { return num_blocks_; }
+
+ // Info about the index based on the block size.
+ int block_size() const { return header_block_->header()->block_size; }
+
+ // Num blocks starts at 1 since the first block is the header.
+ bool empty() const { return num_blocks_ <= 1; }
+
+ // The fraction of the maximum index size that is free. Allocated blocks are
+ // treated as fully used, even if they are only partially used. In this way,
+ // min_free_fraction is a lower bound on the available space.
+ double min_free_fraction() const {
+ return 1.0 - static_cast<double>(num_blocks_) / kMaxBlockIndex;
+ }
+
+ private:
+ FlashIndexStorage(const std::string& index_filename,
+ const Filesystem* filesystem, bool has_in_memory_freelists);
+
+ // Init the index from persistence. Create if file does not exist. We do not
+ // erase corrupt files.
+ //
+ // Returns false if unable to create a new header or if the existing one is
+ // corrupt.
+ bool Init();
+
+ // Create or open the header block. Returns true on success.
+ bool InitHeader();
+
+ // Create a new header block for an empty index file.
+ bool CreateHeader();
+
+ // Loads the header stored at the beginning of the index file and validates
+ // the values stored in it.
+ bool OpenHeader(int64_t file_size);
+
+ // Add the IndexBlock referred to by block_index to the on-disk free list
+ // with index block_info_index.
+ void AddToOnDiskFreeList(uint32_t block_index, int block_info_index,
+ IndexBlock* index_block);
+
+ // Remove the IndexBlock referred to by block_index from the Header free list
+ // with index block_info_index.
+ void RemoveFromOnDiskFreeList(uint32_t block_index, int block_info_index,
+ IndexBlock* index_block);
+
+ // Returns:
+ // - On success, a valid PostingListHolder created from the first entry of
+ // the in-memory freelist at block_info_index
+ // - NOT_FOUND if there was no entry in the freelist
+ // - RESOURCE_EXHAUSTED if the PostingList in the freelist couldn't be
+ // allocated for some reason.
+ libtextclassifier3::StatusOr<PostingListHolder>
+ GetPostingListFromInMemoryFreeList(int block_info_index);
+
+ // Returns:
+ // - On success, a valid PostingListHolder created from the first entry of
+ // the on-disk freelist at block_info_index
+ // - NOT_FOUND if there was no entry in the freelist
+ // - RESOURCE_EXHAUSTED if the PostingList in the freelist couldn't be
+ // allocated for some reason.
+ libtextclassifier3::StatusOr<PostingListHolder>
+ GetPostingListFromOnDiskFreeList(int block_info_index);
+
+ // Returns:
+ // - On success, a valid PostingListHolder created from a newly allocated
+ // IndexBlock.
+ // - RESOURCE_EXHAUSTED if the index couldn't be grown to fit a new
+ // IndexBlock.
+ libtextclassifier3::StatusOr<PostingListHolder> AllocateNewPostingList(
+ int block_info_index);
+
+ // Returns:
+ // - On success, a newly created IndexBlock at block_index with posting
+ // lists of size posting_list_size
+ // - INTERNAL_ERROR if unable to access the region in file representing the
+ // IndexBlock
+ libtextclassifier3::StatusOr<IndexBlock> CreateIndexBlock(
+ int block_index, uint32_t posting_list_size) const;
+
+ // Returns:
+ // - On success, the IndexBlock that exists at block_index
+ // - INTERNAL_ERROR if unable to access the region in file representing the
+ // IndexBlock
+ libtextclassifier3::StatusOr<IndexBlock> GetIndexBlock(int block_index) const;
+
+ // Add a new block to the end of the file and return its block
+ // index. Returns kInvalidBlockIndex if unable to grow the index file.
+ int GrowIndex();
+
+ // Return the index into index_block_infos of the smallest posting_list free
+ // list that can fit posting_list_bytes or -1 if posting_list_bytes exceeds
+ // the max-sized posting list.
+ int FindBestIndexBlockInfo(uint32_t posting_list_bytes) const;
+
+ // Flushes the in-memory free list to disk.
+ void FlushInMemoryFreeList();
+
+ // Underlying filename.
+ std::string index_filename_;
+
+ // We open the index file into this fd.
+ ScopedFd block_fd_;
+ int num_blocks_; // can be inferred from index file size
+
+ std::unique_ptr<HeaderBlock> header_block_;
+
+ // In-memory cache of free posting lists.
+ struct FreeList {
+ // Experimentally determined that the high watermark for the largest
+ // freelist was ~3500.
+ static constexpr size_t kMaxSize = 4096;
+
+ // Push a new PostingListIdentifier if there is space.
+ void Push(PostingListIdentifier id);
+
+ // Attempt to pop a PostingListIdentifier.
+ //
+ // RETURNS:
+ // - identifier of a free posting list, on success
+ // - NOT_FOUND if there are no free posting lists on this free list.
+ libtextclassifier3::StatusOr<PostingListIdentifier> TryPop();
+
+ private:
+ std::vector<PostingListIdentifier> free_list_;
+ };
+ std::vector<FreeList> in_memory_freelists_;
+
+ const Filesystem* filesystem_; // not owned; can't be null
+
+ bool has_in_memory_freelists_;
+};
+
+} // namespace lib
+} // namespace icing
+
+#endif // ICING_INDEX_FLASH_INDEX_STORAGE_H_
diff --git a/icing/index/main/flash-index-storage_test.cc b/icing/index/main/flash-index-storage_test.cc
new file mode 100644
index 0000000..cf899b3
--- /dev/null
+++ b/icing/index/main/flash-index-storage_test.cc
@@ -0,0 +1,540 @@
+// Copyright (C) 2019 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "icing/index/main/flash-index-storage.h"
+
+#include <stdlib.h>
+#include <unistd.h>
+
+#include <algorithm>
+#include <limits>
+#include <utility>
+#include <vector>
+
+#include "icing/text_classifier/lib3/utils/base/status.h"
+#include "gmock/gmock.h"
+#include "gtest/gtest.h"
+#include "icing/file/filesystem.h"
+#include "icing/index/hit/hit.h"
+#include "icing/store/document-id.h"
+#include "icing/testing/common-matchers.h"
+#include "icing/testing/tmp-directory.h"
+
+namespace icing {
+namespace lib {
+
+namespace {
+
+using ::testing::ElementsAreArray;
+using ::testing::Eq;
+using ::testing::IsEmpty;
+using ::testing::IsFalse;
+using ::testing::IsTrue;
+using ::testing::Not;
+
+class FlashIndexStorageTest : public testing::Test {
+ protected:
+ void SetUp() override {
+ test_dir_ = GetTestTempDir() + "/test_dir";
+ file_name_ = test_dir_ + "/test_file.idx.index";
+ ASSERT_TRUE(filesystem_.CreateDirectoryRecursively(test_dir_.c_str()));
+ }
+
+ void TearDown() override {
+ ASSERT_TRUE(filesystem_.DeleteDirectoryRecursively(test_dir_.c_str()));
+ }
+
+ std::string test_dir_;
+ std::string file_name_;
+ Filesystem filesystem_;
+};
+
+TEST_F(FlashIndexStorageTest, CorruptHeader) {
+ {
+ // Create the header file
+ ICING_ASSERT_OK_AND_ASSIGN(
+ FlashIndexStorage flash_index_storage,
+ FlashIndexStorage::Create(file_name_, &filesystem_));
+ }
+ {
+ // Read the valid header - should pass
+ ICING_ASSERT_OK_AND_ASSIGN(
+ FlashIndexStorage flash_index_storage,
+ FlashIndexStorage::Create(file_name_, &filesystem_));
+ }
+ {
+ // Corrupt the header file by changing pl_bytes
+ ScopedFd sfd(filesystem_.OpenForWrite(file_name_.c_str()));
+ off_t offset = 16;
+ uint32_t pl_bytes = sizeof(Hit) - 1; // This is intentionally invalid
+ filesystem_.PWrite(sfd.get(), offset, &pl_bytes, sizeof(uint32_t));
+ }
+ {
+ // Read the header file - should fail because pl_bytes is not divisible
+ // by sizeof(Hit), which is 5 as of writing
+ ASSERT_THAT(FlashIndexStorage::Create(file_name_, &filesystem_),
+ StatusIs(libtextclassifier3::StatusCode::INTERNAL));
+ }
+ {
+ // Correct the pl_bytes header alignment
+ ScopedFd sfd(filesystem_.OpenForWrite(file_name_.c_str()));
+ off_t offset = 16;
+ uint32_t pl_bytes = 2 * sizeof(Hit); // Should be valid
+ filesystem_.PWrite(sfd.get(), offset, &pl_bytes, sizeof(uint32_t));
+ }
+ {
+ // Read the valid header - should pass
+ ICING_ASSERT_OK_AND_ASSIGN(
+ FlashIndexStorage flash_index_storage,
+ FlashIndexStorage::Create(file_name_, &filesystem_));
+ }
+
+ // Delete the file
+ filesystem_.DeleteFile(file_name_.c_str());
+}
+
+TEST_F(FlashIndexStorageTest, EmptyStorage) {
+ {
+ // Create the header file
+ ICING_ASSERT_OK_AND_ASSIGN(
+ FlashIndexStorage flash_index_storage,
+ FlashIndexStorage::Create(file_name_, &filesystem_));
+ // An 'empty' FlashIndexStorage should have:
+ // 1. One block allocated for the header
+ EXPECT_THAT(flash_index_storage.num_blocks(), Eq(1));
+ EXPECT_THAT(flash_index_storage.empty(), IsTrue());
+ // 2. The invalid DocumentId stored in its header
+ EXPECT_THAT(flash_index_storage.get_last_indexed_docid(),
+ Eq(kInvalidDocumentId));
+ // 3. Its disk usage should be the equivalent of one block.
+ EXPECT_THAT(flash_index_storage.GetDiskUsage(),
+ Eq(flash_index_storage.block_size()));
+ }
+ {
+ // Read the valid header. All functions should return the same values.
+ ICING_ASSERT_OK_AND_ASSIGN(
+ FlashIndexStorage flash_index_storage,
+ FlashIndexStorage::Create(file_name_, &filesystem_));
+ EXPECT_THAT(flash_index_storage.num_blocks(), Eq(1));
+ EXPECT_THAT(flash_index_storage.empty(), IsTrue());
+ EXPECT_THAT(flash_index_storage.get_last_indexed_docid(),
+ Eq(kInvalidDocumentId));
+ EXPECT_THAT(flash_index_storage.GetDiskUsage(),
+ Eq(flash_index_storage.block_size()));
+ }
+}
+
+TEST_F(FlashIndexStorageTest, FreeListInMemory) {
+ // Create the header file
+ ICING_ASSERT_OK_AND_ASSIGN(
+ FlashIndexStorage flash_index_storage,
+ FlashIndexStorage::Create(file_name_, &filesystem_));
+ {
+ // 1. Request a PL that is 1/2 block size. Remember that block size also
+ // includes the BlockHeader. The BlockHeader isn't publicly visible, so we
+ // subtract 100 bytes to be sure. AllocatePostingList will round up from
+ // kHalfBlockPostingListSize to whatever the correct size is.
+ const int kHalfBlockPostingListSize =
+ (flash_index_storage.block_size() - 100) / 2;
+ ICING_ASSERT_OK_AND_ASSIGN(
+ PostingListHolder posting_list_holder1,
+ flash_index_storage.AllocatePostingList(kHalfBlockPostingListSize));
+ // We expect:
+ // 1. FlashIndexStorage will return a valid id.
+ PostingListIdentifier id1 = posting_list_holder1.id;
+ EXPECT_THAT(id1.is_valid(), IsTrue());
+ // 2. The index file should have grown by exactly one flash block.
+ EXPECT_THAT(flash_index_storage.num_blocks(), Eq(2));
+ EXPECT_THAT(flash_index_storage.empty(), IsFalse());
+
+ std::vector<Hit> hits1 = {
+ Hit(/*section_id=*/1, /*document_id=*/0, /*score=*/12),
+ Hit(/*section_id=*/6, /*document_id=*/2, /*score=*/19),
+ Hit(/*section_id=*/5, /*document_id=*/2, /*score=*/100),
+ Hit(/*section_id=*/8, /*document_id=*/5, /*score=*/197)};
+ for (const Hit& hit : hits1) {
+ ICING_ASSERT_OK(posting_list_holder1.posting_list.PrependHit(hit));
+ }
+ EXPECT_THAT(posting_list_holder1.posting_list.GetHits(),
+ IsOkAndHolds(ElementsAreArray(hits1.rbegin(), hits1.rend())));
+
+ // 2. Get another PL. This should be on the same flash block. There should
+ // be no allocation.
+ ICING_ASSERT_OK_AND_ASSIGN(
+ PostingListHolder posting_list_holder2,
+ flash_index_storage.AllocatePostingList(kHalfBlockPostingListSize));
+ // We expect:
+ // 1. FlashIndexStorage will return a valid id.
+ EXPECT_THAT(posting_list_holder2.id.is_valid(), IsTrue());
+ // 2. The index file should not have grown.
+ EXPECT_THAT(flash_index_storage.num_blocks(), Eq(2));
+ EXPECT_THAT(flash_index_storage.empty(), IsFalse());
+
+ std::vector<Hit> hits2 = {
+ Hit(/*section_id=*/4, /*document_id=*/0, /*score=*/12),
+ Hit(/*section_id=*/8, /*document_id=*/4, /*score=*/19),
+ Hit(/*section_id=*/9, /*document_id=*/7, /*score=*/100),
+ Hit(/*section_id=*/6, /*document_id=*/7, /*score=*/197)};
+ for (const Hit& hit : hits2) {
+ ICING_ASSERT_OK(posting_list_holder2.posting_list.PrependHit(hit));
+ }
+ EXPECT_THAT(posting_list_holder2.posting_list.GetHits(),
+ IsOkAndHolds(ElementsAreArray(hits2.rbegin(), hits2.rend())));
+
+ // 3. Now, free the first posting list. This should add it to the free list
+ flash_index_storage.FreePostingList(std::move(posting_list_holder1));
+
+ // 4. Request another posting list. This should NOT grow the index because
+ // the first posting list is free.
+ ICING_ASSERT_OK_AND_ASSIGN(
+ PostingListHolder posting_list_holder3,
+ flash_index_storage.AllocatePostingList(kHalfBlockPostingListSize));
+ // We expect:
+ // 1. FlashIndexStorage will return a valid id.
+ EXPECT_THAT(posting_list_holder3.id.is_valid(), IsTrue());
+ // 2. The index file should not have grown.
+ EXPECT_THAT(flash_index_storage.num_blocks(), Eq(2));
+ EXPECT_THAT(flash_index_storage.empty(), IsFalse());
+ // 3. The returned posting list holder should have the same id as the
+ // first posting list holder.
+ EXPECT_THAT(posting_list_holder3.id.posting_list_index(),
+ Eq(id1.posting_list_index()));
+ EXPECT_THAT(posting_list_holder3.id.block_index(), Eq(id1.block_index()));
+ // Make sure this pl is empty. The hits that used to be there should be
+ // gone.
+ EXPECT_THAT(posting_list_holder3.posting_list.GetHits(),
+ IsOkAndHolds(IsEmpty()));
+ std::vector<Hit> hits3 = {
+ Hit(/*section_id=*/7, /*document_id=*/1, /*score=*/62),
+ Hit(/*section_id=*/12, /*document_id=*/3, /*score=*/45),
+ Hit(/*section_id=*/11, /*document_id=*/18, /*score=*/12),
+ Hit(/*section_id=*/7, /*document_id=*/100, /*score=*/74)};
+ for (const Hit& hit : hits3) {
+ ICING_ASSERT_OK(posting_list_holder3.posting_list.PrependHit(hit));
+ }
+ EXPECT_THAT(posting_list_holder3.posting_list.GetHits(),
+ IsOkAndHolds(ElementsAreArray(hits3.rbegin(), hits3.rend())));
+ }
+ EXPECT_THAT(flash_index_storage.GetDiskUsage(),
+ Eq(2 * flash_index_storage.block_size()));
+}
+
+TEST_F(FlashIndexStorageTest, FreeListNotInMemory) {
+ // Create the header file
+ ICING_ASSERT_OK_AND_ASSIGN(
+ FlashIndexStorage flash_index_storage,
+ FlashIndexStorage::Create(file_name_, &filesystem_, /*in_memory=*/false));
+
+ {
+ // 1. Request a PL that is 1/2 block size. Remember that block size also
+ // includes the BlockHeader. The BlockHeader isn't publicly visible, so we
+ // subtract 100 bytes to be sure. AllocatePostingList will round up from
+ // kHalfBlockPostingListSize to whatever the correct size is.
+ const int kHalfBlockPostingListSize =
+ (flash_index_storage.block_size() - 100) / 2;
+ ICING_ASSERT_OK_AND_ASSIGN(
+ PostingListHolder posting_list_holder1,
+ flash_index_storage.AllocatePostingList(kHalfBlockPostingListSize));
+ // We expect:
+ // 1. FlashIndexStorage will return a valid id.
+ PostingListIdentifier id1 = posting_list_holder1.id;
+ EXPECT_THAT(id1.is_valid(), IsTrue());
+ // 2. The index file should have grown by exactly one flash block.
+ EXPECT_THAT(flash_index_storage.num_blocks(), Eq(2));
+ EXPECT_THAT(flash_index_storage.empty(), IsFalse());
+
+ std::vector<Hit> hits1 = {
+ Hit(/*section_id=*/1, /*document_id=*/0, /*score=*/12),
+ Hit(/*section_id=*/6, /*document_id=*/2, /*score=*/19),
+ Hit(/*section_id=*/5, /*document_id=*/2, /*score=*/100),
+ Hit(/*section_id=*/8, /*document_id=*/5, /*score=*/197)};
+ for (const Hit& hit : hits1) {
+ ICING_ASSERT_OK(posting_list_holder1.posting_list.PrependHit(hit));
+ }
+ EXPECT_THAT(posting_list_holder1.posting_list.GetHits(),
+ IsOkAndHolds(ElementsAreArray(hits1.rbegin(), hits1.rend())));
+
+ // 2. Get another PL. This should be on the same flash block. There should
+ // be no allocation.
+ ICING_ASSERT_OK_AND_ASSIGN(
+ PostingListHolder posting_list_holder2,
+ flash_index_storage.AllocatePostingList(kHalfBlockPostingListSize));
+ // We expect:
+ // 1. FlashIndexStorage will return a valid id.
+ EXPECT_THAT(posting_list_holder2.id.is_valid(), IsTrue());
+ // 2. The index file should not have grown.
+ EXPECT_THAT(flash_index_storage.num_blocks(), Eq(2));
+ EXPECT_THAT(flash_index_storage.empty(), IsFalse());
+
+ std::vector<Hit> hits2 = {
+ Hit(/*section_id=*/4, /*document_id=*/0, /*score=*/12),
+ Hit(/*section_id=*/8, /*document_id=*/4, /*score=*/19),
+ Hit(/*section_id=*/9, /*document_id=*/7, /*score=*/100),
+ Hit(/*section_id=*/6, /*document_id=*/7, /*score=*/197)};
+ for (const Hit& hit : hits2) {
+ ICING_ASSERT_OK(posting_list_holder2.posting_list.PrependHit(hit));
+ }
+ EXPECT_THAT(posting_list_holder2.posting_list.GetHits(),
+ IsOkAndHolds(ElementsAreArray(hits2.rbegin(), hits2.rend())));
+
+ // 3. Now, free the first posting list. This should add it to the free list
+ flash_index_storage.FreePostingList(std::move(posting_list_holder1));
+
+ // 4. Request another posting list. This should NOT grow the index because
+ // the first posting list is free.
+ ICING_ASSERT_OK_AND_ASSIGN(
+ PostingListHolder posting_list_holder3,
+ flash_index_storage.AllocatePostingList(kHalfBlockPostingListSize));
+ // We expect:
+ // 1. FlashIndexStorage will return a valid id.
+ EXPECT_THAT(posting_list_holder3.id.is_valid(), IsTrue());
+ // 2. The index file should not have grown.
+ EXPECT_THAT(flash_index_storage.num_blocks(), Eq(2));
+ EXPECT_THAT(flash_index_storage.empty(), IsFalse());
+ // 3. The returned posting list holder should have the same id as the
+ // first posting list holder.
+ EXPECT_THAT(posting_list_holder3.id.posting_list_index(),
+ Eq(id1.posting_list_index()));
+ EXPECT_THAT(posting_list_holder3.id.block_index(), Eq(id1.block_index()));
+ // Make sure this pl is empty. The hits that used to be there should be
+ // gone.
+ EXPECT_THAT(posting_list_holder3.posting_list.GetHits(),
+ IsOkAndHolds(IsEmpty()));
+ std::vector<Hit> hits3 = {
+ Hit(/*section_id=*/7, /*document_id=*/1, /*score=*/62),
+ Hit(/*section_id=*/12, /*document_id=*/3, /*score=*/45),
+ Hit(/*section_id=*/11, /*document_id=*/18, /*score=*/12),
+ Hit(/*section_id=*/7, /*document_id=*/100, /*score=*/74)};
+ for (const Hit& hit : hits3) {
+ ICING_ASSERT_OK(posting_list_holder3.posting_list.PrependHit(hit));
+ }
+ EXPECT_THAT(posting_list_holder3.posting_list.GetHits(),
+ IsOkAndHolds(ElementsAreArray(hits3.rbegin(), hits3.rend())));
+ }
+ EXPECT_THAT(flash_index_storage.GetDiskUsage(),
+ Eq(2 * flash_index_storage.block_size()));
+}
+
+TEST_F(FlashIndexStorageTest, FreeListInMemoryPersistence) {
+ PostingListIdentifier id1 = PostingListIdentifier::kInvalid;
+ int half_block_posting_list_size = 0;
+ {
+ // Create the header file
+ ICING_ASSERT_OK_AND_ASSIGN(
+ FlashIndexStorage flash_index_storage,
+ FlashIndexStorage::Create(file_name_, &filesystem_));
+
+ {
+ // 1. Request a PL that is 1/2 block size. Remember that block size also
+ // includes the BlockHeader. The BlockHeader isn't publicly visible, so we
+ // subtract 100 bytes to be sure. AllocatePostingList will round up from
+ // kHalfBlockPostingListSize to whatever the correct size is.
+ half_block_posting_list_size = (flash_index_storage.block_size() - 100) / 2;
+ ICING_ASSERT_OK_AND_ASSIGN(
+ PostingListHolder posting_list_holder1,
+ flash_index_storage.AllocatePostingList(half_block_posting_list_size));
+ // We expect:
+ // 1. FlashIndexStorage will return a valid id.
+ id1 = posting_list_holder1.id;
+ EXPECT_THAT(id1.is_valid(), IsTrue());
+ // 2. The index file should have grown by exactly one flash block.
+ EXPECT_THAT(flash_index_storage.num_blocks(), Eq(2));
+ EXPECT_THAT(flash_index_storage.empty(), IsFalse());
+
+ std::vector<Hit> hits1 = {
+ Hit(/*section_id=*/1, /*document_id=*/0, /*score=*/12),
+ Hit(/*section_id=*/6, /*document_id=*/2, /*score=*/19),
+ Hit(/*section_id=*/5, /*document_id=*/2, /*score=*/100),
+ Hit(/*section_id=*/8, /*document_id=*/5, /*score=*/197)};
+ for (const Hit& hit : hits1) {
+ ICING_ASSERT_OK(posting_list_holder1.posting_list.PrependHit(hit));
+ }
+ EXPECT_THAT(posting_list_holder1.posting_list.GetHits(),
+ IsOkAndHolds(ElementsAreArray(hits1.rbegin(), hits1.rend())));
+
+ // 2. Get another PL. This should be on the same flash block. There should
+ // be no allocation.
+ ICING_ASSERT_OK_AND_ASSIGN(
+ PostingListHolder posting_list_holder2,
+ flash_index_storage.AllocatePostingList(half_block_posting_list_size));
+ // We expect:
+ // 1. FlashIndexStorage will return a valid id.
+ EXPECT_THAT(posting_list_holder2.id.is_valid(), IsTrue());
+ // 2. The index file should not have grown.
+ EXPECT_THAT(flash_index_storage.num_blocks(), Eq(2));
+ EXPECT_THAT(flash_index_storage.empty(), IsFalse());
+
+ std::vector<Hit> hits2 = {
+ Hit(/*section_id=*/4, /*document_id=*/0, /*score=*/12),
+ Hit(/*section_id=*/8, /*document_id=*/4, /*score=*/19),
+ Hit(/*section_id=*/9, /*document_id=*/7, /*score=*/100),
+ Hit(/*section_id=*/6, /*document_id=*/7, /*score=*/197)};
+ for (const Hit& hit : hits2) {
+ ICING_ASSERT_OK(posting_list_holder2.posting_list.PrependHit(hit));
+ }
+ EXPECT_THAT(posting_list_holder2.posting_list.GetHits(),
+ IsOkAndHolds(ElementsAreArray(hits2.rbegin(), hits2.rend())));
+
+ // 3. Now, free the first posting list. This should add it to the free list
+ flash_index_storage.FreePostingList(std::move(posting_list_holder1));
+ }
+
+ EXPECT_THAT(flash_index_storage.GetDiskUsage(),
+ Eq(2 * flash_index_storage.block_size()));
+ // 4. The FlashIndexStorage should go out of scope and flush the in-memory
+ // posting list to disk
+ }
+
+ {
+ // Recreate the flash index.
+ ICING_ASSERT_OK_AND_ASSIGN(
+ FlashIndexStorage flash_index_storage,
+ FlashIndexStorage::Create(file_name_, &filesystem_));
+
+ {
+ // 5. Request another posting list. This should NOT grow the index because
+ // the first posting list is free.
+ ICING_ASSERT_OK_AND_ASSIGN(
+ PostingListHolder posting_list_holder3,
+ flash_index_storage.AllocatePostingList(half_block_posting_list_size));
+ // We expect:
+ // 1. FlashIndexStorage will return a valid id.
+ EXPECT_THAT(posting_list_holder3.id.is_valid(), IsTrue());
+ // 2. The index file should not have grown.
+ EXPECT_THAT(flash_index_storage.num_blocks(), Eq(2));
+ EXPECT_THAT(flash_index_storage.empty(), IsFalse());
+ // 3. The returned posting list holder should have the same id as the
+ // first posting list holder.
+ EXPECT_THAT(posting_list_holder3.id.posting_list_index(),
+ Eq(id1.posting_list_index()));
+ EXPECT_THAT(posting_list_holder3.id.block_index(), Eq(id1.block_index()));
+ // Make sure this pl is empty. The hits that used to be there should be
+ // gone.
+ EXPECT_THAT(posting_list_holder3.posting_list.GetHits(),
+ IsOkAndHolds(IsEmpty()));
+ std::vector<Hit> hits3 = {
+ Hit(/*section_id=*/7, /*document_id=*/1, /*score=*/62),
+ Hit(/*section_id=*/12, /*document_id=*/3, /*score=*/45),
+ Hit(/*section_id=*/11, /*document_id=*/18, /*score=*/12),
+ Hit(/*section_id=*/7, /*document_id=*/100, /*score=*/74)};
+ for (const Hit& hit : hits3) {
+ ICING_ASSERT_OK(posting_list_holder3.posting_list.PrependHit(hit));
+ }
+ EXPECT_THAT(posting_list_holder3.posting_list.GetHits(),
+ IsOkAndHolds(ElementsAreArray(hits3.rbegin(), hits3.rend())));
+ }
+ EXPECT_THAT(flash_index_storage.GetDiskUsage(),
+ Eq(2 * flash_index_storage.block_size()));
+ }
+}
+
+TEST_F(FlashIndexStorageTest, DifferentSizedPostingLists) {
+ // Create the header file
+ ICING_ASSERT_OK_AND_ASSIGN(
+ FlashIndexStorage flash_index_storage,
+ FlashIndexStorage::Create(file_name_, &filesystem_));
+ {
+ // 1. Request a PL that is 1/2 block size. Remember that block size also
+ // includes the BlockHeader. The BlockHeader isn't publicly visible, so we
+ // subtract 100 bytes to be sure. AllocatePostingList will round up from
+ // kHalfBlockPostingListSize to whatever the correct size is.
+ const int kHalfBlockPostingListSize =
+ (flash_index_storage.block_size() - 100) / 2;
+ const int kQuarterBlockPostingListSize =
+ (flash_index_storage.block_size() - 100) / 4;
+ ICING_ASSERT_OK_AND_ASSIGN(
+ PostingListHolder posting_list_holder1,
+ flash_index_storage.AllocatePostingList(kHalfBlockPostingListSize));
+ // We expect:
+ // 1. FlashIndexStorage will return a valid id.
+ PostingListIdentifier id1 = posting_list_holder1.id;
+ EXPECT_THAT(id1.is_valid(), IsTrue());
+ // 2. The index file should have grown by exactly one flash block.
+ EXPECT_THAT(flash_index_storage.num_blocks(), Eq(2));
+ EXPECT_THAT(flash_index_storage.empty(), IsFalse());
+
+ std::vector<Hit> hits1 = {
+ Hit(/*section_id=*/1, /*document_id=*/0, /*score=*/12),
+ Hit(/*section_id=*/6, /*document_id=*/2, /*score=*/19),
+ Hit(/*section_id=*/5, /*document_id=*/2, /*score=*/100),
+ Hit(/*section_id=*/8, /*document_id=*/5, /*score=*/197)};
+ for (const Hit& hit : hits1) {
+ ICING_ASSERT_OK(posting_list_holder1.posting_list.PrependHit(hit));
+ }
+ EXPECT_THAT(posting_list_holder1.posting_list.GetHits(),
+ IsOkAndHolds(ElementsAreArray(hits1.rbegin(), hits1.rend())));
+
+ // 2. Get a PL that is 1/4 block size. Even though a 1/4 block PL could
+ // theoretically fit in the same block, we'll allocate a new one because PLs
+ // on a block are required to be the same size.
+ ICING_ASSERT_OK_AND_ASSIGN(
+ PostingListHolder posting_list_holder2,
+ flash_index_storage.AllocatePostingList(kQuarterBlockPostingListSize));
+ // We expect:
+ // 1. FlashIndexStorage will return a valid id.
+ EXPECT_THAT(posting_list_holder2.id.is_valid(), IsTrue());
+ // 2. The index file should have grown by one block.
+ EXPECT_THAT(posting_list_holder2.id.block_index(),
+ Not(Eq(id1.block_index())));
+ EXPECT_THAT(flash_index_storage.num_blocks(), Eq(3));
+ EXPECT_THAT(flash_index_storage.empty(), IsFalse());
+
+ std::vector<Hit> hits2 = {
+ Hit(/*section_id=*/4, /*document_id=*/0, /*score=*/12),
+ Hit(/*section_id=*/8, /*document_id=*/4, /*score=*/19),
+ Hit(/*section_id=*/9, /*document_id=*/7, /*score=*/100),
+ Hit(/*section_id=*/6, /*document_id=*/7, /*score=*/197)};
+ for (const Hit& hit : hits2) {
+ ICING_ASSERT_OK(posting_list_holder2.posting_list.PrependHit(hit));
+ }
+ EXPECT_THAT(posting_list_holder2.posting_list.GetHits(),
+ IsOkAndHolds(ElementsAreArray(hits2.rbegin(), hits2.rend())));
+
+ // 3. Request another 1/4 block-size posting list. This should NOT grow the
+ // index because there should be three free posting lists on block2.
+ ICING_ASSERT_OK_AND_ASSIGN(
+ PostingListHolder posting_list_holder3,
+ flash_index_storage.AllocatePostingList(kQuarterBlockPostingListSize));
+ // We expect:
+ // 1. FlashIndexStorage will return a valid id.
+ EXPECT_THAT(posting_list_holder3.id.is_valid(), IsTrue());
+ // 2. The index file should have remained the same size as before and the
+ // third posting list holder should use the same block as the second
+ // posting list holder.
+ EXPECT_THAT(posting_list_holder3.id.block_index(),
+ Eq(posting_list_holder2.id.block_index()));
+ EXPECT_THAT(flash_index_storage.num_blocks(), Eq(3));
+ EXPECT_THAT(flash_index_storage.empty(), IsFalse());
+ }
+ EXPECT_THAT(flash_index_storage.GetDiskUsage(),
+ Eq(3 * flash_index_storage.block_size()));
+}
+
+TEST_F(FlashIndexStorageTest, AllocateTooLargePostingList) {
+ // Create the header file
+ ICING_ASSERT_OK_AND_ASSIGN(
+ FlashIndexStorage flash_index_storage,
+ FlashIndexStorage::Create(file_name_, &filesystem_));
+
+ // Request a PL that is 2x block size.
+ const int kDoubleBlockSize = flash_index_storage.block_size() * 2;
+ EXPECT_THAT(flash_index_storage.AllocatePostingList(kDoubleBlockSize),
+ StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
+}
+
+} // namespace
+
+} // namespace lib
+} // namespace icing
diff --git a/icing/index/main/index-block.cc b/icing/index/main/index-block.cc
index 9d7df3c..652dbc6 100644
--- a/icing/index/main/index-block.cc
+++ b/icing/index/main/index-block.cc
@@ -105,11 +105,12 @@ IndexBlock::IndexBlock(MemoryMappedFile mmapped_block)
posting_lists_start_ptr_(mmapped_block.mutable_region() +
sizeof(BlockHeader)),
block_size_in_bytes_(mmapped_block.region_size()),
- mmapped_block_(std::move(mmapped_block)) {}
+ mmapped_block_(
+ std::make_unique<MemoryMappedFile>(std::move(mmapped_block))) {}
libtextclassifier3::Status IndexBlock::Reset(int posting_list_bytes) {
- ICING_RETURN_IF_ERROR(ValidatePostingListBytes(posting_list_bytes,
- mmapped_block_.region_size()));
+ ICING_RETURN_IF_ERROR(ValidatePostingListBytes(
+ posting_list_bytes, mmapped_block_->region_size()));
header_->free_list_posting_list_index = kInvalidPostingListIndex;
header_->next_block_index = kInvalidBlockIndex;
header_->posting_list_bytes = posting_list_bytes;
diff --git a/icing/index/main/index-block.h b/icing/index/main/index-block.h
index 1d17e34..edf9a79 100644
--- a/icing/index/main/index-block.h
+++ b/icing/index/main/index-block.h
@@ -20,6 +20,7 @@
#include <algorithm>
#include <limits>
+#include <memory>
#include <string>
#include <unordered_set>
#include <vector>
@@ -95,6 +96,12 @@ class IndexBlock {
IndexBlock(IndexBlock&&) = default;
IndexBlock& operator=(IndexBlock&&) = default;
+ ~IndexBlock() {
+ if (mmapped_block_ != nullptr) {
+ mmapped_block_->PersistToDisk();
+ }
+ }
+
// Instantiate a PostingListUsed at posting_list_index with the existing
// content in the IndexBlock.
//
@@ -206,7 +213,7 @@ class IndexBlock {
uint32_t block_size_in_bytes_;
// MemoryMappedFile used to interact with the underlying flash block.
- MemoryMappedFile mmapped_block_;
+ std::unique_ptr<MemoryMappedFile> mmapped_block_;
};
} // namespace lib
diff --git a/icing/index/main/main-index-merger.cc b/icing/index/main/main-index-merger.cc
new file mode 100644
index 0000000..724cf43
--- /dev/null
+++ b/icing/index/main/main-index-merger.cc
@@ -0,0 +1,225 @@
+// Copyright (C) 2019 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "icing/index/main/main-index-merger.h"
+
+#include <algorithm>
+#include <cstring>
+#include <memory>
+#include <vector>
+
+#include "icing/absl_ports/canonical_errors.h"
+#include "icing/index/lite/term-id-hit-pair.h"
+#include "icing/index/term-id-codec.h"
+#include "icing/legacy/core/icing-string-util.h"
+#include "icing/util/status-macros.h"
+
+namespace icing {
+namespace lib {
+
+namespace {
+
+class HitSelector {
+ public:
+ // Returns whether or not term_id_hit_pair has the same term_id, document_id
+ // and section_id as the previously selected hits.
+ bool IsEquivalentHit(const TermIdHitPair& term_id_hit_pair) {
+ return prev_.term_id() == term_id_hit_pair.term_id() &&
+ prev_.hit().document_id() == term_id_hit_pair.hit().document_id() &&
+ prev_.hit().section_id() == term_id_hit_pair.hit().section_id();
+ }
+
+ // Merges term_id_hit_pair with previously added hits.
+ void SelectIfBetter(const TermIdHitPair& term_id_hit_pair) {
+ if (term_id_hit_pair.hit().is_prefix_hit()) {
+ SelectPrefixHitIfBetter(term_id_hit_pair);
+ } else {
+ SelectExactHitIfBetter(term_id_hit_pair);
+ }
+ prev_ = term_id_hit_pair;
+ }
+
+ // Adds all valid, selected hits to hits starting at position pos in hits.
+ // Returns the offset in hits after the position of the last added hit.
+ // This function may add between 0-2 hits depending on whether the HitSelector
+ // holds both a valid exact hit and a valid prefix hit, one of those or none.
+ size_t InsertSelectedHits(size_t pos, std::vector<TermIdHitPair>* hits) {
+ // Given highest scoring prefix/exact hits for a given
+ // term+docid+sectionid, push needed hits into hits array at offset
+ // pos. Return new pos.
+ if (best_prefix_hit_.hit().is_valid() && best_exact_hit_.hit().is_valid()) {
+ // Output both if scores are unequal. Otherwise only exact hit is
+ // sufficient because 1) they have the same scores and 2) any prefix query
+ // will also accept an exact hit.
+ (*hits)[pos++] = best_exact_hit_;
+ if (best_prefix_hit_.hit().score() != best_exact_hit_.hit().score()) {
+ (*hits)[pos++] = best_prefix_hit_;
+ // Ensure sorted.
+ if (best_prefix_hit_.hit() < best_exact_hit_.hit()) {
+ std::swap((*hits)[pos - 1], (*hits)[pos - 2]);
+ }
+ }
+ } else if (best_prefix_hit_.hit().is_valid()) {
+ (*hits)[pos++] = best_prefix_hit_;
+ } else if (best_exact_hit_.hit().is_valid()) {
+ (*hits)[pos++] = best_exact_hit_;
+ }
+
+ return pos;
+ }
+
+ void Reset() {
+ best_prefix_hit_ = TermIdHitPair();
+ best_exact_hit_ = TermIdHitPair();
+ prev_ = TermIdHitPair();
+ }
+
+ private:
+ void SelectPrefixHitIfBetter(const TermIdHitPair& term_id_hit_pair) {
+ if (!best_prefix_hit_.hit().is_valid() ||
+ best_prefix_hit_.hit().score() < term_id_hit_pair.hit().score()) {
+ best_prefix_hit_ = term_id_hit_pair;
+ }
+ }
+
+ void SelectExactHitIfBetter(const TermIdHitPair& term_id_hit_pair) {
+ if (!best_exact_hit_.hit().is_valid() ||
+ best_exact_hit_.hit().score() < term_id_hit_pair.hit().score()) {
+ best_exact_hit_ = term_id_hit_pair;
+ }
+ }
+
+ TermIdHitPair best_prefix_hit_;
+ TermIdHitPair best_exact_hit_;
+ TermIdHitPair prev_;
+};
+
+// A helper function to dedupe hits stored in hits. Suppose that the lite index
+// contained a single document with two hits in a single prefix section: "foot"
+// and "fool". When expanded, there would be four hits:
+// {"fo", docid0, sectionid0}
+// {"fo", docid0, sectionid0}
+// {"foot", docid0, sectionid0}
+// {"fool", docid0, sectionid0}
+//
+// The first two are duplicates of each other. So, this function will dedupe
+// and shrink hits to be:
+// {"fo", docid0, sectionid0}
+// {"foot", docid0, sectionid0}
+// {"fool", docid0, sectionid0}
+//
+// When duplicates are encountered, we prefer the hit with the highest hit
+// score. If there is both an exact and a prefix hit for the same term, we
+// prefer the exact hit, unless they have different scores, in which case we
+// keep both of them.
+void DedupeHits(std::vector<TermIdHitPair>* hits) {
+ // Now all terms are grouped together and all hits for a term are sorted.
+ // Merge equivalent hits into one.
+ std::sort(hits->begin(), hits->end(),
+ [](const TermIdHitPair& lhs, const TermIdHitPair& rhs) {
+ return lhs.value() < rhs.value();
+ });
+ size_t current_offset = 0;
+ HitSelector hit_selector;
+ for (const TermIdHitPair& term_id_hit_pair : *hits) {
+ if (!hit_selector.IsEquivalentHit(term_id_hit_pair)) {
+ // We've reached a new hit. Insert the previously selected hits that we
+ // had accumulated and reset to add this new hit.
+ current_offset = hit_selector.InsertSelectedHits(current_offset, hits);
+ hit_selector.Reset();
+ }
+ // Update best exact and prefix hit.
+ hit_selector.SelectIfBetter(term_id_hit_pair);
+ }
+
+ // Push last.
+ current_offset = hit_selector.InsertSelectedHits(current_offset, hits);
+
+ hits->resize(current_offset);
+}
+
+// Based on experiments with full prefix expansion, the multiplier
+// is ~4x.
+constexpr int kAvgPrefixesPerTerm = 4;
+
+} // namespace
+
+libtextclassifier3::StatusOr<std::vector<TermIdHitPair>>
+MainIndexMerger::TranslateAndExpandLiteHits(
+ const LiteIndex& lite_index, const TermIdCodec& term_id_codec,
+ const MainIndex::LexiconMergeOutputs& lexicon_merge_outputs) {
+ std::vector<TermIdHitPair> hits;
+ if (lite_index.empty()) {
+ return hits;
+ }
+ // Reserve enough space for the average number of prefixes per term and the
+ // terms themselves.
+ hits.reserve(lite_index.size() * (kAvgPrefixesPerTerm + 1));
+
+ // Translate lite tvis to main tvis.
+ for (const TermIdHitPair& term_id_hit_pair : lite_index) {
+ uint32_t cur_term_id = term_id_hit_pair.term_id();
+ ICING_ASSIGN_OR_RETURN(TermIdCodec::DecodedTermInfo cur_decoded_term,
+ term_id_codec.DecodeTermInfo(cur_term_id));
+ Hit hit(term_id_hit_pair.hit());
+
+ // 1. Translate and push original.
+ auto itr =
+ lexicon_merge_outputs.other_tvi_to_main_tvi.find(cur_decoded_term.tvi);
+ if (itr == lexicon_merge_outputs.other_tvi_to_main_tvi.cend()) {
+ // b/37273773
+ return absl_ports::InternalError(IcingStringUtil::StringPrintf(
+ "Trying to translate lite tvi %u that was never added to the lexicon",
+ cur_decoded_term.tvi));
+ }
+ ICING_ASSIGN_OR_RETURN(uint32_t term_id,
+ term_id_codec.EncodeTvi(itr->second, TviType::MAIN));
+ hits.emplace_back(term_id, hit);
+
+ // 2. Expand hits in prefix sections.
+ if (hit.is_in_prefix_section()) {
+ // Hit was in a prefix section. Push prefixes. Turn on prefix bit.
+ auto itr_prefixes =
+ lexicon_merge_outputs.other_tvi_to_prefix_main_tvis.find(
+ cur_decoded_term.tvi);
+ if (itr_prefixes ==
+ lexicon_merge_outputs.other_tvi_to_prefix_main_tvis.end()) {
+ ICING_VLOG(1) << "No prefix expansion necessary for "
+ << cur_decoded_term.tvi;
+ continue;
+ }
+ // The tvis of all prefixes of this hit's term that appear in the main
+ // lexicon are between [prefix_tvis_buf[offset],
+ // prefix_tvis_buf[offset+len]).
+ size_t offset = itr_prefixes->second.first;
+ size_t len = itr_prefixes->second.second;
+ size_t end_offset = offset + len;
+ Hit prefix_hit(hit.section_id(), hit.document_id(), hit.score(),
+ /*is_in_prefix_section=*/true, /*is_prefix_hit=*/true);
+ for (; offset < end_offset; ++offset) {
+ // Take the tvi (in the main lexicon) of each prefix term.
+ uint32_t prefix_main_tvi =
+ lexicon_merge_outputs.prefix_tvis_buf[offset];
+ // Convert it to a term_id.
+ ICING_ASSIGN_OR_RETURN(
+ uint32_t prefix_term_id,
+ term_id_codec.EncodeTvi(prefix_main_tvi, TviType::MAIN));
+ // Add an element for this prefix TermId and prefix Hit to hits.
+ hits.emplace_back(prefix_term_id, prefix_hit);
+ }
+ }
+ }
+ // 3. Remove any duplicate hits.
+ DedupeHits(&hits);
+ return hits;
+}
+
+} // namespace lib
+} // namespace icing
diff --git a/icing/index/main/main-index-merger.h b/icing/index/main/main-index-merger.h
new file mode 100644
index 0000000..1413a8f
--- /dev/null
+++ b/icing/index/main/main-index-merger.h
@@ -0,0 +1,49 @@
+// Copyright (C) 2019 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef ICING_INDEX_MAIN_MAIN_INDEX_MERGER_H_
+#define ICING_INDEX_MAIN_MAIN_INDEX_MERGER_H_
+
+#include <memory>
+
+#include "icing/text_classifier/lib3/utils/base/statusor.h"
+#include "icing/index/lite/lite-index.h"
+#include "icing/index/main/main-index.h"
+#include "icing/index/term-id-codec.h"
+
+namespace icing {
+namespace lib {
+
+// Class used to merge hits from the lite_index and lite_lexicon into main_index
+// and main_lexicon.
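+//
+// Example usage (a minimal sketch; assumes lexicon_merge_outputs was produced
+// by an earlier MainIndex lexicon-merge step):
+//
+// ICING_ASSIGN_OR_RETURN(
+// std::vector<TermIdHitPair> expanded_hits,
+// MainIndexMerger::TranslateAndExpandLiteHits(
+// lite_index, term_id_codec, lexicon_merge_outputs));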
+class MainIndexMerger {
+ public:
+ // Retrieves all hits in the lite index, translates the term ids of each
+ // LiteIndex::Element and expands prefix hits based on the mapping from
+ // lexicon_merge_outputs.other_tvi_to_prefix_main_tvis.
+ //
+ // RETURNS:
+ // - OK on success
+ // - INVALID_ARGUMENT if one of the elements in the lite index has a term_id
+ // that exceeds the max TermId
+ // - INTERNAL if a lite term has no corresponding main tvi in
+ // lexicon_merge_outputs (i.e. it was never added to the main lexicon)
+ static libtextclassifier3::StatusOr<std::vector<TermIdHitPair>>
+ TranslateAndExpandLiteHits(
+ const LiteIndex& lite_index, const TermIdCodec& term_id_codec,
+ const MainIndex::LexiconMergeOutputs& lexicon_merge_outputs);
+};
+
+} // namespace lib
+} // namespace icing
+
+#endif // ICING_INDEX_MAIN_MAIN_INDEX_MERGER_H_
diff --git a/icing/index/main/main-index-merger_test.cc b/icing/index/main/main-index-merger_test.cc
new file mode 100644
index 0000000..42b3266
--- /dev/null
+++ b/icing/index/main/main-index-merger_test.cc
@@ -0,0 +1,367 @@
+// Copyright (C) 2019 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#include "icing/index/main/main-index-merger.h"
+
+#include "gmock/gmock.h"
+#include "gtest/gtest.h"
+#include "icing/absl_ports/canonical_errors.h"
+#include "icing/file/filesystem.h"
+#include "icing/index/iterator/doc-hit-info-iterator.h"
+#include "icing/index/main/doc-hit-info-iterator-term-main.h"
+#include "icing/index/main/main-index-merger.h"
+#include "icing/index/main/main-index.h"
+#include "icing/index/term-id-codec.h"
+#include "icing/index/term-property-id.h"
+#include "icing/legacy/index/icing-dynamic-trie.h"
+#include "icing/legacy/index/icing-filesystem.h"
+#include "icing/schema/section.h"
+#include "icing/store/namespace-id.h"
+#include "icing/testing/common-matchers.h"
+#include "icing/testing/tmp-directory.h"
+
+namespace icing {
+namespace lib {
+
+namespace {
+
+using ::testing::UnorderedElementsAre;
+
+class MainIndexMergerTest : public testing::Test {
+ protected:
+ void SetUp() override {
+ index_dir_ = GetTestTempDir() + "/test_dir";
+ ASSERT_TRUE(filesystem_.CreateDirectoryRecursively(index_dir_.c_str()));
+
+ std::string lite_index_file_name = index_dir_ + "/test_file.lite-idx.index";
+ LiteIndex::Options options(lite_index_file_name,
+ /*hit_buffer_want_merge_bytes=*/1024 * 1024);
+ ICING_ASSERT_OK_AND_ASSIGN(lite_index_,
+ LiteIndex::Create(options, &icing_filesystem_));
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ term_id_codec_,
+ TermIdCodec::Create(
+ IcingDynamicTrie::max_value_index(IcingDynamicTrie::Options()),
+ IcingDynamicTrie::max_value_index(options.lexicon_options)));
+ }
+
+ void TearDown() override {
+ ASSERT_TRUE(filesystem_.DeleteDirectoryRecursively(index_dir_.c_str()));
+ }
+
+ std::string index_dir_;
+ Filesystem filesystem_;
+ IcingFilesystem icing_filesystem_;
+ std::unique_ptr<LiteIndex> lite_index_;
+ std::unique_ptr<TermIdCodec> term_id_codec_;
+};
+
+constexpr NamespaceId kNamespace0 = 0;
+
+TEST_F(MainIndexMergerTest, TranslateTermNotAdded) {
+ // 1. Index two docs in the Lite Index:
+ // - Doc0 {"foot" is_in_prefix_section=FALSE}
+ // - Doc1 {"fool", is_in_prefix_section=FALSE}
+ ICING_ASSERT_OK_AND_ASSIGN(
+ uint32_t foot_tvi,
+ lite_index_->InsertTerm("foot", TermMatchType::PREFIX, kNamespace0));
+ ICING_ASSERT_OK_AND_ASSIGN(
+ uint32_t foot_term_id,
+ term_id_codec_->EncodeTvi(foot_tvi, TviType::LITE));
+ ICING_ASSERT_OK_AND_ASSIGN(
+ uint32_t fool_tvi,
+ lite_index_->InsertTerm("fool", TermMatchType::PREFIX, kNamespace0));
+ ICING_ASSERT_OK_AND_ASSIGN(
+ uint32_t fool_term_id,
+ term_id_codec_->EncodeTvi(fool_tvi, TviType::LITE));
+
+ Hit doc0_hit(/*section_id=*/0, /*document_id=*/0, /*score=*/57,
+ /*is_in_prefix_section=*/false);
+ ICING_ASSERT_OK(lite_index_->AddHit(foot_term_id, doc0_hit));
+ Hit doc1_hit(/*section_id=*/0, /*document_id=*/1, Hit::kMaxHitScore,
+ /*is_in_prefix_section=*/false);
+ ICING_ASSERT_OK(lite_index_->AddHit(fool_term_id, doc1_hit));
+
+ // 2. Build up a fake LexiconMergeOutputs
+ // This is some made up number that doesn't matter for this test.
+ uint32_t foot_main_tvi = 5;
+
+ // Only create a mapping for 'foot'. Leave out the mapping for 'fool'
+ MainIndex::LexiconMergeOutputs lexicon_outputs;
+ lexicon_outputs.other_tvi_to_main_tvi.emplace(foot_tvi, foot_main_tvi);
+
+ // 3. TranslateAndExpand should fail because 'fool' doesn't have a main tvi
+ // mapping.
+ ASSERT_THAT(MainIndexMerger::TranslateAndExpandLiteHits(
+ *lite_index_, *term_id_codec_, lexicon_outputs),
+ StatusIs(libtextclassifier3::StatusCode::INTERNAL));
+}
+
+TEST_F(MainIndexMergerTest, PrefixExpansion) {
+ // 1. Index two docs in the Lite Index:
+ // - Doc0 {"foot" is_in_prefix_section=FALSE}
+ // - Doc1 {"fool", is_in_prefix_section=TRUE}
+ ICING_ASSERT_OK_AND_ASSIGN(
+ uint32_t foot_tvi,
+ lite_index_->InsertTerm("foot", TermMatchType::PREFIX, kNamespace0));
+ ICING_ASSERT_OK_AND_ASSIGN(
+ uint32_t foot_term_id,
+ term_id_codec_->EncodeTvi(foot_tvi, TviType::LITE));
+ ICING_ASSERT_OK_AND_ASSIGN(
+ uint32_t fool_tvi,
+ lite_index_->InsertTerm("fool", TermMatchType::PREFIX, kNamespace0));
+ ICING_ASSERT_OK_AND_ASSIGN(
+ uint32_t fool_term_id,
+ term_id_codec_->EncodeTvi(fool_tvi, TviType::LITE));
+
+ Hit doc0_hit(/*section_id=*/0, /*document_id=*/0, /*score=*/57,
+ /*is_in_prefix_section=*/false);
+ ICING_ASSERT_OK(lite_index_->AddHit(foot_term_id, doc0_hit));
+ Hit doc1_hit(/*section_id=*/0, /*document_id=*/1, Hit::kMaxHitScore,
+ /*is_in_prefix_section=*/true);
+ ICING_ASSERT_OK(lite_index_->AddHit(fool_term_id, doc1_hit));
+
+ // 2. Build up a fake LexiconMergeOutputs
+ // This is some made up number that doesn't matter for this test.
+ uint32_t foo_main_tvi = 12;
+ ICING_ASSERT_OK_AND_ASSIGN(
+ uint32_t foo_term_id,
+ term_id_codec_->EncodeTvi(foo_main_tvi, TviType::MAIN));
+ Hit doc1_prefix_hit(/*section_id=*/0, /*document_id=*/1, Hit::kMaxHitScore,
+ /*is_in_prefix_section=*/true, /*is_prefix_hit=*/true);
+
+ uint32_t foot_main_tvi = 5;
+ ICING_ASSERT_OK_AND_ASSIGN(
+ uint32_t foot_main_term_id,
+ term_id_codec_->EncodeTvi(foot_main_tvi, TviType::MAIN));
+ uint32_t fool_main_tvi = 10;
+ ICING_ASSERT_OK_AND_ASSIGN(
+ uint32_t fool_main_term_id,
+ term_id_codec_->EncodeTvi(fool_main_tvi, TviType::MAIN));
+
+ MainIndex::LexiconMergeOutputs lexicon_outputs;
+ // Map "fool" to it's prefix hit for "foo".
+ lexicon_outputs.other_tvi_to_prefix_main_tvis.emplace(fool_tvi,
+ std::make_pair(0, 1));
+ lexicon_outputs.prefix_tvis_buf.push_back(foo_main_tvi);
+ lexicon_outputs.other_tvi_to_main_tvi.emplace(foot_tvi, foot_main_tvi);
+ lexicon_outputs.other_tvi_to_main_tvi.emplace(fool_tvi, fool_main_tvi);
+
+  // 3. TranslateAndExpand should:
+ // a. Translate lite term ids to main term ids based on the map
+ // b. Expand 'fool' to have a hit for 'foo'
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::vector<TermIdHitPair> expanded_elts,
+ MainIndexMerger::TranslateAndExpandLiteHits(*lite_index_, *term_id_codec_,
+ lexicon_outputs));
+ EXPECT_THAT(expanded_elts, UnorderedElementsAre(
+ TermIdHitPair(foot_main_term_id, doc0_hit),
+ TermIdHitPair(fool_main_term_id, doc1_hit),
+ TermIdHitPair(foo_term_id, doc1_prefix_hit)));
+}
+
+TEST_F(MainIndexMergerTest, DedupePrefixAndExactWithDifferentScores) {
+ // 1. Index one doc in the Lite Index:
+ // - Doc0 {"foot" "foo" is_in_prefix_section=TRUE}
+ ICING_ASSERT_OK_AND_ASSIGN(
+ uint32_t foot_tvi,
+ lite_index_->InsertTerm("foot", TermMatchType::PREFIX, kNamespace0));
+ ICING_ASSERT_OK_AND_ASSIGN(
+ uint32_t foot_term_id,
+ term_id_codec_->EncodeTvi(foot_tvi, TviType::LITE));
+ ICING_ASSERT_OK_AND_ASSIGN(
+ uint32_t foo_tvi,
+ lite_index_->InsertTerm("foo", TermMatchType::PREFIX, kNamespace0));
+ ICING_ASSERT_OK_AND_ASSIGN(uint32_t foo_term_id,
+ term_id_codec_->EncodeTvi(foo_tvi, TviType::LITE));
+
+ Hit foot_doc0_hit(/*section_id=*/0, /*document_id=*/0, /*score=*/57,
+ /*is_in_prefix_section=*/true);
+ ICING_ASSERT_OK(lite_index_->AddHit(foot_term_id, foot_doc0_hit));
+ Hit foo_doc0_hit(/*section_id=*/0, /*document_id=*/0, Hit::kMaxHitScore,
+ /*is_in_prefix_section=*/true);
+ ICING_ASSERT_OK(lite_index_->AddHit(foo_term_id, foo_doc0_hit));
+
+ // 2. Build up a fake LexiconMergeOutputs
+ // This is some made up number that doesn't matter for this test.
+ uint32_t foo_main_tvi = 12;
+ ICING_ASSERT_OK_AND_ASSIGN(
+ uint32_t foo_main_term_id,
+ term_id_codec_->EncodeTvi(foo_main_tvi, TviType::MAIN));
+ // The prefix hit for 'foot' should have the same score as the exact hit for
+ // 'foot'.
+ Hit doc0_prefix_hit(/*section_id=*/0, /*document_id=*/0, /*score=*/57,
+ /*is_in_prefix_section=*/true, /*is_prefix_hit=*/true);
+
+ uint32_t foot_main_tvi = 5;
+ ICING_ASSERT_OK_AND_ASSIGN(
+ uint32_t foot_main_term_id,
+ term_id_codec_->EncodeTvi(foot_main_tvi, TviType::MAIN));
+
+ MainIndex::LexiconMergeOutputs lexicon_outputs;
+ // Map "foot" to it's prefix hit for "foo".
+ lexicon_outputs.other_tvi_to_prefix_main_tvis.emplace(foot_tvi,
+ std::make_pair(0, 1));
+ lexicon_outputs.prefix_tvis_buf.push_back(foo_main_tvi);
+ lexicon_outputs.other_tvi_to_main_tvi.emplace(foot_tvi, foot_main_tvi);
+ lexicon_outputs.other_tvi_to_main_tvi.emplace(foo_tvi, foo_main_tvi);
+
+  // 3. TranslateAndExpand should:
+ // a. Translate lite term ids to main term ids based on the map
+ // b. Expand 'foot' to have a hit for 'foo'
+ // c. Keep both the exact hit for 'foo' and the prefix hit for 'foot'
+ // because they have different scores.
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::vector<TermIdHitPair> expanded_elts,
+ MainIndexMerger::TranslateAndExpandLiteHits(*lite_index_, *term_id_codec_,
+ lexicon_outputs));
+ EXPECT_THAT(
+ expanded_elts,
+ UnorderedElementsAre(TermIdHitPair(foot_main_term_id, foot_doc0_hit),
+ TermIdHitPair(foo_main_term_id, foo_doc0_hit),
+ TermIdHitPair(foo_main_term_id, doc0_prefix_hit)));
+}
+
+TEST_F(MainIndexMergerTest, DedupeWithExactSameScores) {
+ // 1. Index one doc in the Lite Index:
+ // - Doc0 {"foot" "foo" is_in_prefix_section=TRUE}
+ ICING_ASSERT_OK_AND_ASSIGN(
+ uint32_t foot_tvi,
+ lite_index_->InsertTerm("foot", TermMatchType::PREFIX, kNamespace0));
+ ICING_ASSERT_OK_AND_ASSIGN(
+ uint32_t foot_term_id,
+ term_id_codec_->EncodeTvi(foot_tvi, TviType::LITE));
+ ICING_ASSERT_OK_AND_ASSIGN(
+ uint32_t foo_tvi,
+ lite_index_->InsertTerm("foo", TermMatchType::PREFIX, kNamespace0));
+ ICING_ASSERT_OK_AND_ASSIGN(uint32_t foo_term_id,
+ term_id_codec_->EncodeTvi(foo_tvi, TviType::LITE));
+
+ Hit foot_doc0_hit(/*section_id=*/0, /*document_id=*/0, /*score=*/57,
+ /*is_in_prefix_section=*/true);
+ ICING_ASSERT_OK(lite_index_->AddHit(foot_term_id, foot_doc0_hit));
+ Hit foo_doc0_hit(/*section_id=*/0, /*document_id=*/0, /*score=*/57,
+ /*is_in_prefix_section=*/true);
+ ICING_ASSERT_OK(lite_index_->AddHit(foo_term_id, foo_doc0_hit));
+
+ // 2. Build up a fake LexiconMergeOutputs
+ // This is some made up number that doesn't matter for this test.
+ uint32_t foo_main_tvi = 12;
+ ICING_ASSERT_OK_AND_ASSIGN(
+ uint32_t foo_main_term_id,
+ term_id_codec_->EncodeTvi(foo_main_tvi, TviType::MAIN));
+
+ uint32_t foot_main_tvi = 5;
+ ICING_ASSERT_OK_AND_ASSIGN(
+ uint32_t foot_main_term_id,
+ term_id_codec_->EncodeTvi(foot_main_tvi, TviType::MAIN));
+
+ MainIndex::LexiconMergeOutputs lexicon_outputs;
+ // Map "foot" to it's prefix hit for "foo".
+ lexicon_outputs.other_tvi_to_prefix_main_tvis.emplace(foot_tvi,
+ std::make_pair(0, 1));
+ lexicon_outputs.prefix_tvis_buf.push_back(foo_main_tvi);
+ lexicon_outputs.other_tvi_to_main_tvi.emplace(foot_tvi, foot_main_tvi);
+ lexicon_outputs.other_tvi_to_main_tvi.emplace(foo_tvi, foo_main_tvi);
+
+  // 3. TranslateAndExpand should:
+ // a. Translate lite term ids to main term ids based on the map
+ // b. Expand 'foot' to have a hit for 'foo'
+ // c. Keep only the exact hit for 'foo' since they both have the same hit
+ // score.
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::vector<TermIdHitPair> expanded_elts,
+ MainIndexMerger::TranslateAndExpandLiteHits(*lite_index_, *term_id_codec_,
+ lexicon_outputs));
+ EXPECT_THAT(
+ expanded_elts,
+ UnorderedElementsAre(TermIdHitPair(foot_main_term_id, foot_doc0_hit),
+ TermIdHitPair(foo_main_term_id, foo_doc0_hit)));
+}
+
+TEST_F(MainIndexMergerTest, DedupePrefixExpansion) {
+ // 1. Index one doc in the Lite Index:
+ // - Doc0 {"foot" "fool" is_in_prefix_section=TRUE}
+ ICING_ASSERT_OK_AND_ASSIGN(
+ uint32_t foot_tvi,
+ lite_index_->InsertTerm("foot", TermMatchType::PREFIX, kNamespace0));
+ ICING_ASSERT_OK_AND_ASSIGN(
+ uint32_t foot_term_id,
+ term_id_codec_->EncodeTvi(foot_tvi, TviType::LITE));
+ ICING_ASSERT_OK_AND_ASSIGN(
+ uint32_t fool_tvi,
+ lite_index_->InsertTerm("fool", TermMatchType::PREFIX, kNamespace0));
+ ICING_ASSERT_OK_AND_ASSIGN(
+ uint32_t fool_term_id,
+ term_id_codec_->EncodeTvi(fool_tvi, TviType::LITE));
+
+ Hit foot_doc0_hit(/*section_id=*/0, /*document_id=*/0, /*score=*/57,
+ /*is_in_prefix_section=*/true);
+ ICING_ASSERT_OK(lite_index_->AddHit(foot_term_id, foot_doc0_hit));
+ Hit fool_doc0_hit(/*section_id=*/0, /*document_id=*/0, Hit::kMaxHitScore,
+ /*is_in_prefix_section=*/true);
+ ICING_ASSERT_OK(lite_index_->AddHit(fool_term_id, fool_doc0_hit));
+
+ // 2. Build up a fake LexiconMergeOutputs
+ // This is some made up number that doesn't matter for this test.
+ uint32_t foo_main_tvi = 12;
+ ICING_ASSERT_OK_AND_ASSIGN(
+ uint32_t foo_term_id,
+ term_id_codec_->EncodeTvi(foo_main_tvi, TviType::MAIN));
+ // The prefix hit should take the best score - MaxHitScore when merging these
+ // two.
+ Hit doc0_prefix_hit(/*section_id=*/0, /*document_id=*/0, Hit::kMaxHitScore,
+ /*is_in_prefix_section=*/true, /*is_prefix_hit=*/true);
+
+ uint32_t foot_main_tvi = 5;
+ ICING_ASSERT_OK_AND_ASSIGN(
+ uint32_t foot_main_term_id,
+ term_id_codec_->EncodeTvi(foot_main_tvi, TviType::MAIN));
+ uint32_t fool_main_tvi = 10;
+ ICING_ASSERT_OK_AND_ASSIGN(
+ uint32_t fool_main_term_id,
+ term_id_codec_->EncodeTvi(fool_main_tvi, TviType::MAIN));
+
+ MainIndex::LexiconMergeOutputs lexicon_outputs;
+ // Map "fool" to it's prefix hit for "foo" and "foot" to it's prefix hit for
+ // "foo".
+ lexicon_outputs.other_tvi_to_prefix_main_tvis.emplace(fool_tvi,
+ std::make_pair(0, 1));
+ lexicon_outputs.prefix_tvis_buf.push_back(foo_main_tvi);
+ lexicon_outputs.other_tvi_to_prefix_main_tvis.emplace(foot_tvi,
+ std::make_pair(1, 1));
+ lexicon_outputs.prefix_tvis_buf.push_back(foo_main_tvi);
+ lexicon_outputs.other_tvi_to_main_tvi.emplace(foot_tvi, foot_main_tvi);
+ lexicon_outputs.other_tvi_to_main_tvi.emplace(fool_tvi, fool_main_tvi);
+
+  // 3. TranslateAndExpand should:
+ // a. Translate lite term ids to main term ids based on the map
+ // b. Expand 'foot' and 'fool' to have hits for 'foo'
+ // c. Merge the prefix hits from 'foot' and 'fool', taking the best hit
+ // score.
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::vector<TermIdHitPair> expanded_elts,
+ MainIndexMerger::TranslateAndExpandLiteHits(*lite_index_, *term_id_codec_,
+ lexicon_outputs));
+ EXPECT_THAT(
+ expanded_elts,
+ UnorderedElementsAre(TermIdHitPair(foot_main_term_id, foot_doc0_hit),
+ TermIdHitPair(fool_main_term_id, fool_doc0_hit),
+ TermIdHitPair(foo_term_id, doc0_prefix_hit)));
+}
+
+} // namespace
+
+} // namespace lib
+} // namespace icing
diff --git a/icing/index/main/main-index.cc b/icing/index/main/main-index.cc
new file mode 100644
index 0000000..a0297c2
--- /dev/null
+++ b/icing/index/main/main-index.cc
@@ -0,0 +1,497 @@
+// Copyright (C) 2019 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#include "icing/index/main/main-index.h"
+
+#include <cstring>
+#include <memory>
+
+#include "icing/absl_ports/canonical_errors.h"
+#include "icing/absl_ports/str_cat.h"
+#include "icing/index/term-id-codec.h"
+#include "icing/index/term-property-id.h"
+#include "icing/legacy/index/icing-dynamic-trie.h"
+#include "icing/util/status-macros.h"
+
+namespace icing {
+namespace lib {
+
+namespace {
+
+struct FindTermResult {
+  // TVI of the term that was found. Undefined if found=false.
+  uint32_t tvi;
+  // Whether or not a valid term with prefix hits was found.
+  bool found;
+  // Whether or not that term is equal to 'prefix'.
+  bool exact;
+};
+
+// Finds the shortest valid prefix term with prefix hits in the lexicon for
+// which "prefix" is a prefix.
+// Returns a valid FindTermResult with found=true if either:
+//   1. prefix exists as a term in the lexicon.
+//   2. the shortest valid prefix in the lexicon exists and contains prefix
+//      hits.
+// Returns a FindTermResult with found=false and undefined values of tvi and
+// exact if no term was found.
+FindTermResult FindShortestValidTermWithPrefixHits(
+    const IcingDynamicTrie* lexicon, const std::string& prefix) {
+ // For prefix indexing: when we are doing a prefix match for "prefix", find
+ // the tvi to the equivalent posting list. prefix's own posting list might not
+ // exist but one of its children acts as a proxy.
+ IcingDynamicTrie::PropertyReader hits_in_prefix_section(
+ *lexicon, GetHasHitsInPrefixSectionPropertyId());
+ uint32_t tvi = 0;
+ bool found = false;
+ bool exact = false;
+ for (IcingDynamicTrie::Iterator it(*lexicon, prefix.c_str()); it.IsValid();
+ it.Advance()) {
+ PostingListIdentifier posting_list_id = PostingListIdentifier::kInvalid;
+ memcpy(&posting_list_id, it.GetValue(), sizeof(posting_list_id));
+
+ // Posting list id might be invalid if this is also a backfill term.
+    // Suppose that the main index has two pre-existing terms with prefix
+    // hits, "foot" and "fool" - it will have a branch point posting list for
+    // "foo". Then suppose that the other index adds hits for "foul", "four"
+    // and "far". This will result in backfill branch points for "fo" and "f".
+ // If "fo" was added before "f", then the iterator would first give us "fo".
+ // "fo" will have an invalid posting_list_id because it hasn't been
+ // backfilled yet, so we need to continue iterating to "foo".
+ if (posting_list_id.is_valid()) {
+ exact = (prefix.size() == strlen(it.GetKey()));
+ tvi = it.GetValueIndex();
+ // Found it. Does it have prefix hits?
+ found = exact || hits_in_prefix_section.HasProperty(tvi);
+ break;
+ }
+ }
+ FindTermResult result = {tvi, found, exact};
+ return result;
+}
+
+} // namespace
+
+libtextclassifier3::StatusOr<MainIndex> MainIndex::Create(
+ const std::string& index_filename, const Filesystem* filesystem,
+ const IcingFilesystem* icing_filesystem) {
+ ICING_RETURN_ERROR_IF_NULL(filesystem);
+ ICING_RETURN_ERROR_IF_NULL(icing_filesystem);
+ MainIndex main_index;
+ ICING_RETURN_IF_ERROR(
+ main_index.Init(index_filename, filesystem, icing_filesystem));
+ return main_index;
+}
+
+// TODO(b/139087650) : Migrate off of IcingFilesystem.
+libtextclassifier3::Status MainIndex::Init(
+ const std::string& index_filename, const Filesystem* filesystem,
+ const IcingFilesystem* icing_filesystem) {
+ std::string flash_index_file = index_filename + "-main-index";
+ ICING_ASSIGN_OR_RETURN(
+ FlashIndexStorage flash_index,
+ FlashIndexStorage::Create(flash_index_file, filesystem));
+ flash_index_storage_ =
+ std::make_unique<FlashIndexStorage>(std::move(flash_index));
+
+ std::string lexicon_file = index_filename + "-main-lexicon";
+ IcingDynamicTrie::RuntimeOptions runtime_options;
+ main_lexicon_ = std::make_unique<IcingDynamicTrie>(
+ lexicon_file, runtime_options, icing_filesystem);
+ IcingDynamicTrie::Options lexicon_options;
+ if (!main_lexicon_->CreateIfNotExist(lexicon_options) ||
+ !main_lexicon_->Init()) {
+ return absl_ports::InternalError("Failed to initialize lexicon trie");
+ }
+ return libtextclassifier3::Status::OK;
+}
+
+libtextclassifier3::StatusOr<std::unique_ptr<PostingListAccessor>>
+MainIndex::GetAccessorForExactTerm(const std::string& term) {
+ PostingListIdentifier posting_list_id = PostingListIdentifier::kInvalid;
+ if (!main_lexicon_->Find(term.c_str(), &posting_list_id)) {
+ return absl_ports::NotFoundError(IcingStringUtil::StringPrintf(
+ "Term %s is not present in main lexicon.", term.c_str()));
+ }
+ ICING_ASSIGN_OR_RETURN(PostingListAccessor accessor,
+ PostingListAccessor::CreateFromExisting(
+ flash_index_storage_.get(), posting_list_id));
+ return std::make_unique<PostingListAccessor>(std::move(accessor));
+}
+
+libtextclassifier3::StatusOr<MainIndex::GetPrefixAccessorResult>
+MainIndex::GetAccessorForPrefixTerm(const std::string& prefix) {
+ bool exact = false;
+ // For prefix indexing: when we are doing a prefix match for
+ // "prefix", find the tvi to the equivalent posting list. prefix's
+ // own posting list might not exist but its shortest child acts as a proxy.
+ //
+ // For example, if there are only two hits in the index are prefix hits for
+ // "bar" and "bat", then both will appear on a posting list for "ba". "b"
+ // won't have a posting list, but "ba" will suffice.
+ IcingDynamicTrie::PropertyReader hits_in_prefix_section(
+ *main_lexicon_, GetHasHitsInPrefixSectionPropertyId());
+ IcingDynamicTrie::Iterator main_itr(*main_lexicon_, prefix.c_str());
+ if (!main_itr.IsValid()) {
+ return absl_ports::NotFoundError(IcingStringUtil::StringPrintf(
+ "Term: %s is not present in the main lexicon.", prefix.c_str()));
+ }
+ exact = (prefix.length() == strlen(main_itr.GetKey()));
+
+ if (!exact && !hits_in_prefix_section.HasProperty(main_itr.GetValueIndex())) {
+    // Found it, but it doesn't have prefix hits. Exit early. No need to
+    // retrieve the posting list because there's nothing there for us.
+    return absl_ports::NotFoundError(IcingStringUtil::StringPrintf(
+        "Term: %s is present in the main lexicon, but has no prefix hits.",
+        prefix.c_str()));
+ }
+ PostingListIdentifier posting_list_id = PostingListIdentifier::kInvalid;
+ memcpy(&posting_list_id, main_itr.GetValue(), sizeof(posting_list_id));
+ ICING_ASSIGN_OR_RETURN(PostingListAccessor pl_accessor,
+ PostingListAccessor::CreateFromExisting(
+ flash_index_storage_.get(), posting_list_id));
+  GetPrefixAccessorResult result = {
+      std::make_unique<PostingListAccessor>(std::move(pl_accessor)), exact};
+ return result;
+}
+
+libtextclassifier3::StatusOr<MainIndex::LexiconMergeOutputs>
+MainIndex::AddBackfillBranchPoints(const IcingDynamicTrie& other_lexicon) {
+ // Maps new branching points in main lexicon to the term such that
+ // branching_point_term is a prefix of term and there are no terms smaller
+ // than term and greater than branching_point_term.
+ std::string prefix;
+ LexiconMergeOutputs outputs;
+ for (IcingDynamicTrie::Iterator other_term_itr(other_lexicon, /*prefix=*/"");
+ other_term_itr.IsValid(); other_term_itr.Advance()) {
+ // If term were inserted in the main lexicon, what new branching would it
+ // create? (It always creates at most one.)
+ int prefix_len = main_lexicon_->FindNewBranchingPrefixLength(
+ other_term_itr.GetKey(), /*utf8=*/true);
+ if (prefix_len <= 0) {
+ continue;
+ }
+ prefix.assign(other_term_itr.GetKey(), prefix_len);
+
+ // Figure out backfill tvi. Might not exist since all children terms could
+ // only contain hits from non-prefix sections.
+ //
+ // Ex. Suppose that the main lexicon contains "foot" and "fool" and that
+ // we're adding "foul". The new branching prefix will be "fo". The backfill
+ // prefix will be "foo" - all hits in prefix section on "foo" will need to
+ // be added to the new "fo" posting list later.
+ FindTermResult result =
+ FindShortestValidTermWithPrefixHits(main_lexicon_.get(), prefix);
+ if (!result.found || result.exact) {
+ continue;
+ }
+
+    // This is a new prefix that will need backfilling from its next-in-line
+    // posting list. This new prefix will eventually need a posting list of
+    // its own, so insert an invalid PostingListIdentifier as a placeholder.
+ uint32_t branching_prefix_tvi;
+ bool new_key;
+ PostingListIdentifier posting_list_id = PostingListIdentifier::kInvalid;
+ if (!main_lexicon_->Insert(prefix.c_str(), &posting_list_id,
+                               &branching_prefix_tvi, /*replace=*/false,
+                               &new_key)) {
+ return absl_ports::InternalError("Could not insert branching prefix");
+ }
+
+    // Backfill terms only have prefix hits by default, so set both properties
+    // here; they may be overridden when hits from the other index are added
+    // later.
+ if (!main_lexicon_->SetProperty(branching_prefix_tvi,
+ GetHasNoExactHitsPropertyId()) ||
+ !main_lexicon_->SetProperty(branching_prefix_tvi,
+ GetHasHitsInPrefixSectionPropertyId())) {
+ return absl_ports::InternalError("Setting prefix prop failed");
+ }
+
+ outputs.backfill_map[branching_prefix_tvi] = result.tvi;
+ }
+ return outputs;
+}
+
+libtextclassifier3::StatusOr<MainIndex::LexiconMergeOutputs>
+MainIndex::AddTerms(const IcingDynamicTrie& other_lexicon,
+ LexiconMergeOutputs&& outputs) {
+ IcingDynamicTrie::PropertyReadersAll new_term_prop_readers(other_lexicon);
+ for (IcingDynamicTrie::Iterator other_term_itr(other_lexicon, /*prefix=*/"");
+ other_term_itr.IsValid(); other_term_itr.Advance()) {
+ uint32_t new_main_tvi;
+ PostingListIdentifier posting_list_id = PostingListIdentifier::kInvalid;
+ if (!main_lexicon_->Insert(other_term_itr.GetKey(), &posting_list_id,
+ &new_main_tvi,
+ /*replace=*/false)) {
+ return absl_ports::InternalError(absl_ports::StrCat(
+ "Could not insert term: ", other_term_itr.GetKey()));
+ }
+
+ // Copy the properties from the other lexicon over to the main lexicon.
+ uint32_t other_tvi = other_term_itr.GetValueIndex();
+ if (!CopyProperties(new_term_prop_readers, other_lexicon, other_tvi,
+ new_main_tvi)) {
+ return absl_ports::InternalError(absl_ports::StrCat(
+ "Could not insert term: ", other_term_itr.GetKey()));
+ }
+
+ // Add other to main mapping.
+ outputs.other_tvi_to_main_tvi.emplace(other_tvi, new_main_tvi);
+ }
+ return std::move(outputs);
+}
+
+libtextclassifier3::StatusOr<MainIndex::LexiconMergeOutputs>
+MainIndex::AddBranchPoints(const IcingDynamicTrie& other_lexicon,
+ LexiconMergeOutputs&& outputs) {
+ IcingDynamicTrie::PropertyReader has_prefix_prop_reader(
+ other_lexicon, GetHasHitsInPrefixSectionPropertyId());
+ if (!has_prefix_prop_reader.Exists()) {
+ return std::move(outputs);
+ }
+ std::string prefix;
+ for (IcingDynamicTrie::Iterator other_term_itr(other_lexicon, /*prefix=*/"");
+ other_term_itr.IsValid(); other_term_itr.Advance()) {
+ // Only expand terms that have hits in prefix sections.
+ if (!has_prefix_prop_reader.HasProperty(other_term_itr.GetValueIndex())) {
+ continue;
+ }
+
+ // Get prefixes where there is already a branching point in the main
+ // lexicon. We skip prefixes which don't already have a branching point.
+ std::vector<int> prefix_lengths = main_lexicon_->FindBranchingPrefixLengths(
+ other_term_itr.GetKey(), /*utf8=*/true);
+
+ int buf_start = outputs.prefix_tvis_buf.size();
+ // Add prefixes.
+ for (int prefix_length : prefix_lengths) {
+ if (prefix_length <= 0) {
+ continue;
+ }
+
+ prefix.assign(other_term_itr.GetKey(), prefix_length);
+ uint32_t prefix_tvi;
+ bool new_key;
+ PostingListIdentifier posting_list_identifier =
+ PostingListIdentifier::kInvalid;
+ if (!main_lexicon_->Insert(prefix.c_str(), &posting_list_identifier,
+ &prefix_tvi, /*replace=*/false, &new_key)) {
+ return absl_ports::InternalError(
+ absl_ports::StrCat("Could not insert prefix: ", prefix));
+ }
+
+ // Prefix tvi will have hits in prefix section.
+ if (!main_lexicon_->SetProperty(prefix_tvi,
+ GetHasHitsInPrefixSectionPropertyId())) {
+ return absl_ports::InternalError(
+ "Setting has hits in prefix section prop failed");
+ }
+
+ // If it hasn't been added by non-prefix term insertions in
+ // AddBackfillBranchPoints and AddTerms, it is a prefix-only term.
+ if (new_key && !main_lexicon_->SetProperty(
+ prefix_tvi, GetHasNoExactHitsPropertyId())) {
+ return absl_ports::InternalError("Setting no exact hits prop failed");
+ }
+
+ outputs.prefix_tvis_buf.push_back(prefix_tvi);
+ }
+
+ // Any prefixes added? Then add to map.
+ if (buf_start < outputs.prefix_tvis_buf.size()) {
+ outputs.other_tvi_to_prefix_main_tvis[other_term_itr.GetValueIndex()] = {
+ buf_start, outputs.prefix_tvis_buf.size() - buf_start};
+ }
+ }
+ return std::move(outputs);
+}
+
+bool MainIndex::CopyProperties(
+ const IcingDynamicTrie::PropertyReadersAll& prop_reader,
+ const IcingDynamicTrie& other_lexicon, uint32_t other_tvi,
+ uint32_t new_main_tvi) {
+ for (uint32_t property_id = 0; property_id < prop_reader.size();
+ ++property_id) {
+ if (property_id == GetHasNoExactHitsPropertyId()) {
+ // HasNoExactHitsProperty is an inverse. If other_lexicon has exact hits
+ // for this term, then HasNoExactHits needs to be set to false in
+ // main_lexicon. If other_lexicon has no exact hits for this term, then
+ // HasNoExactHits in the main_lexicon should not be modified.
+ if (!prop_reader.HasProperty(property_id, other_tvi) &&
+ !main_lexicon_->ClearProperty(new_main_tvi, property_id)) {
+ ICING_LOG(ERROR) << "Clearing HasNoExactHitsProperty failed";
+ return false;
+ }
+ } else {
+ // If other_lexicon has this property set for this term, then that
+ // property needs to be set for the main_lexicon. If other_lexicon
+ // doesn't have this property set, then the property in the main lexicon
+ // should not be modified.
+ if (prop_reader.HasProperty(property_id, other_tvi) &&
+ !main_lexicon_->SetProperty(new_main_tvi, property_id)) {
+ return false;
+ }
+ }
+ }
+ return true;
+}
+
+libtextclassifier3::Status MainIndex::AddHits(
+ const TermIdCodec& term_id_codec,
+ std::unordered_map<uint32_t, uint32_t>&& backfill_map,
+ std::vector<TermIdHitPair>&& hits) {
+ if (hits.empty()) {
+ return libtextclassifier3::Status::OK;
+ }
+ uint32_t cur_term_id = hits[0].term_id();
+ ICING_ASSIGN_OR_RETURN(TermIdCodec::DecodedTermInfo cur_decoded_term,
+ term_id_codec.DecodeTermInfo(cur_term_id));
+ // Iterate through all hits. If these hits are for a term that also needs
+ // backfill, then backfill first and then add the new hits.
+ size_t k_start = 0;
+ size_t k_end = 0;
+ while (k_start < hits.size()) {
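+    // Advance k_end one past the last hit that shares cur_term_id so that
+    // [k_start, k_end) spans exactly the hits for cur_term_id.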
+ uint32_t term_id = hits[k_end].term_id();
+ while (term_id == cur_term_id && ++k_end < hits.size()) {
+ term_id = hits[k_end].term_id();
+ }
+
+ // Look for backfill.
+ PostingListIdentifier backfill_posting_list_id =
+ PostingListIdentifier::kInvalid;
+ auto itr = backfill_map.find(cur_decoded_term.tvi);
+ if (itr != backfill_map.end()) {
+ const void* value = main_lexicon_->GetValueAtIndex(itr->second);
+ memcpy(&backfill_posting_list_id, value,
+ sizeof(backfill_posting_list_id));
+ backfill_map.erase(itr);
+ }
+ ICING_RETURN_IF_ERROR(AddHitsForTerm(cur_decoded_term.tvi,
+ backfill_posting_list_id,
+ &hits[k_start], k_end - k_start));
+ cur_term_id = term_id;
+ ICING_ASSIGN_OR_RETURN(cur_decoded_term,
+ term_id_codec.DecodeTermInfo(cur_term_id));
+ k_start = k_end;
+ }
+
+ // Now copy remaining backfills.
+ ICING_VLOG(2) << IcingStringUtil::StringPrintf("Remaining backfills %zu",
+ backfill_map.size());
+  for (const auto& other_tvi_main_tvi_pair : backfill_map) {
+ PostingListIdentifier backfill_posting_list_id =
+ PostingListIdentifier::kInvalid;
+ memcpy(&backfill_posting_list_id,
+ main_lexicon_->GetValueAtIndex(other_tvi_main_tvi_pair.second),
+ sizeof(backfill_posting_list_id));
+ ICING_ASSIGN_OR_RETURN(
+ PostingListAccessor hit_accum,
+ PostingListAccessor::Create(flash_index_storage_.get()));
+ ICING_RETURN_IF_ERROR(
+ AddPrefixBackfillHits(backfill_posting_list_id, &hit_accum));
+ PostingListAccessor::FinalizeResult result =
+ PostingListAccessor::Finalize(std::move(hit_accum));
+ if (result.id.is_valid()) {
+ main_lexicon_->SetValueAtIndex(other_tvi_main_tvi_pair.first, &result.id);
+ }
+ }
+ return libtextclassifier3::Status::OK;
+}
+
+libtextclassifier3::Status MainIndex::AddHitsForTerm(
+ uint32_t tvi, PostingListIdentifier backfill_posting_list_id,
+ const TermIdHitPair* hit_elements, size_t len) {
+ // 1. Create a PostingListAccessor - either from the pre-existing block, if
+ // one exists, or from scratch.
+ PostingListIdentifier posting_list_id = PostingListIdentifier::kInvalid;
+ memcpy(&posting_list_id, main_lexicon_->GetValueAtIndex(tvi),
+ sizeof(posting_list_id));
+ std::unique_ptr<PostingListAccessor> pl_accessor;
+ if (posting_list_id.is_valid()) {
+ if (posting_list_id.block_index() >= flash_index_storage_->num_blocks()) {
+ ICING_LOG(ERROR) << IcingStringUtil::StringPrintf(
+ "Index dropped hits. Invalid block index %u >= %u",
+ posting_list_id.block_index(), flash_index_storage_->num_blocks());
+ // TODO(b/159918304) : Consider revising the checksumming strategy in the
+ // main index. Providing some mechanism to check for corruption - either
+ // during initialization or some later time would allow us to avoid
+ // whack-a-mole with odd corruption issues like this one (b/62820689).
+ return absl_ports::InternalError(
+ "Valid posting list has an invalid block index!");
+ }
+ ICING_ASSIGN_OR_RETURN(PostingListAccessor tmp,
+ PostingListAccessor::CreateFromExisting(
+ flash_index_storage_.get(), posting_list_id));
+ pl_accessor = std::make_unique<PostingListAccessor>(std::move(tmp));
+ } else {
+ // New posting list.
+ ICING_ASSIGN_OR_RETURN(
+ PostingListAccessor tmp,
+ PostingListAccessor::Create(flash_index_storage_.get()));
+ pl_accessor = std::make_unique<PostingListAccessor>(std::move(tmp));
+ }
+
+ // 2. Backfill any hits if necessary.
+ if (backfill_posting_list_id.is_valid()) {
+ ICING_RETURN_IF_ERROR(
+ AddPrefixBackfillHits(backfill_posting_list_id, pl_accessor.get()));
+ }
+
+ // 3. Add all the new hits.
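+  // hit_elements is sorted in descending document id order, so iterate
+  // backwards to prepend hits in ascending document id order.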
+ for (int i = len - 1; i >= 0; --i) {
+ Hit hit = hit_elements[i].hit();
+ ICING_RETURN_IF_ERROR(pl_accessor->PrependHit(hit));
+ }
+
+ // 4. Finalize this posting list and put its identifier in the lexicon.
+ PostingListAccessor::FinalizeResult result =
+ PostingListAccessor::Finalize(std::move(*pl_accessor));
+ if (result.id.is_valid()) {
+ main_lexicon_->SetValueAtIndex(tvi, &result.id);
+ }
+ return libtextclassifier3::Status::OK;
+}
+
+libtextclassifier3::Status MainIndex::AddPrefixBackfillHits(
+ PostingListIdentifier backfill_posting_list_id,
+ PostingListAccessor* hit_accum) {
+ ICING_ASSIGN_OR_RETURN(
+ PostingListAccessor backfill_accessor,
+ PostingListAccessor::CreateFromExisting(flash_index_storage_.get(),
+ backfill_posting_list_id));
+ std::vector<Hit> backfill_hits;
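+  // Drain the backfill posting list batch by batch into backfill_hits.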
+ ICING_ASSIGN_OR_RETURN(std::vector<Hit> tmp,
+ backfill_accessor.GetNextHitsBatch());
+ while (!tmp.empty()) {
+ std::copy(tmp.begin(), tmp.end(), std::back_inserter(backfill_hits));
+ ICING_ASSIGN_OR_RETURN(tmp, backfill_accessor.GetNextHitsBatch());
+ }
+
+ Hit last_added_hit;
+ for (const Hit& hit : backfill_hits) {
+ // Skip hits from non-prefix-enabled sections.
+ if (!hit.is_in_prefix_section()) {
+ continue;
+ }
+
+ // A backfill hit is a prefix hit in a prefix section.
+ const Hit backfill_hit(hit.section_id(), hit.document_id(), hit.score(),
+ /*is_in_prefix_section=*/true,
+ /*is_prefix_hit=*/true);
+ if (backfill_hit == last_added_hit) {
+ // Skip duplicate values due to overriding of the is_prefix flag.
+ continue;
+ }
+ last_added_hit = backfill_hit;
+ ICING_RETURN_IF_ERROR(hit_accum->PrependHit(backfill_hit));
+ }
+ return libtextclassifier3::Status::OK;
+}
+
+} // namespace lib
+} // namespace icing
diff --git a/icing/index/main/main-index.h b/icing/index/main/main-index.h
new file mode 100644
index 0000000..79378ea
--- /dev/null
+++ b/icing/index/main/main-index.h
@@ -0,0 +1,235 @@
+// Copyright (C) 2019 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef ICING_INDEX_MAIN_MAIN_INDEX_H_
+#define ICING_INDEX_MAIN_MAIN_INDEX_H_
+
+#include <memory>
+
+#include "icing/text_classifier/lib3/utils/base/status.h"
+#include "icing/text_classifier/lib3/utils/base/statusor.h"
+#include "icing/file/filesystem.h"
+#include "icing/index/lite/term-id-hit-pair.h"
+#include "icing/index/main/flash-index-storage.h"
+#include "icing/index/main/posting-list-accessor.h"
+#include "icing/index/term-id-codec.h"
+#include "icing/legacy/index/icing-dynamic-trie.h"
+#include "icing/legacy/index/icing-filesystem.h"
+#include "icing/util/status-macros.h"
+
+namespace icing {
+namespace lib {
+
+class MainIndex {
+ public:
+ // RETURNS:
+ // - valid instance of MainIndex, on success.
+ // - INTERNAL error if unable to create the lexicon or flash storage.
+ static libtextclassifier3::StatusOr<MainIndex> Create(
+ const std::string& index_filename, const Filesystem* filesystem,
+ const IcingFilesystem* icing_filesystem);
+
+ // Get a PostingListAccessor that holds the posting list chain for 'term'.
+ //
+ // RETURNS:
+ // - On success, a valid PostingListAccessor
+ // - NOT_FOUND if term is not present in the main index.
+ libtextclassifier3::StatusOr<std::unique_ptr<PostingListAccessor>>
+ GetAccessorForExactTerm(const std::string& term);
+
+ // Get a PostingListAccessor for 'prefix'.
+ //
+ // RETURNS:
+ // - On success, a result containing a valid PostingListAccessor.
+ // - NOT_FOUND if neither 'prefix' nor any terms for which 'prefix' is a
+ // prefix are present in the main index.
+ struct GetPrefixAccessorResult {
+ // A PostingListAccessor that holds the posting list chain for the term
+ // that best represents 'prefix' in the main index.
+ std::unique_ptr<PostingListAccessor> accessor;
+    // True if the returned posting list chain is for 'prefix'; false if it is
+    // for a term of which 'prefix' is a proper prefix.
+ bool exact;
+ };
+ libtextclassifier3::StatusOr<GetPrefixAccessorResult>
+ GetAccessorForPrefixTerm(const std::string& prefix);
+
+ struct LexiconMergeOutputs {
+ // Maps from main_lexicon tvi for new branching point to the main_lexicon
+ // tvi for posting list whose hits must be backfilled.
+ std::unordered_map<uint32_t, uint32_t> backfill_map;
+
+ // Maps from lexicon tvis to main_lexicon tvis.
+ std::unordered_map<uint32_t, uint32_t> other_tvi_to_main_tvi;
+
+ // Maps from the lexicon tvi to the beginning position in
+ // prefix_tvis_buf and the length.
+ std::unordered_map<uint32_t, std::pair<int, int>>
+ other_tvi_to_prefix_main_tvis;
+
+    // Stores tvis that are mapped to by other_tvi_to_prefix_main_tvis.
+ std::vector<uint32_t> prefix_tvis_buf;
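+
+    // Illustrative example (the tvi values are hypothetical): if lite term
+    // "fool" with tvi 2 expands to main prefix term "foo" with main tvi 12,
+    // then other_tvi_to_prefix_main_tvis[2] == {0, 1} (offset 0, length 1 in
+    // prefix_tvis_buf) and prefix_tvis_buf == {12}.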
+ };
+
+  // Merges the other lexicon into the main lexicon and populates the data
+  // structures necessary to translate lite tvis to main tvis, track
+  // backfilling, and expand lite terms to prefix terms.
+ //
+ // RETURNS:
+ // - OK on success
+ // - INTERNAL on IO error while writing to the main lexicon.
+ libtextclassifier3::StatusOr<LexiconMergeOutputs> MergeLexicon(
+ const IcingDynamicTrie& other_lexicon) {
+ // Backfill branch points need to be added first so that the backfill_map
+ // can be correctly populated.
+ ICING_ASSIGN_OR_RETURN(LexiconMergeOutputs outputs,
+ AddBackfillBranchPoints(other_lexicon));
+ ICING_ASSIGN_OR_RETURN(outputs,
+ AddTerms(other_lexicon, std::move(outputs)));
+ // Non-backfill branch points need to be added last so that the mapping of
+ // newly added terms to prefix terms can be correctly populated (prefix
+ // terms might be branch points between two new terms or between a
+ // pre-existing term and a new term).
+ ICING_ASSIGN_OR_RETURN(outputs,
+ AddBranchPoints(other_lexicon, std::move(outputs)));
+ return outputs;
+ }
+
+ // Add hits to the main index and backfill from existing posting lists to new
+ // backfill branch points.
+ //
+ // The backfill_map maps from main_lexicon tvi for a newly added branching
+ // point to the main_lexicon tvi for the posting list whose hits must be
+ // backfilled. backfill_map should be populated as part of LexiconMergeOutputs
+ // in MergeLexicon and be blindly passed to this function.
+ //
+ // RETURNS:
+ // - OK on success
+  //  - INVALID_ARGUMENT if one of the elements in the lite index has a term_id
+  //  that exceeds the max TermId, has an invalid hit, or has a hit that is not
+  //  less than the pre-existing hits in the main index.
+ // - INTERNAL_ERROR if unable to mmap necessary IndexBlocks
+ // - RESOURCE_EXHAUSTED error if unable to grow the index
+ libtextclassifier3::Status AddHits(
+ const TermIdCodec& term_id_codec,
+ std::unordered_map<uint32_t, uint32_t>&& backfill_map,
+ std::vector<TermIdHitPair>&& hits);
+
+ private:
+ libtextclassifier3::Status Init(const std::string& index_filename,
+ const Filesystem* filesystem,
+ const IcingFilesystem* icing_filesystem);
+
+ // Helpers for merging the lexicon
+  // Add all 'backfill' branch points. Backfill branch points are prefix
+  // branch points that are a prefix of terms that existed in the main lexicon
+  // prior to the merge.
+ //
+ // For example, if the main lexicon only contains "foot" and is then merged
+ // with a lite lexicon containing only "fool", then a backfill branch point
+ // for "foo" will be added to contain prefix hits from both the pre-existing
+ // posting list for "foot" and the new posting list for "fool".
+ //
+ // Populates LexiconMergeOutputs.backfill_map
+ //
+ // RETURNS:
+ // - OK on success
+ // - INTERNAL on IO error while writing to the main lexicon.
+ libtextclassifier3::StatusOr<LexiconMergeOutputs> AddBackfillBranchPoints(
+ const IcingDynamicTrie& other_lexicon);
+
+ // Add all terms from the lexicon.
+ //
+ // Populates LexiconMergeOutputs.other_tvi_to_main_tvi
+ //
+ // RETURNS:
+ // - OK on success
+ // - INTERNAL on IO error while writing to the main lexicon.
+ libtextclassifier3::StatusOr<LexiconMergeOutputs> AddTerms(
+ const IcingDynamicTrie& other_lexicon, LexiconMergeOutputs&& outputs);
+
+ // Add all branch points for terms added from the lexicon.
+ // For example, if the main lexicon is empty and is then merged with a
+ // lexicon containing only "foot" and "fool", then a branch point for "foo"
+ // will be added to contain prefix hits from both "foot" and "fool".
+ //
+ // Populates LexiconMergeOutputs.other_tvi_to_prefix_main_tvis and
+  // LexiconMergeOutputs.prefix_tvis_buf.
+ //
+ // RETURNS:
+ // - OK on success
+ // - INTERNAL on IO error while writing to the main lexicon.
+ libtextclassifier3::StatusOr<LexiconMergeOutputs> AddBranchPoints(
+ const IcingDynamicTrie& other_lexicon, LexiconMergeOutputs&& outputs);
+
+ // Copies all properties from old_tvi in the other lexicon to the new_tvi in
+ // the main lexicon.
+ // Returns true on success, false if an IO error is encountered.
+ bool CopyProperties(const IcingDynamicTrie::PropertyReadersAll& prop_reader,
+ const IcingDynamicTrie& other_lexicon, uint32_t other_tvi,
+ uint32_t new_main_tvi);
+
+ // Add all hits between [hit_elements, hit_elements + len) to main_index,
+ // updating the entry in the main lexicon at trie_value_index to point to the
+ // resulting posting list. Hits are sorted in descending document id order, so
+  // they should be added to posting lists in reverse (starting at hit_elements
+ // + len - 1) and working backwards. Therefore, hit_elements must be in sorted
+ // order.
+ //
+ // trie_value_index may point to a valid posting list id if there is a
+ // pre-existing posting list to append to.
+ //
+ // If backfill_posting_list_id is valid, then the hits from the posting list
+ // identified by backfill_posting_list_id should be added to the new posting
+ // list before the hits in hit_elements.
+ //
+ // RETURNS:
+ // - OK on success
+ // - INVALID_ARGUMENT if posting_list_id stored at trie_value_index is valid
+ // but points out of bounds in the IndexBlock referred to by
+ // id.block_index(), if one of the hits from [hit_elements,hit_elements+len)
+ // is not valid, or if one of the hits from [hit_elements,hit_elements+len)
+ // is not less than the previously added hits.
+ // - INTERNAL_ERROR if posting_list_id stored at trie_value_index is valid
+ // but points to an invalid block index or if unable to mmap the IndexBlock.
+ // - RESOURCE_EXHAUSTED error if unable to grow the index to allocate a new
+ // posting list.
+ libtextclassifier3::Status AddHitsForTerm(
+ uint32_t tvi, PostingListIdentifier backfill_posting_list_id,
+ const TermIdHitPair* hit_elements, size_t len);
+
+ // Adds all prefix hits or hits from prefix sections present on the posting
+ // list identified by backfill_posting_list_id to hit_accum.
+ //
+ // RETURNS:
+ // - OK, on success
+ // - INVALID_ARGUMENT if backfill_posting_list_id points out of bounds in the
+ // IndexBlock referred to by id.block_index()
+ // - INTERNAL_ERROR if unable to mmap the block identified by
+ // backfill_posting_list_id or if the posting list identified by
+ // backfill_posting_list_id has been corrupted.
+ // - RESOURCE_EXHAUSTED error if unable to grow the index to allocate a new
+ // posting list.
+ libtextclassifier3::Status AddPrefixBackfillHits(
+ PostingListIdentifier backfill_posting_list_id,
+ PostingListAccessor* hit_accum);
+
+ std::unique_ptr<FlashIndexStorage> flash_index_storage_;
+ std::unique_ptr<IcingDynamicTrie> main_lexicon_;
+};
+
+} // namespace lib
+} // namespace icing
+
+#endif // ICING_INDEX_MAIN_MAIN_INDEX_H_
diff --git a/icing/index/main/main-index_test.cc b/icing/index/main/main-index_test.cc
new file mode 100644
index 0000000..019b588
--- /dev/null
+++ b/icing/index/main/main-index_test.cc
@@ -0,0 +1,536 @@
+// Copyright (C) 2019 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "icing/index/main/main-index.h"
+
+#include "gmock/gmock.h"
+#include "gtest/gtest.h"
+#include "icing/absl_ports/canonical_errors.h"
+#include "icing/file/filesystem.h"
+#include "icing/index/iterator/doc-hit-info-iterator.h"
+#include "icing/index/lite/term-id-hit-pair.h"
+#include "icing/index/main/doc-hit-info-iterator-term-main.h"
+#include "icing/index/main/main-index-merger.h"
+#include "icing/index/main/main-index.h"
+#include "icing/index/term-id-codec.h"
+#include "icing/index/term-property-id.h"
+#include "icing/legacy/index/icing-dynamic-trie.h"
+#include "icing/legacy/index/icing-filesystem.h"
+#include "icing/legacy/index/icing-mock-filesystem.h"
+#include "icing/schema/section.h"
+#include "icing/store/namespace-id.h"
+#include "icing/testing/common-matchers.h"
+#include "icing/testing/tmp-directory.h"
+
+namespace icing {
+namespace lib {
+
+namespace {
+
+using ::testing::ElementsAre;
+using ::testing::IsEmpty;
+using ::testing::NiceMock;
+using ::testing::SizeIs;
+
+std::vector<DocHitInfo> GetHits(std::unique_ptr<DocHitInfoIterator> iterator) {
+ std::vector<DocHitInfo> infos;
+ while (iterator->Advance().ok()) {
+ infos.push_back(iterator->doc_hit_info());
+ }
+ return infos;
+}
+
+std::vector<DocHitInfo> GetExactHits(
+ MainIndex* main_index, const std::string& term,
+ SectionIdMask section_mask = kSectionIdMaskAll) {
+ auto iterator = std::make_unique<DocHitInfoIteratorTermMainExact>(
+ main_index, term, section_mask);
+ return GetHits(std::move(iterator));
+}
+
+std::vector<DocHitInfo> GetPrefixHits(
+ MainIndex* main_index, const std::string& term,
+ SectionIdMask section_mask = kSectionIdMaskAll) {
+ auto iterator = std::make_unique<DocHitInfoIteratorTermMainPrefix>(
+ main_index, term, section_mask);
+ return GetHits(std::move(iterator));
+}
+
+libtextclassifier3::Status Merge(const LiteIndex& lite_index,
+ const TermIdCodec& term_id_codec,
+ MainIndex* main_index) {
+ ICING_ASSIGN_OR_RETURN(MainIndex::LexiconMergeOutputs outputs,
+ main_index->MergeLexicon(lite_index.lexicon()));
+ ICING_ASSIGN_OR_RETURN(std::vector<TermIdHitPair> elts,
+ MainIndexMerger::TranslateAndExpandLiteHits(
+ lite_index, term_id_codec, outputs));
+ return main_index->AddHits(term_id_codec, std::move(outputs.backfill_map),
+ std::move(elts));
+}
+
+class MainIndexTest : public testing::Test {
+ protected:
+ void SetUp() override {
+ index_dir_ = GetTestTempDir() + "/test_dir";
+ ASSERT_TRUE(filesystem_.CreateDirectoryRecursively(index_dir_.c_str()));
+
+ std::string lite_index_file_name = index_dir_ + "/test_file.lite-idx.index";
+ LiteIndex::Options options(lite_index_file_name,
+ /*hit_buffer_want_merge_bytes=*/1024 * 1024);
+ ICING_ASSERT_OK_AND_ASSIGN(lite_index_,
+ LiteIndex::Create(options, &icing_filesystem_));
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ term_id_codec_,
+ TermIdCodec::Create(
+ IcingDynamicTrie::max_value_index(IcingDynamicTrie::Options()),
+ IcingDynamicTrie::max_value_index(options.lexicon_options)));
+ }
+
+ void TearDown() override {
+ ASSERT_TRUE(filesystem_.DeleteDirectoryRecursively(index_dir_.c_str()));
+ }
+
+ std::string index_dir_;
+ Filesystem filesystem_;
+ IcingFilesystem icing_filesystem_;
+ std::unique_ptr<LiteIndex> lite_index_;
+ std::unique_ptr<TermIdCodec> term_id_codec_;
+};
+
+constexpr NamespaceId kNamespace0 = 0;
+
+TEST_F(MainIndexTest, MainIndexCreateIOFailure) {
+ // Create the index with mock filesystem. By default, Mock will return false,
+ // so the first attempted file operation will fail.
+ NiceMock<IcingMockFilesystem> mock_filesystem;
+ std::string main_index_file_name = index_dir_ + "/test_file.idx.index";
+ EXPECT_THAT(
+ MainIndex::Create(main_index_file_name, &filesystem_, &mock_filesystem),
+ StatusIs(libtextclassifier3::StatusCode::INTERNAL));
+}
+
+TEST_F(MainIndexTest, MainIndexGetAccessorForPrefixTermNotFound) {
+ // Create the main index. It should have no entries in its lexicon.
+ std::string main_index_file_name = index_dir_ + "/test_file.idx.index";
+ ICING_ASSERT_OK_AND_ASSIGN(
+ MainIndex main_index,
+ MainIndex::Create(main_index_file_name, &filesystem_,
+ &icing_filesystem_));
+ EXPECT_THAT(main_index.GetAccessorForPrefixTerm("foo"),
+ StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
+}
+
+TEST_F(MainIndexTest, MainIndexGetAccessorForPrefixReturnsValidAccessor) {
+ // 1. Index one doc in the Lite Index:
+ // - Doc0 {"foot" is_in_prefix_section=true}
+ ICING_ASSERT_OK_AND_ASSIGN(
+ uint32_t tvi,
+ lite_index_->InsertTerm("foot", TermMatchType::PREFIX, kNamespace0));
+ ICING_ASSERT_OK_AND_ASSIGN(uint32_t foot_term_id,
+ term_id_codec_->EncodeTvi(tvi, TviType::LITE));
+
+ Hit doc0_hit(/*section_id=*/0, /*document_id=*/0, Hit::kMaxHitScore,
+ /*is_in_prefix_section=*/true);
+ ICING_ASSERT_OK(lite_index_->AddHit(foot_term_id, doc0_hit));
+
+ // 2. Create the main index. It should have no entries in its lexicon.
+ std::string main_index_file_name = index_dir_ + "/test_file.idx.index";
+ ICING_ASSERT_OK_AND_ASSIGN(
+ MainIndex main_index,
+ MainIndex::Create(main_index_file_name, &filesystem_,
+ &icing_filesystem_));
+
+ // 3. Merge the index. The main index should contain "foo".
+ ICING_ASSERT_OK(Merge(*lite_index_, *term_id_codec_, &main_index));
+ // GetAccessorForPrefixTerm should return a valid accessor for "foo".
+ EXPECT_THAT(main_index.GetAccessorForPrefixTerm("foo"), IsOk());
+}
+
+TEST_F(MainIndexTest, MainIndexGetAccessorForExactTermNotFound) {
+ // Create the main index. It should have no entries in its lexicon.
+ std::string main_index_file_name = index_dir_ + "/test_file.idx.index";
+ ICING_ASSERT_OK_AND_ASSIGN(
+ MainIndex main_index,
+ MainIndex::Create(main_index_file_name, &filesystem_,
+ &icing_filesystem_));
+ EXPECT_THAT(main_index.GetAccessorForExactTerm("foo"),
+ StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
+}
+
+TEST_F(MainIndexTest, MainIndexGetAccessorForExactReturnsValidAccessor) {
+ // 1. Index one doc in the Lite Index:
+ // - Doc0 {"foo" is_in_prefix_section=false}
+ ICING_ASSERT_OK_AND_ASSIGN(
+ uint32_t tvi,
+ lite_index_->InsertTerm("foo", TermMatchType::EXACT_ONLY, kNamespace0));
+  ICING_ASSERT_OK_AND_ASSIGN(uint32_t foo_term_id,
+ term_id_codec_->EncodeTvi(tvi, TviType::LITE));
+
+ Hit doc0_hit(/*section_id=*/0, /*document_id=*/0, Hit::kMaxHitScore,
+ /*is_in_prefix_section=*/false);
+  ICING_ASSERT_OK(lite_index_->AddHit(foo_term_id, doc0_hit));
+
+ // 2. Create the main index. It should have no entries in its lexicon.
+ std::string main_index_file_name = index_dir_ + "/test_file.idx.index";
+ ICING_ASSERT_OK_AND_ASSIGN(
+ MainIndex main_index,
+ MainIndex::Create(main_index_file_name, &filesystem_,
+ &icing_filesystem_));
+
+ // 3. Merge the index. The main index should contain "foo".
+ ICING_ASSERT_OK(Merge(*lite_index_, *term_id_codec_, &main_index));
+  // GetAccessorForExactTerm should return a valid accessor for "foo".
+ EXPECT_THAT(main_index.GetAccessorForExactTerm("foo"), IsOk());
+}
+
+TEST_F(MainIndexTest, MergeIndexToEmpty) {
+ // 1. Index three docs in the Lite Index:
+ // - Doc0 {"foot", "fool", "far" is_in_prefix_section=false}
+ // - Doc1 {"foot", "fool" is_in_prefix_section=true}
+ // - Doc2 {"fool", "far" is_in_prefix_section=false}
+ ICING_ASSERT_OK_AND_ASSIGN(
+ uint32_t tvi,
+ lite_index_->InsertTerm("foot", TermMatchType::PREFIX, kNamespace0));
+ ICING_ASSERT_OK_AND_ASSIGN(uint32_t foot_term_id,
+ term_id_codec_->EncodeTvi(tvi, TviType::LITE));
+ ICING_ASSERT_OK_AND_ASSIGN(
+ tvi, lite_index_->InsertTerm("fool", TermMatchType::PREFIX, kNamespace0));
+ ICING_ASSERT_OK_AND_ASSIGN(uint32_t fool_term_id,
+ term_id_codec_->EncodeTvi(tvi, TviType::LITE));
+ ICING_ASSERT_OK_AND_ASSIGN(
+ tvi,
+ lite_index_->InsertTerm("far", TermMatchType::EXACT_ONLY, kNamespace0));
+ ICING_ASSERT_OK_AND_ASSIGN(uint32_t far_term_id,
+ term_id_codec_->EncodeTvi(tvi, TviType::LITE));
+
+ Hit doc0_hit(/*section_id=*/0, /*document_id=*/0, Hit::kMaxHitScore,
+ /*is_in_prefix_section=*/false);
+ ICING_ASSERT_OK(lite_index_->AddHit(foot_term_id, doc0_hit));
+ ICING_ASSERT_OK(lite_index_->AddHit(fool_term_id, doc0_hit));
+ ICING_ASSERT_OK(lite_index_->AddHit(far_term_id, doc0_hit));
+
+ Hit doc1_hit(/*section_id=*/0, /*document_id=*/1, Hit::kMaxHitScore,
+ /*is_in_prefix_section=*/true);
+ ICING_ASSERT_OK(lite_index_->AddHit(foot_term_id, doc1_hit));
+ ICING_ASSERT_OK(lite_index_->AddHit(fool_term_id, doc1_hit));
+
+ Hit doc2_hit(/*section_id=*/0, /*document_id=*/2, Hit::kMaxHitScore,
+ /*is_in_prefix_section=*/false);
+ ICING_ASSERT_OK(lite_index_->AddHit(fool_term_id, doc2_hit));
+ ICING_ASSERT_OK(lite_index_->AddHit(far_term_id, doc2_hit));
+
+ // 2. Create the main index. It should have no entries in its lexicon.
+ std::string main_index_file_name = index_dir_ + "/test_file.idx.index";
+ ICING_ASSERT_OK_AND_ASSIGN(
+ MainIndex main_index,
+ MainIndex::Create(main_index_file_name, &filesystem_,
+ &icing_filesystem_));
+
+ std::vector<DocHitInfo> hits = GetExactHits(&main_index, "foot");
+ EXPECT_THAT(hits, IsEmpty());
+ hits = GetPrefixHits(&main_index, "fo");
+ EXPECT_THAT(hits, IsEmpty());
+
+ // 3. Merge the index. The main index should contain "fool", "foot"
+ // and "far" as well as a branch points for "foo" and "f". "fa" and "fo"
+ // should not be present because it is not a branch point.
+ ICING_ASSERT_OK(Merge(*lite_index_, *term_id_codec_, &main_index));
+ // Get hits from an exact posting list.
+ hits = GetExactHits(&main_index, "foot");
+ // We should get hits for "foot" in doc1 and doc0
+ EXPECT_THAT(
+ hits,
+ ElementsAre(
+ EqualsDocHitInfo(doc1_hit.document_id(),
+ std::vector<SectionId>{doc1_hit.section_id()}),
+ EqualsDocHitInfo(doc0_hit.document_id(),
+ std::vector<SectionId>{doc0_hit.section_id()})));
+
+ // Get hits from a branching point posting list. "fo" should redirect to "foo"
+ hits = GetPrefixHits(&main_index, "fo");
+ // We should get hits for "foot" in doc1 and "fool" in doc1. We shouldn't get
+ // the hits for "foot" in doc0 and "fool" in doc0 and doc2 because they
+ // weren't hits in prefix sections.
+ EXPECT_THAT(hits, ElementsAre(EqualsDocHitInfo(
+ doc1_hit.document_id(),
+ std::vector<SectionId>{doc1_hit.section_id()})));
+}
+
+TEST_F(MainIndexTest, MergeIndexToPreexisting) {
+ // 1. Index three docs in the Lite Index:
+ // - Doc0 {"foot", "fool", "far" is_in_prefix_section=false}
+ // - Doc1 {"foot", "fool" is_in_prefix_section=true}
+ // - Doc2 {"fool", "far" is_in_prefix_section=false}
+ ICING_ASSERT_OK_AND_ASSIGN(
+ uint32_t tvi,
+ lite_index_->InsertTerm("foot", TermMatchType::PREFIX, kNamespace0));
+ ICING_ASSERT_OK_AND_ASSIGN(uint32_t foot_term_id,
+ term_id_codec_->EncodeTvi(tvi, TviType::LITE));
+ ICING_ASSERT_OK_AND_ASSIGN(
+ tvi, lite_index_->InsertTerm("fool", TermMatchType::PREFIX, kNamespace0));
+ ICING_ASSERT_OK_AND_ASSIGN(uint32_t fool_term_id,
+ term_id_codec_->EncodeTvi(tvi, TviType::LITE));
+ ICING_ASSERT_OK_AND_ASSIGN(
+ tvi,
+ lite_index_->InsertTerm("far", TermMatchType::EXACT_ONLY, kNamespace0));
+ ICING_ASSERT_OK_AND_ASSIGN(uint32_t far_term_id,
+ term_id_codec_->EncodeTvi(tvi, TviType::LITE));
+
+ Hit doc0_hit(/*section_id=*/0, /*document_id=*/0, Hit::kMaxHitScore,
+ /*is_in_prefix_section=*/false);
+ ICING_ASSERT_OK(lite_index_->AddHit(foot_term_id, doc0_hit));
+ ICING_ASSERT_OK(lite_index_->AddHit(fool_term_id, doc0_hit));
+ ICING_ASSERT_OK(lite_index_->AddHit(far_term_id, doc0_hit));
+
+ Hit doc1_hit(/*section_id=*/0, /*document_id=*/1, Hit::kMaxHitScore,
+ /*is_in_prefix_section=*/true);
+ ICING_ASSERT_OK(lite_index_->AddHit(foot_term_id, doc1_hit));
+ ICING_ASSERT_OK(lite_index_->AddHit(fool_term_id, doc1_hit));
+
+ Hit doc2_hit(/*section_id=*/0, /*document_id=*/2, Hit::kMaxHitScore,
+ /*is_in_prefix_section=*/false);
+ ICING_ASSERT_OK(lite_index_->AddHit(fool_term_id, doc2_hit));
+ ICING_ASSERT_OK(lite_index_->AddHit(far_term_id, doc2_hit));
+
+ // 2. Create the main index. It should have no entries in its lexicon.
+ std::string main_index_file_name = index_dir_ + "/test_file.idx.index";
+ ICING_ASSERT_OK_AND_ASSIGN(
+ MainIndex main_index,
+ MainIndex::Create(main_index_file_name, &filesystem_,
+ &icing_filesystem_));
+
+ // 3. Merge the index. The main index should contain "fool", "foot"
+ // and "far" as well as a branch points for "foo" and "f". "fa" and "fo"
+ // should not be present because it is not a branch point.
+ ICING_ASSERT_OK(Merge(*lite_index_, *term_id_codec_, &main_index));
+
+ // 4. Index two docs in a new Lite Index:
+ // - Doc3 {"foot", "four", "foul", "fall" is_in_prefix_section=false}
+ // - Doc4 {"four", "foul" is_in_prefix_section=true}
+ std::string lite_index_file_name2 = index_dir_ + "/test_file.lite-idx.index2";
+ LiteIndex::Options options(lite_index_file_name2,
+ /*hit_buffer_want_merge_bytes=*/1024 * 1024);
+ ICING_ASSERT_OK_AND_ASSIGN(lite_index_,
+ LiteIndex::Create(options, &icing_filesystem_));
+ ICING_ASSERT_OK_AND_ASSIGN(
+ tvi,
+ lite_index_->InsertTerm("foot", TermMatchType::EXACT_ONLY, kNamespace0));
+ ICING_ASSERT_OK_AND_ASSIGN(
+ tvi, lite_index_->InsertTerm("four", TermMatchType::PREFIX, kNamespace0));
+ ICING_ASSERT_OK_AND_ASSIGN(uint32_t four_term_id,
+ term_id_codec_->EncodeTvi(tvi, TviType::LITE));
+ ICING_ASSERT_OK_AND_ASSIGN(
+ tvi, lite_index_->InsertTerm("foul", TermMatchType::PREFIX, kNamespace0));
+ ICING_ASSERT_OK_AND_ASSIGN(uint32_t foul_term_id,
+ term_id_codec_->EncodeTvi(tvi, TviType::LITE));
+ ICING_ASSERT_OK_AND_ASSIGN(
+ tvi,
+ lite_index_->InsertTerm("fall", TermMatchType::EXACT_ONLY, kNamespace0));
+ ICING_ASSERT_OK_AND_ASSIGN(uint32_t fall_term_id,
+ term_id_codec_->EncodeTvi(tvi, TviType::LITE));
+
+ Hit doc3_hit(/*section_id=*/0, /*document_id=*/3, Hit::kMaxHitScore,
+ /*is_in_prefix_section=*/false);
+ ICING_ASSERT_OK(lite_index_->AddHit(foot_term_id, doc3_hit));
+ ICING_ASSERT_OK(lite_index_->AddHit(four_term_id, doc3_hit));
+ ICING_ASSERT_OK(lite_index_->AddHit(foul_term_id, doc3_hit));
+ ICING_ASSERT_OK(lite_index_->AddHit(fall_term_id, doc3_hit));
+
+ Hit doc4_hit(/*section_id=*/0, /*document_id=*/4, Hit::kMaxHitScore,
+ /*is_in_prefix_section=*/true);
+ ICING_ASSERT_OK(lite_index_->AddHit(four_term_id, doc4_hit));
+ ICING_ASSERT_OK(lite_index_->AddHit(foul_term_id, doc4_hit));
+
+  // 5. Merge the index. The main index should now contain "foul", "four"
+  // and "fall", a branch point for "fou" and a backfill point for "fo".
+ ICING_ASSERT_OK(Merge(*lite_index_, *term_id_codec_, &main_index));
+  // Get hits from an exact posting list that existed before the merge.
+ std::vector<DocHitInfo> hits = GetExactHits(&main_index, "foot");
+
+ // We should get hits for "foot" in doc3, doc1 and doc0
+ EXPECT_THAT(
+ hits,
+ ElementsAre(
+ EqualsDocHitInfo(doc3_hit.document_id(),
+ std::vector<SectionId>{doc3_hit.section_id()}),
+ EqualsDocHitInfo(doc1_hit.document_id(),
+ std::vector<SectionId>{doc1_hit.section_id()}),
+ EqualsDocHitInfo(doc0_hit.document_id(),
+ std::vector<SectionId>{doc0_hit.section_id()})));
+ // Get hits from backfill posting list.
+ hits = GetPrefixHits(&main_index, "fo");
+ // We should get hits for "four" and "foul" in doc4 and hits for "foot" and
+ // "fool" in doc1. We shouldn't get the hits for "foot" in doc0 and doc3,
+ // "fool" in doc0 and doc2 or the hits for "four" and "foul" in doc4 because
+ // they weren't hits in prefix sections.
+ EXPECT_THAT(
+ hits,
+ ElementsAre(
+ EqualsDocHitInfo(doc4_hit.document_id(),
+ std::vector<SectionId>{doc4_hit.section_id()}),
+ EqualsDocHitInfo(doc1_hit.document_id(),
+ std::vector<SectionId>{doc1_hit.section_id()})));
+}
+
+TEST_F(MainIndexTest, ExactRetrievedInPrefixSearch) {
+  // 1. Index three docs in the Lite Index:
+ // - Doc0 {"foot" is_in_prefix_section=true}
+ // - Doc1 {"foo" is_in_prefix_section=false}
+ // - Doc2 {"foot" is_in_prefix_section=false}
+ ICING_ASSERT_OK_AND_ASSIGN(
+ uint32_t tvi,
+ lite_index_->InsertTerm("foot", TermMatchType::PREFIX, kNamespace0));
+ ICING_ASSERT_OK_AND_ASSIGN(uint32_t foot_term_id,
+ term_id_codec_->EncodeTvi(tvi, TviType::LITE));
+ ICING_ASSERT_OK_AND_ASSIGN(
+ tvi,
+ lite_index_->InsertTerm("foo", TermMatchType::EXACT_ONLY, kNamespace0));
+ ICING_ASSERT_OK_AND_ASSIGN(uint32_t foo_term_id,
+ term_id_codec_->EncodeTvi(tvi, TviType::LITE));
+
+ Hit doc0_hit(/*section_id=*/0, /*document_id=*/0, Hit::kMaxHitScore,
+ /*is_in_prefix_section=*/true);
+ ICING_ASSERT_OK(lite_index_->AddHit(foot_term_id, doc0_hit));
+
+ Hit doc1_hit(/*section_id=*/0, /*document_id=*/1, Hit::kMaxHitScore,
+ /*is_in_prefix_section=*/false);
+ ICING_ASSERT_OK(lite_index_->AddHit(foo_term_id, doc1_hit));
+
+ Hit doc2_hit(/*section_id=*/0, /*document_id=*/2, Hit::kMaxHitScore,
+ /*is_in_prefix_section=*/false);
+ ICING_ASSERT_OK(lite_index_->AddHit(foot_term_id, doc2_hit));
+
+ // 2. Create the main index. It should have no entries in its lexicon.
+ std::string main_index_file_name = index_dir_ + "/test_file.idx.index";
+ ICING_ASSERT_OK_AND_ASSIGN(
+ MainIndex main_index,
+ MainIndex::Create(main_index_file_name, &filesystem_,
+ &icing_filesystem_));
+
+ // 3. Merge the lite lexicon. The main lexicon should contain "foot" and
+ // "foo".
+ ICING_ASSERT_OK(Merge(*lite_index_, *term_id_codec_, &main_index));
+ std::vector<DocHitInfo> hits = GetPrefixHits(&main_index, "foo");
+ // We should get hits for "foo" in doc1 and doc0, but not in doc2 because it
+ // is not a prefix hit.
+ EXPECT_THAT(
+ hits,
+ ElementsAre(
+ EqualsDocHitInfo(doc1_hit.document_id(),
+ std::vector<SectionId>{doc1_hit.section_id()}),
+ EqualsDocHitInfo(doc0_hit.document_id(),
+ std::vector<SectionId>{doc0_hit.section_id()})));
+}
+
+TEST_F(MainIndexTest, PrefixNotRetrievedInExactSearch) {
+  // 1. Index three docs in the Lite Index:
+  //   - Doc0 {"foot" is_in_prefix_section=true}
+  //   - Doc1 {"foo" is_in_prefix_section=false}
+  //   - Doc2 {"foo" is_in_prefix_section=true}
+ ICING_ASSERT_OK_AND_ASSIGN(
+ uint32_t tvi,
+ lite_index_->InsertTerm("foot", TermMatchType::PREFIX, kNamespace0));
+ ICING_ASSERT_OK_AND_ASSIGN(uint32_t foot_term_id,
+ term_id_codec_->EncodeTvi(tvi, TviType::LITE));
+ ICING_ASSERT_OK_AND_ASSIGN(
+ tvi, lite_index_->InsertTerm("foo", TermMatchType::PREFIX, kNamespace0));
+ ICING_ASSERT_OK_AND_ASSIGN(uint32_t foo_term_id,
+ term_id_codec_->EncodeTvi(tvi, TviType::LITE));
+
+ Hit doc0_hit(/*section_id=*/0, /*document_id=*/0, Hit::kMaxHitScore,
+ /*is_in_prefix_section=*/true);
+ ICING_ASSERT_OK(lite_index_->AddHit(foot_term_id, doc0_hit));
+
+ Hit doc1_hit(/*section_id=*/0, /*document_id=*/1, Hit::kMaxHitScore,
+ /*is_in_prefix_section=*/false);
+ ICING_ASSERT_OK(lite_index_->AddHit(foo_term_id, doc1_hit));
+
+ Hit doc2_hit(/*section_id=*/0, /*document_id=*/2, Hit::kMaxHitScore,
+ /*is_in_prefix_section=*/true);
+ ICING_ASSERT_OK(lite_index_->AddHit(foo_term_id, doc2_hit));
+
+ // 2. Create the main index. It should have no entries in its lexicon.
+ std::string main_index_file_name = index_dir_ + "/test_file.idx.index";
+ ICING_ASSERT_OK_AND_ASSIGN(
+ MainIndex main_index,
+ MainIndex::Create(main_index_file_name, &filesystem_,
+ &icing_filesystem_));
+
+ // 3. Merge the lite lexicon. The main lexicon should contain "foot" and
+ // "foo".
+ ICING_ASSERT_OK(Merge(*lite_index_, *term_id_codec_, &main_index));
+ std::vector<DocHitInfo> hits = GetExactHits(&main_index, "foo");
+
+ // We should get hits for "foo" in doc2 and doc1, but not in doc0 because it
+ // is not an exact hit.
+ EXPECT_THAT(
+ hits,
+ ElementsAre(
+ EqualsDocHitInfo(doc2_hit.document_id(),
+ std::vector<SectionId>{doc2_hit.section_id()}),
+ EqualsDocHitInfo(doc1_hit.document_id(),
+ std::vector<SectionId>{doc1_hit.section_id()})));
+}
+
+TEST_F(MainIndexTest, SearchChainedPostingLists) {
+  // 1. Index 2048 documents with 3 hits in each document. When merged into
+  // the main index, this will 1) lead to a chained posting list and 2) split
+  // at least one document's hits across multiple posting lists.
+ ICING_ASSERT_OK_AND_ASSIGN(
+ uint32_t tvi,
+ lite_index_->InsertTerm("foot", TermMatchType::EXACT_ONLY, kNamespace0));
+ ICING_ASSERT_OK_AND_ASSIGN(uint32_t foot_term_id,
+ term_id_codec_->EncodeTvi(tvi, TviType::LITE));
+
+ for (DocumentId document_id = 0; document_id < 2048; ++document_id) {
+ Hit doc_hit0(/*section_id=*/0, /*document_id=*/document_id,
+ Hit::kMaxHitScore,
+ /*is_in_prefix_section=*/false);
+ ICING_ASSERT_OK(lite_index_->AddHit(foot_term_id, doc_hit0));
+
+ Hit doc_hit1(/*section_id=*/1, /*document_id=*/document_id,
+ Hit::kMaxHitScore,
+ /*is_in_prefix_section=*/false);
+ ICING_ASSERT_OK(lite_index_->AddHit(foot_term_id, doc_hit1));
+
+ Hit doc_hit2(/*section_id=*/2, /*document_id=*/document_id,
+ Hit::kMaxHitScore,
+ /*is_in_prefix_section=*/false);
+ ICING_ASSERT_OK(lite_index_->AddHit(foot_term_id, doc_hit2));
+ }
+
+ // 2. Create the main index. It should have no entries in its lexicon.
+ std::string main_index_file_name = index_dir_ + "/test_file.idx.index";
+ ICING_ASSERT_OK_AND_ASSIGN(
+ MainIndex main_index,
+ MainIndex::Create(main_index_file_name, &filesystem_,
+ &icing_filesystem_));
+
+ // 3. Merge the lite index.
+ ICING_ASSERT_OK(Merge(*lite_index_, *term_id_codec_, &main_index));
+ // Get hits for all documents containing "foot" - which should be all of them.
+ std::vector<DocHitInfo> hits = GetExactHits(&main_index, "foot");
+
+ EXPECT_THAT(hits, SizeIs(2048));
+ EXPECT_THAT(hits.front(),
+ EqualsDocHitInfo(2047, std::vector<SectionId>{0, 1, 2}));
+ EXPECT_THAT(hits.back(),
+ EqualsDocHitInfo(0, std::vector<SectionId>{0, 1, 2}));
+}
+
+} // namespace
+
+} // namespace lib
+} // namespace icing
diff --git a/icing/index/main/posting-list-accessor.cc b/icing/index/main/posting-list-accessor.cc
new file mode 100644
index 0000000..a4f8ca7
--- /dev/null
+++ b/icing/index/main/posting-list-accessor.cc
@@ -0,0 +1,194 @@
+// Copyright (C) 2019 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "icing/index/main/posting-list-accessor.h"
+
+#include <memory>
+
+#include "icing/absl_ports/canonical_errors.h"
+#include "icing/index/main/flash-index-storage.h"
+#include "icing/index/main/index-block.h"
+#include "icing/index/main/posting-list-identifier.h"
+#include "icing/index/main/posting-list-used.h"
+#include "icing/util/status-macros.h"
+
+namespace icing {
+namespace lib {
+
+libtextclassifier3::StatusOr<PostingListAccessor> PostingListAccessor::Create(
+ FlashIndexStorage *storage) {
+ uint32_t max_posting_list_bytes =
+ IndexBlock::CalculateMaxPostingListBytes(storage->block_size());
+ std::unique_ptr<uint8_t[]> posting_list_buffer_array =
+ std::make_unique<uint8_t[]>(max_posting_list_bytes);
+ ICING_ASSIGN_OR_RETURN(
+ PostingListUsed posting_list_buffer,
+ PostingListUsed::CreateFromUnitializedRegion(
+ posting_list_buffer_array.get(), max_posting_list_bytes));
+ return PostingListAccessor(storage, std::move(posting_list_buffer_array),
+ std::move(posting_list_buffer));
+}
+
+libtextclassifier3::StatusOr<PostingListAccessor>
+PostingListAccessor::CreateFromExisting(
+ FlashIndexStorage *storage,
+ PostingListIdentifier existing_posting_list_id) {
+ // Our posting_list_buffer_ will start as empty.
+ ICING_ASSIGN_OR_RETURN(PostingListAccessor pl_accessor, Create(storage));
+ ICING_ASSIGN_OR_RETURN(PostingListHolder holder,
+ storage->GetPostingList(existing_posting_list_id));
+ pl_accessor.preexisting_posting_list_ =
+ std::make_unique<PostingListHolder>(std::move(holder));
+ return pl_accessor;
+}
+
+// Returns the next batch of hits for the provided posting list.
+libtextclassifier3::StatusOr<std::vector<Hit>>
+PostingListAccessor::GetNextHitsBatch() {
+ if (preexisting_posting_list_ == nullptr) {
+ if (has_reached_posting_list_chain_end_) {
+ return std::vector<Hit>();
+ }
+ return absl_ports::FailedPreconditionError(
+ "Cannot retrieve hits from a PostingListAccessor that was not creaated "
+ "from a preexisting posting list.");
+ }
+ ICING_ASSIGN_OR_RETURN(std::vector<Hit> batch,
+ preexisting_posting_list_->posting_list.GetHits());
+ uint32_t block_index = preexisting_posting_list_->block.next_block_index();
+ if (block_index != kInvalidBlockIndex) {
+ PostingListIdentifier next_posting_list_id(
+ block_index, /*posting_list_index=*/0,
+ preexisting_posting_list_->block.posting_list_index_bits());
+ ICING_ASSIGN_OR_RETURN(PostingListHolder holder,
+ storage_->GetPostingList(next_posting_list_id));
+ preexisting_posting_list_ =
+ std::make_unique<PostingListHolder>(std::move(holder));
+ } else {
+ has_reached_posting_list_chain_end_ = true;
+ preexisting_posting_list_.reset();
+ }
+ return batch;
+}
+
+libtextclassifier3::Status PostingListAccessor::PrependHit(const Hit &hit) {
+ PostingListUsed &active_pl = (preexisting_posting_list_ != nullptr)
+ ? preexisting_posting_list_->posting_list
+ : posting_list_buffer_;
+ libtextclassifier3::Status status = active_pl.PrependHit(hit);
+ if (!absl_ports::IsResourceExhausted(status)) {
+ return status;
+ }
+ // There is no more room to add hits to this current posting list! Therefore,
+ // we need to either move those hits to a larger posting list or flush this
+ // posting list and create another max-sized posting list in the chain.
+ if (preexisting_posting_list_ != nullptr) {
+ FlushPreexistingPostingList();
+ } else {
+ ICING_RETURN_IF_ERROR(FlushInMemoryPostingList());
+ }
+
+ // Re-add hit. Should always fit since we just cleared posting_list_buffer_.
+ // It's fine to explicitly reference posting_list_buffer_ here because there's
+ // no way of reaching this line while preexisting_posting_list_ is still in
+ // use.
+ return posting_list_buffer_.PrependHit(hit);
+}
+
+void PostingListAccessor::FlushPreexistingPostingList() {
+ if (preexisting_posting_list_->block.max_num_posting_lists() == 1) {
+ // If this is a max-sized posting list, then just keep track of the id for
+ // chaining. It'll be flushed to disk when preexisting_posting_list_ is
+ // destructed.
+ prev_block_identifier_ = preexisting_posting_list_->id;
+ } else {
+ // If this is NOT a max-sized posting list, then our hits have outgrown this
+ // particular posting list. Move the hits into the in-memory posting list
+ // and free this posting list.
+ //
+ // Move will always succeed since posting_list_buffer_ is max_pl_bytes.
+ posting_list_buffer_.MoveFrom(&preexisting_posting_list_->posting_list);
+
+ // Now that all the contents of this posting list have been copied, there's
+ // no more use for it. Make it available to be used for another posting
+ // list.
+ storage_->FreePostingList(std::move(*preexisting_posting_list_));
+ }
+ preexisting_posting_list_.reset();
+}
+
+libtextclassifier3::Status PostingListAccessor::FlushInMemoryPostingList() {
+ // We exceeded max_pl_bytes(). Need to flush posting_list_buffer_ and update
+ // the chain.
+ uint32_t max_posting_list_bytes =
+ IndexBlock::CalculateMaxPostingListBytes(storage_->block_size());
+ ICING_ASSIGN_OR_RETURN(PostingListHolder holder,
+ storage_->AllocatePostingList(max_posting_list_bytes));
+ holder.block.set_next_block_index(prev_block_identifier_.block_index());
+ prev_block_identifier_ = holder.id;
+ return holder.posting_list.MoveFrom(&posting_list_buffer_);
+}
+
+PostingListAccessor::FinalizeResult PostingListAccessor::Finalize(
+ PostingListAccessor accessor) {
+ if (accessor.preexisting_posting_list_ != nullptr) {
+ // Our hits are already in an existing posting list. Nothing else to do, but
+ // return its id.
+ FinalizeResult result = {libtextclassifier3::Status::OK,
+ accessor.preexisting_posting_list_->id};
+ return result;
+ }
+ if (accessor.posting_list_buffer_.BytesUsed() <= 0) {
+ FinalizeResult result = {absl_ports::InvalidArgumentError(
+ "Can't finalize an empty PostingListAccessor. "
+ "There's nothing to Finalize!"),
+ PostingListIdentifier::kInvalid};
+ return result;
+ }
+ uint32_t posting_list_bytes =
+ accessor.posting_list_buffer_.MinPostingListSizeToFit();
+ if (accessor.prev_block_identifier_.is_valid()) {
+ posting_list_bytes = IndexBlock::CalculateMaxPostingListBytes(
+ accessor.storage_->block_size());
+ }
+ auto holder_or = accessor.storage_->AllocatePostingList(posting_list_bytes);
+ if (!holder_or.ok()) {
+ FinalizeResult result = {holder_or.status(),
+ accessor.prev_block_identifier_};
+ return result;
+ }
+ PostingListHolder holder = std::move(holder_or).ValueOrDie();
+ if (accessor.prev_block_identifier_.is_valid()) {
+ holder.block.set_next_block_index(
+ accessor.prev_block_identifier_.block_index());
+ }
+
+  // Move to allocated area. This should never actually return an error. We
+  // know that holder.posting_list is valid because it wouldn't have been
+  // returned successfully by AllocatePostingList if it weren't. We know
+  // posting_list_buffer_ is valid because we created it in-memory. And
+  // finally, we know that the hits from posting_list_buffer_ will fit in
+  // holder.posting_list because we requested that it be at least
+  // posting_list_bytes large.
+ auto status = holder.posting_list.MoveFrom(&accessor.posting_list_buffer_);
+ if (!status.ok()) {
+ FinalizeResult result = {std::move(status),
+ accessor.prev_block_identifier_};
+ return result;
+ }
+ FinalizeResult result = {libtextclassifier3::Status::OK, holder.id};
+ return result;
+}
+
+} // namespace lib
+} // namespace icing
diff --git a/icing/index/main/posting-list-accessor.h b/icing/index/main/posting-list-accessor.h
new file mode 100644
index 0000000..e1bb3c0
--- /dev/null
+++ b/icing/index/main/posting-list-accessor.h
@@ -0,0 +1,168 @@
+// Copyright (C) 2019 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef ICING_INDEX_POSTING_LIST_ACCESSOR_H_
+#define ICING_INDEX_POSTING_LIST_ACCESSOR_H_
+
+#include <memory>
+
+#include "icing/text_classifier/lib3/utils/base/status.h"
+#include "icing/text_classifier/lib3/utils/base/statusor.h"
+#include "icing/index/hit/hit.h"
+#include "icing/index/main/flash-index-storage.h"
+#include "icing/index/main/posting-list-identifier.h"
+#include "icing/index/main/posting-list-used.h"
+
+namespace icing {
+namespace lib {
+
+// This class serves to:
+// 1. Expose PostingListUseds to clients of FlashIndexStorage
+// 2. Ensure the corresponding instance of IndexBlock has the same lifecycle as
+// the instance of PostingListUsed that the client has access to, while
+// not exposing IndexBlock's api surface.
+// 3. Ensure that PostingListUseds can only be freed by calling methods which
+// will also properly maintain the FlashIndexStorage free list and prevent
+// callers from modifying the Posting List after freeing.
+
+// This class is used to provide a simple abstraction for adding hits to posting
+// lists. PostingListAccessor handles 1) selection of properly-sized posting
+// lists for the accumulated hits during Finalize() and 2) chaining of max-sized
+// posting lists.
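+//
+// A minimal usage sketch (illustrative only, not part of the original change;
+// assumes a FlashIndexStorage `storage` and a valid `hit` already exist):
+//
+//   ICING_ASSIGN_OR_RETURN(PostingListAccessor accessor,
+//                          PostingListAccessor::Create(&storage));
+//   ICING_RETURN_IF_ERROR(accessor.PrependHit(hit));
+//   PostingListAccessor::FinalizeResult result =
+//       PostingListAccessor::Finalize(std::move(accessor));
+//   // On success, result.id identifies the finalized posting list chain.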
+class PostingListAccessor {
+ public:
+ // Creates an empty PostingListAccessor.
+ //
+ // RETURNS:
+ // - On success, a valid instance of PostingListAccessor
+ // - INVALID_ARGUMENT error if storage has an invalid block_size.
+ static libtextclassifier3::StatusOr<PostingListAccessor> Create(
+ FlashIndexStorage* storage);
+
+  // Creates a PostingListAccessor with an existing posting list identified by
+ // existing_posting_list_id.
+ //
+ // The PostingListAccessor will add hits to this posting list until it is
+ // necessary either to 1) chain the posting list (if it is max-sized) or 2)
+ // move its hits to a larger posting list.
+ //
+ // RETURNS:
+ // - On success, a valid instance of PostingListAccessor
+ // - INVALID_ARGUMENT if storage has an invalid block_size.
+ static libtextclassifier3::StatusOr<PostingListAccessor> CreateFromExisting(
+ FlashIndexStorage* storage,
+ PostingListIdentifier existing_posting_list_id);
+
+  // Retrieves the next batch of hits for the posting list chain.
+  //
+  // RETURNS:
+  //   - On success, a vector of hits in the posting list chain
+  //   - FAILED_PRECONDITION if called on an instance of PostingListAccessor
+  //     that was created via PostingListAccessor::Create
+  //   - INTERNAL if unable to read the next posting list in the chain or if
+  //     the posting list has been corrupted somehow.
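+  //
+  // Illustrative read loop over a chain (an assumption-based sketch, not part
+  // of the original change):
+  //
+  //   ICING_ASSIGN_OR_RETURN(std::vector<Hit> batch,
+  //                          accessor.GetNextHitsBatch());
+  //   while (!batch.empty()) {
+  //     // ... consume batch ...
+  //     ICING_ASSIGN_OR_RETURN(batch, accessor.GetNextHitsBatch());
+  //   }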
+ libtextclassifier3::StatusOr<std::vector<Hit>> GetNextHitsBatch();
+
+  // Prepends one hit. This may result in flushing the posting list to disk (if
+ // the PostingListAccessor holds a max-sized posting list that is full) or
+ // freeing a pre-existing posting list if it is too small to fit all hits
+ // necessary.
+ //
+ // RETURNS:
+ // - OK, on success
+ // - INVALID_ARGUMENT if !hit.is_valid() or if hit is not less than the
+ // previously added hit.
+ // - RESOURCE_EXHAUSTED error if unable to grow the index to allocate a new
+ // posting list.
+ libtextclassifier3::Status PrependHit(const Hit& hit);
+
+ struct FinalizeResult {
+ // - OK on success
+ // - INVALID_ARGUMENT if there was no pre-existing posting list and no
+ // hits were added
+ // - RESOURCE_EXHAUSTED error if unable to grow the index to allocate a
+ // new posting list.
+ libtextclassifier3::Status status;
+ // Id of the posting list chain that was finalized. Guaranteed to be valid
+ // if status is OK. May be valid if status is non-OK, but previous blocks
+ // were written.
+ PostingListIdentifier id;
+ };
+  // Writes all accumulated hits to storage.
+  //
+  // If accessor points to a posting list chain with multiple posting lists in
+  // the chain and is unable to write the last posting list in the chain,
+  // Finalize will return the error and also populate id with the id of the
+  // second-to-last posting list.
+ static FinalizeResult Finalize(PostingListAccessor accessor);
+
+ private:
+ explicit PostingListAccessor(
+ FlashIndexStorage* storage,
+ std::unique_ptr<uint8_t[]> posting_list_buffer_array,
+ PostingListUsed posting_list_buffer)
+ : storage_(storage),
+ prev_block_identifier_(PostingListIdentifier::kInvalid),
+ posting_list_buffer_array_(std::move(posting_list_buffer_array)),
+ posting_list_buffer_(std::move(posting_list_buffer)),
+ has_reached_posting_list_chain_end_(false) {}
+
+ // Flushes preexisting_posting_list_ to disk if it's a max-sized posting list
+ // and populates prev_block_identifier.
+ // If it's not a max-sized posting list, moves the contents of
+ // preexisting_posting_list_ to posting_list_buffer_ and frees
+ // preexisting_posting_list_.
+ // Sets preexisting_posting_list_ to nullptr.
+ void FlushPreexistingPostingList();
+
+ // Flushes posting_list_buffer_ to a max-sized posting list on disk, setting
+ // its next pointer to prev_block_identifier_ and updating
+ // prev_block_identifier_ to point to the just-written posting list.
+ libtextclassifier3::Status FlushInMemoryPostingList();
+
+ // Frees all posting lists in the posting list chain starting at
+ // prev_block_identifier_.
+ libtextclassifier3::Status FreePostingListChain();
+
+ FlashIndexStorage* storage_; // Does not own.
+
+ // The PostingListIdentifier of the first max-sized posting list in the
+ // posting list chain or PostingListIdentifier::kInvalid if there is no
+ // posting list chain.
+ PostingListIdentifier prev_block_identifier_;
+
+ // An editor to an existing posting list on disk. If available (non-NULL),
+ // we'll try to add all hits to this posting list. Once this posting list
+ // fills up, we'll either 1) chain it (if a max-sized posting list) and put
+ // future hits in posting_list_buffer_ or 2) copy all of its hits into
+ // posting_list_buffer_ and free this pl (if not a max-sized posting list).
+ // TODO(tjbarron) provide a benchmark to demonstrate the effects that re-using
+ // existing posting lists has on latency.
+ std::unique_ptr<PostingListHolder> preexisting_posting_list_;
+
+ // In-memory posting list used to buffer hits before writing them to the
+ // smallest on-disk posting list that will fit them.
+ // posting_list_buffer_array_ owns the memory region that posting_list_buffer_
+ // interprets. Therefore, posting_list_buffer_array_ must have the same
+ // lifecycle as posting_list_buffer_.
+ std::unique_ptr<uint8_t[]> posting_list_buffer_array_;
+ PostingListUsed posting_list_buffer_;
+
+ bool has_reached_posting_list_chain_end_;
+};
+
+} // namespace lib
+} // namespace icing
+
+#endif // ICING_INDEX_POSTING_LIST_ACCESSOR_H_
diff --git a/icing/index/main/posting-list-accessor_test.cc b/icing/index/main/posting-list-accessor_test.cc
new file mode 100644
index 0000000..8a5ef07
--- /dev/null
+++ b/icing/index/main/posting-list-accessor_test.cc
@@ -0,0 +1,384 @@
+// Copyright (C) 2019 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "icing/index/main/posting-list-accessor.h"
+
+#include <cstdint>
+
+#include "gmock/gmock.h"
+#include "gtest/gtest.h"
+#include "icing/file/filesystem.h"
+#include "icing/index/hit/hit.h"
+#include "icing/index/main/flash-index-storage.h"
+#include "icing/index/main/index-block.h"
+#include "icing/index/main/posting-list-identifier.h"
+#include "icing/index/main/posting-list-used.h"
+#include "icing/testing/common-matchers.h"
+#include "icing/testing/hit-test-utils.h"
+#include "icing/testing/tmp-directory.h"
+
+namespace icing {
+namespace lib {
+
+namespace {
+
+using ::testing::ElementsAre;
+using ::testing::ElementsAreArray;
+using ::testing::Eq;
+using ::testing::Lt;
+using ::testing::SizeIs;
+
+TEST(PostingListAccessorStorageTest, HitsAddAndRetrieveProperly) {
+ std::string test_dir = GetTestTempDir() + "/test_dir";
+ std::string file_name = test_dir + "/test_file.idx.index";
+ Filesystem filesystem;
+ ASSERT_TRUE(filesystem.DeleteDirectoryRecursively(test_dir.c_str()));
+ ASSERT_TRUE(filesystem.CreateDirectoryRecursively(test_dir.c_str()));
+
+ ICING_ASSERT_OK_AND_ASSIGN(FlashIndexStorage flash_index_storage,
+ FlashIndexStorage::Create(file_name, &filesystem));
+ // Add some hits! Any hits!
+ std::vector<Hit> hits1 =
+ CreateHits(/*num_hits=*/5, /*desired_byte_length=*/1);
+ ICING_ASSERT_OK_AND_ASSIGN(PostingListAccessor pl_accessor,
+ PostingListAccessor::Create(&flash_index_storage));
+ for (const Hit& hit : hits1) {
+ ICING_ASSERT_OK(pl_accessor.PrependHit(hit));
+ }
+ PostingListAccessor::FinalizeResult result =
+ PostingListAccessor::Finalize(std::move(pl_accessor));
+ ICING_EXPECT_OK(result.status);
+ EXPECT_THAT(result.id.block_index(), Eq(1));
+ EXPECT_THAT(result.id.posting_list_index(), Eq(0));
+
+ // Retrieve some hits.
+ ICING_ASSERT_OK_AND_ASSIGN(PostingListHolder pl_holder,
+ flash_index_storage.GetPostingList(result.id));
+ EXPECT_THAT(pl_holder.posting_list.GetHits(),
+ IsOkAndHolds(ElementsAreArray(hits1.rbegin(), hits1.rend())));
+ EXPECT_THAT(pl_holder.block.next_block_index(), Eq(kInvalidBlockIndex));
+}
+
+TEST(PostingListAccessorStorageTest, PreexistingPLKeepOnSameBlock) {
+ std::string test_dir = GetTestTempDir() + "/test_dir";
+ std::string file_name = test_dir + "/test_file.idx.index";
+ Filesystem filesystem;
+ ASSERT_TRUE(filesystem.DeleteDirectoryRecursively(test_dir.c_str()));
+ ASSERT_TRUE(filesystem.CreateDirectoryRecursively(test_dir.c_str()));
+
+ ICING_ASSERT_OK_AND_ASSIGN(FlashIndexStorage flash_index_storage,
+ FlashIndexStorage::Create(file_name, &filesystem));
+ ICING_ASSERT_OK_AND_ASSIGN(PostingListAccessor pl_accessor,
+ PostingListAccessor::Create(&flash_index_storage));
+ // Add a single hit. This will fit in a min-sized posting list.
+ Hit hit1(/*section_id=*/1, /*document_id=*/0, Hit::kMaxHitScore);
+ ICING_ASSERT_OK(pl_accessor.PrependHit(hit1));
+ PostingListAccessor::FinalizeResult result1 =
+ PostingListAccessor::Finalize(std::move(pl_accessor));
+ ICING_EXPECT_OK(result1.status);
+ // Should have been allocated to the first block.
+ EXPECT_THAT(result1.id.block_index(), Eq(1));
+ EXPECT_THAT(result1.id.posting_list_index(), Eq(0));
+
+  // Add one more hit. A min-sized posting list must be able to fit at least
+  // two hits, so this should NOT cause the previous pl to be reallocated.
+ ICING_ASSERT_OK_AND_ASSIGN(
+ pl_accessor, PostingListAccessor::CreateFromExisting(&flash_index_storage,
+ result1.id));
+ Hit hit2 = CreateHit(hit1, /*desired_byte_length=*/1);
+ ICING_ASSERT_OK(pl_accessor.PrependHit(hit2));
+ PostingListAccessor::FinalizeResult result2 =
+ PostingListAccessor::Finalize(std::move(pl_accessor));
+ ICING_EXPECT_OK(result2.status);
+ // Should have been allocated to the same posting list as the first hit.
+ EXPECT_THAT(result2.id, Eq(result1.id));
+
+ // The posting list at result2.id should hold all of the hits that have been
+ // added.
+ ICING_ASSERT_OK_AND_ASSIGN(PostingListHolder pl_holder,
+ flash_index_storage.GetPostingList(result2.id));
+ EXPECT_THAT(pl_holder.posting_list.GetHits(),
+ IsOkAndHolds(ElementsAre(hit2, hit1)));
+}
+
+TEST(PostingListAccessorStorageTest, PreexistingPLReallocateToLargerPL) {
+ std::string test_dir = GetTestTempDir() + "/test_dir";
+ std::string file_name = test_dir + "/test_file.idx.index";
+ Filesystem filesystem;
+ ASSERT_TRUE(filesystem.DeleteDirectoryRecursively(test_dir.c_str()));
+ ASSERT_TRUE(filesystem.CreateDirectoryRecursively(test_dir.c_str()));
+
+ ICING_ASSERT_OK_AND_ASSIGN(FlashIndexStorage flash_index_storage,
+ FlashIndexStorage::Create(file_name, &filesystem));
+ ICING_ASSERT_OK_AND_ASSIGN(PostingListAccessor pl_accessor,
+ PostingListAccessor::Create(&flash_index_storage));
+ // The smallest posting list size is 15 bytes. The first four hits will be
+ // compressed to one byte each and will be able to fit in the 5 byte padded
+ // region. The last hit will fit in one of the special hits. The posting list
+ // will be ALMOST_FULL and can fit at most 2 more hits.
+ std::vector<Hit> hits1 =
+ CreateHits(/*num_hits=*/5, /*desired_byte_length=*/1);
+ for (const Hit& hit : hits1) {
+ ICING_ASSERT_OK(pl_accessor.PrependHit(hit));
+ }
+ PostingListAccessor::FinalizeResult result1 =
+ PostingListAccessor::Finalize(std::move(pl_accessor));
+ ICING_EXPECT_OK(result1.status);
+ // Should have been allocated to the first block.
+ EXPECT_THAT(result1.id.block_index(), Eq(1));
+ EXPECT_THAT(result1.id.posting_list_index(), Eq(0));
+
+ // Now let's add some more hits!
+ ICING_ASSERT_OK_AND_ASSIGN(
+ pl_accessor, PostingListAccessor::CreateFromExisting(&flash_index_storage,
+ result1.id));
+ // The current posting list can fit at most 2 more hits. Adding 12 more hits
+ // should result in these hits being moved to a larger posting list.
+ std::vector<Hit> hits2 = CreateHits(
+ /*start_docid=*/hits1.back().document_id() + 1, /*num_hits=*/12,
+ /*desired_byte_length=*/1);
+
+ for (const Hit& hit : hits2) {
+ ICING_ASSERT_OK(pl_accessor.PrependHit(hit));
+ }
+ PostingListAccessor::FinalizeResult result2 =
+ PostingListAccessor::Finalize(std::move(pl_accessor));
+ ICING_EXPECT_OK(result2.status);
+ // Should have been allocated to the second (new) block because the posting
+ // list should have grown beyond the size that the first block maintains.
+ EXPECT_THAT(result2.id.block_index(), Eq(2));
+ EXPECT_THAT(result2.id.posting_list_index(), Eq(0));
+
+ // The posting list at result2.id should hold all of the hits that have been
+ // added.
+ for (const Hit& hit : hits2) {
+ hits1.push_back(hit);
+ }
+ ICING_ASSERT_OK_AND_ASSIGN(PostingListHolder pl_holder,
+ flash_index_storage.GetPostingList(result2.id));
+ EXPECT_THAT(pl_holder.posting_list.GetHits(),
+ IsOkAndHolds(ElementsAreArray(hits1.rbegin(), hits1.rend())));
+}
+
+TEST(PostingListAccessorStorageTest, MultiBlockChainsBlocksProperly) {
+ std::string test_dir = GetTestTempDir() + "/test_dir";
+ std::string file_name = test_dir + "/test_file.idx.index";
+ Filesystem filesystem;
+ ASSERT_TRUE(filesystem.DeleteDirectoryRecursively(test_dir.c_str()));
+ ASSERT_TRUE(filesystem.CreateDirectoryRecursively(test_dir.c_str()));
+
+ ICING_ASSERT_OK_AND_ASSIGN(FlashIndexStorage flash_index_storage,
+ FlashIndexStorage::Create(file_name, &filesystem));
+ ICING_ASSERT_OK_AND_ASSIGN(PostingListAccessor pl_accessor,
+ PostingListAccessor::Create(&flash_index_storage));
+ // Add some hits! Any hits!
+ std::vector<Hit> hits1 =
+ CreateHits(/*num_hits=*/5000, /*desired_byte_length=*/1);
+ for (const Hit& hit : hits1) {
+ ICING_ASSERT_OK(pl_accessor.PrependHit(hit));
+ }
+ PostingListAccessor::FinalizeResult result1 =
+ PostingListAccessor::Finalize(std::move(pl_accessor));
+ ICING_EXPECT_OK(result1.status);
+ PostingListIdentifier second_block_id = result1.id;
+ // Should have been allocated to the second block, which holds a max-sized
+ // posting list.
+ EXPECT_THAT(second_block_id, Eq(PostingListIdentifier(
+ /*block_index=*/2, /*posting_list_index=*/0,
+ /*posting_list_index_bits=*/0)));
+
+ // Now let's retrieve them!
+ ICING_ASSERT_OK_AND_ASSIGN(
+ PostingListHolder pl_holder,
+ flash_index_storage.GetPostingList(second_block_id));
+ // This pl_holder will only hold a posting list with the hits that didn't fit
+ // on the first block.
+ ICING_ASSERT_OK_AND_ASSIGN(std::vector<Hit> second_block_hits,
+ pl_holder.posting_list.GetHits());
+ ASSERT_THAT(second_block_hits, SizeIs(Lt(hits1.size())));
+ auto first_block_hits_start = hits1.rbegin() + second_block_hits.size();
+ EXPECT_THAT(second_block_hits,
+ ElementsAreArray(hits1.rbegin(), first_block_hits_start));
+
+ // Now retrieve all of the hits that were on the first block.
+ uint32_t first_block_id = pl_holder.block.next_block_index();
+ EXPECT_THAT(first_block_id, Eq(1));
+
+ PostingListIdentifier pl_id(first_block_id, /*posting_list_index=*/0,
+ /*posting_list_index_bits=*/0);
+ ICING_ASSERT_OK_AND_ASSIGN(pl_holder,
+ flash_index_storage.GetPostingList(pl_id));
+ EXPECT_THAT(
+ pl_holder.posting_list.GetHits(),
+ IsOkAndHolds(ElementsAreArray(first_block_hits_start, hits1.rend())));
+}
+
+TEST(PostingListAccessorStorageTest,
+ PreexistingMultiBlockReusesBlocksProperly) {
+ std::string test_dir = GetTestTempDir() + "/test_dir";
+ std::string file_name = test_dir + "/test_file.idx.index";
+ Filesystem filesystem;
+ ASSERT_TRUE(filesystem.DeleteDirectoryRecursively(test_dir.c_str()));
+ ASSERT_TRUE(filesystem.CreateDirectoryRecursively(test_dir.c_str()));
+
+ ICING_ASSERT_OK_AND_ASSIGN(FlashIndexStorage flash_index_storage,
+ FlashIndexStorage::Create(file_name, &filesystem));
+ ICING_ASSERT_OK_AND_ASSIGN(PostingListAccessor pl_accessor,
+ PostingListAccessor::Create(&flash_index_storage));
+ // Add some hits! Any hits!
+ std::vector<Hit> hits1 =
+ CreateHits(/*num_hits=*/5000, /*desired_byte_length=*/1);
+ for (const Hit& hit : hits1) {
+ ICING_ASSERT_OK(pl_accessor.PrependHit(hit));
+ }
+ PostingListAccessor::FinalizeResult result1 =
+ PostingListAccessor::Finalize(std::move(pl_accessor));
+ ICING_EXPECT_OK(result1.status);
+ PostingListIdentifier first_add_id = result1.id;
+ EXPECT_THAT(first_add_id, Eq(PostingListIdentifier(
+ /*block_index=*/2, /*posting_list_index=*/0,
+ /*posting_list_index_bits=*/0)));
+
+ // Now add a couple more hits. These should fit on the existing, not full
+ // second block.
+ ICING_ASSERT_OK_AND_ASSIGN(
+ pl_accessor, PostingListAccessor::CreateFromExisting(&flash_index_storage,
+ first_add_id));
+ std::vector<Hit> hits2 = CreateHits(
+ /*start_docid=*/hits1.back().document_id() + 1, /*num_hits=*/50,
+ /*desired_byte_length=*/1);
+
+ for (const Hit& hit : hits2) {
+ ICING_ASSERT_OK(pl_accessor.PrependHit(hit));
+ }
+ PostingListAccessor::FinalizeResult result2 =
+ PostingListAccessor::Finalize(std::move(pl_accessor));
+ ICING_EXPECT_OK(result2.status);
+ PostingListIdentifier second_add_id = result2.id;
+ EXPECT_THAT(second_add_id, Eq(first_add_id));
+
+ // We should be able to retrieve all 5050 hits.
+ for (const Hit& hit : hits2) {
+ hits1.push_back(hit);
+ }
+ ICING_ASSERT_OK_AND_ASSIGN(PostingListHolder pl_holder,
+ flash_index_storage.GetPostingList(second_add_id));
+ // This pl_holder will only hold a posting list with the hits that didn't fit
+ // on the first block.
+ ICING_ASSERT_OK_AND_ASSIGN(std::vector<Hit> second_block_hits,
+ pl_holder.posting_list.GetHits());
+ ASSERT_THAT(second_block_hits, SizeIs(Lt(hits1.size())));
+ auto first_block_hits_start = hits1.rbegin() + second_block_hits.size();
+ EXPECT_THAT(second_block_hits,
+ ElementsAreArray(hits1.rbegin(), first_block_hits_start));
+
+ // Now retrieve all of the hits that were on the first block.
+ uint32_t first_block_id = pl_holder.block.next_block_index();
+ EXPECT_THAT(first_block_id, Eq(1));
+
+ PostingListIdentifier pl_id(first_block_id, /*posting_list_index=*/0,
+ /*posting_list_index_bits=*/0);
+ ICING_ASSERT_OK_AND_ASSIGN(pl_holder,
+ flash_index_storage.GetPostingList(pl_id));
+ EXPECT_THAT(
+ pl_holder.posting_list.GetHits(),
+ IsOkAndHolds(ElementsAreArray(first_block_hits_start, hits1.rend())));
+}
+
+TEST(PostingListAccessorStorageTest, InvalidHitReturnsInvalidArgument) {
+ std::string test_dir = GetTestTempDir() + "/test_dir";
+ std::string file_name = test_dir + "/test_file.idx.index";
+ Filesystem filesystem;
+ ASSERT_TRUE(filesystem.DeleteDirectoryRecursively(test_dir.c_str()));
+ ASSERT_TRUE(filesystem.CreateDirectoryRecursively(test_dir.c_str()));
+
+ ICING_ASSERT_OK_AND_ASSIGN(FlashIndexStorage flash_index_storage,
+ FlashIndexStorage::Create(file_name, &filesystem));
+ ICING_ASSERT_OK_AND_ASSIGN(PostingListAccessor pl_accessor,
+ PostingListAccessor::Create(&flash_index_storage));
+ Hit invalid_hit;
+ EXPECT_THAT(pl_accessor.PrependHit(invalid_hit),
+ StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
+}
+
+TEST(PostingListAccessorStorageTest, HitsNotDecreasingReturnsInvalidArgument) {
+ std::string test_dir = GetTestTempDir() + "/test_dir";
+ std::string file_name = test_dir + "/test_file.idx.index";
+ Filesystem filesystem;
+ ASSERT_TRUE(filesystem.DeleteDirectoryRecursively(test_dir.c_str()));
+ ASSERT_TRUE(filesystem.CreateDirectoryRecursively(test_dir.c_str()));
+
+ ICING_ASSERT_OK_AND_ASSIGN(FlashIndexStorage flash_index_storage,
+ FlashIndexStorage::Create(file_name, &filesystem));
+ ICING_ASSERT_OK_AND_ASSIGN(PostingListAccessor pl_accessor,
+ PostingListAccessor::Create(&flash_index_storage));
+ Hit hit1(/*section_id=*/3, /*document_id=*/1, Hit::kMaxHitScore);
+ ICING_ASSERT_OK(pl_accessor.PrependHit(hit1));
+
+ Hit hit2(/*section_id=*/6, /*document_id=*/1, Hit::kMaxHitScore);
+ EXPECT_THAT(pl_accessor.PrependHit(hit2),
+ StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
+
+ Hit hit3(/*section_id=*/2, /*document_id=*/0, Hit::kMaxHitScore);
+ EXPECT_THAT(pl_accessor.PrependHit(hit3),
+ StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
+}
+
+TEST(PostingListAccessorStorageTest, NewPostingListNoHitsAdded) {
+ std::string test_dir = GetTestTempDir() + "/test_dir";
+ std::string file_name = test_dir + "/test_file.idx.index";
+ Filesystem filesystem;
+ ASSERT_TRUE(filesystem.DeleteDirectoryRecursively(test_dir.c_str()));
+ ASSERT_TRUE(filesystem.CreateDirectoryRecursively(test_dir.c_str()));
+
+ ICING_ASSERT_OK_AND_ASSIGN(FlashIndexStorage flash_index_storage,
+ FlashIndexStorage::Create(file_name, &filesystem));
+ ICING_ASSERT_OK_AND_ASSIGN(PostingListAccessor pl_accessor,
+ PostingListAccessor::Create(&flash_index_storage));
+ PostingListAccessor::FinalizeResult result1 =
+ PostingListAccessor::Finalize(std::move(pl_accessor));
+ EXPECT_THAT(result1.status,
+ StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
+}
+
+TEST(PostingListAccessorStorageTest, PreexistingPostingListNoHitsAdded) {
+ std::string test_dir = GetTestTempDir() + "/test_dir";
+ std::string file_name = test_dir + "/test_file.idx.index";
+ Filesystem filesystem;
+ ASSERT_TRUE(filesystem.DeleteDirectoryRecursively(test_dir.c_str()));
+ ASSERT_TRUE(filesystem.CreateDirectoryRecursively(test_dir.c_str()));
+
+ ICING_ASSERT_OK_AND_ASSIGN(FlashIndexStorage flash_index_storage,
+ FlashIndexStorage::Create(file_name, &filesystem));
+ ICING_ASSERT_OK_AND_ASSIGN(PostingListAccessor pl_accessor,
+ PostingListAccessor::Create(&flash_index_storage));
+ Hit hit1(/*section_id=*/3, /*document_id=*/1, Hit::kMaxHitScore);
+ ICING_ASSERT_OK(pl_accessor.PrependHit(hit1));
+ PostingListAccessor::FinalizeResult result1 =
+ PostingListAccessor::Finalize(std::move(pl_accessor));
+ ICING_ASSERT_OK(result1.status);
+
+ ICING_ASSERT_OK_AND_ASSIGN(PostingListAccessor pl_accessor2,
+ PostingListAccessor::CreateFromExisting(
+ &flash_index_storage, result1.id));
+ PostingListAccessor::FinalizeResult result2 =
+ PostingListAccessor::Finalize(std::move(pl_accessor2));
+ ICING_ASSERT_OK(result2.status);
+}
+
+} // namespace
+
+} // namespace lib
+} // namespace icing
diff --git a/icing/index/main/posting-list-identifier.cc b/icing/index/main/posting-list-identifier.cc
new file mode 100644
index 0000000..1cdac65
--- /dev/null
+++ b/icing/index/main/posting-list-identifier.cc
@@ -0,0 +1,25 @@
+// Copyright (C) 2019 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "icing/index/main/posting-list-identifier.h"
+
+namespace icing {
+namespace lib {
+
+PostingListIdentifier PostingListIdentifier::kInvalid(
+ kInvalidBlockIndex, /*posting_list_index=*/0,
+ PostingListIdentifier::kEncodedPostingListIndexBits - 1);
+
+} // namespace lib
+} // namespace icing
diff --git a/icing/index/main/posting-list-identifier.h b/icing/index/main/posting-list-identifier.h
new file mode 100644
index 0000000..4953865
--- /dev/null
+++ b/icing/index/main/posting-list-identifier.h
@@ -0,0 +1,116 @@
+// Copyright (C) 2019 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef ICING_INDEX_POSTING_LIST_IDENTIFIER_H_
+#define ICING_INDEX_POSTING_LIST_IDENTIFIER_H_
+
+#include "icing/index/main/index-block.h"
+#include "icing/index/main/posting-list-free.h"
+#include "icing/legacy/index/icing-bit-util.h"
+
+namespace icing {
+namespace lib {
+
+// 1M blocks * 4K page size = 4GB index
+inline constexpr int kBlockIndexBits = 20;
+inline constexpr int kMaxBlockIndex = (1u << kBlockIndexBits) - 1;
+
+// Class used to store information necessary to identify any posting list within
+// the index.
+//
+// The 20 leftmost bits in this identifier encode the block index. The 12
+// rightmost bits encode both the posting list index and the maximum number of
+// bits required to encode a posting list index on that block.
+//
+// Ex. An index block containing a max of 68 posting lists each of size 60
+// bytes (and thus 7 posting list bits), with a block index of 13 and a posting
+// list index of 5.
+// 0000 0000 0000 0000 1101 1111 0000 0101
+// |__________block-index_______|__pad__|_pl-index_|
+//
+// "pad" is some region starting at kEncodedPostingListIndexBits (12) bit and
+// continuing rightward until reaching a terminating "0". This padding encodes
+// the posting list bits value - posting list bits value is the number of bits
+// after the terminating '0' of the "pad" region.
+//
+// This value will eventually be stored in the Main Lexicon.
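+//
+// Worked decoding of the example above (illustrative, not part of the
+// original change): the 20 leftmost bits, 0000 0000 0000 0000 1101, give
+// block index 13. Of the 12 rightmost bits, 1111 0000 0101, the pad is the
+// run of 1s (1111) up to the terminating 0, leaving 7 posting list index
+// bits; those bits, 000 0101, give posting list index 5.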
+class PostingListIdentifier {
+ // 1 bit is wasted to encode max pl index bits so there can be at most 2^11
+ // posting lists per block. Block size would have to be >=40020 bytes for
+  // there to be more than 2048 posting lists in a block.
+ static constexpr int kEncodedPostingListIndexBits = 12;
+ static_assert(kEncodedPostingListIndexBits + kBlockIndexBits <=
+ 8 * sizeof(uint32_t),
+ "Not enough room in PostingListIdentifier value to encode "
+ "block index and posting list index.");
+
+ public:
+ static PostingListIdentifier kInvalid;
+
+ // 1. block_index - the index of this block within the FlashIndexStorage file
+ // 2. posting_list_index - the index of this posting list within the block
+ // 3. posting_list_index_bits - the number of bits needed to encode the
+ // largest posting_list_index that this block can have.
+ PostingListIdentifier(uint32_t block_index,
+ PostingListIndex posting_list_index,
+ int posting_list_index_bits) {
+ val_ = 0;
+ BITFIELD_OR(val_, /*offset=*/0, /*len=*/posting_list_index_bits,
+ /*val=*/static_cast<uint64_t>(posting_list_index));
+ BITFIELD_OR(
+ val_, /*offset=*/posting_list_index_bits + 1,
+ /*len=*/kEncodedPostingListIndexBits - posting_list_index_bits - 1,
+ /*val=*/~0u);
+ BITFIELD_OR(val_, /*offset=*/kEncodedPostingListIndexBits,
+ /*len=*/kBlockIndexBits,
+ /*val=*/block_index);
+ }
+
+ int block_index() const {
+ return BITFIELD_GET(val_, kEncodedPostingListIndexBits, kBlockIndexBits);
+ }
+
+ PostingListIndex posting_list_index() const {
+ return BITFIELD_GET(val_, 0, posting_list_index_bits());
+ }
+
+ // Returns the maximum number of bits that a posting list index on the block
+ // referred to by block_index could use.
+ int posting_list_index_bits() const {
+ for (int bits = kEncodedPostingListIndexBits - 1; bits >= 0; --bits) {
+ if (((1u << bits) & val_) == 0) {
+ // Got to the zero bit. This is the start of pl index.
+ return bits;
+ }
+ }
+ return -1;
+ }
+
+ bool is_valid() const { return *this != kInvalid; }
+
+ bool operator==(const PostingListIdentifier& rhs) const {
+ return val_ == rhs.val_;
+ }
+ bool operator!=(const PostingListIdentifier& rhs) const {
+ return !(*this == rhs);
+ }
+
+ private:
+ uint32_t val_;
+};
+
+} // namespace lib
+} // namespace icing
+
+#endif // ICING_INDEX_POSTING_LIST_IDENTIFIER_H_
diff --git a/icing/jni/icing-search-engine-jni.cc b/icing/jni/icing-search-engine-jni.cc
index 4396007..71752dd 100644
--- a/icing/jni/icing-search-engine-jni.cc
+++ b/icing/jni/icing-search-engine-jni.cc
@@ -302,6 +302,24 @@ Java_com_google_android_icing_IcingSearchEngine_nativeDeleteBySchemaType(
}
JNIEXPORT jbyteArray JNICALL
+Java_com_google_android_icing_IcingSearchEngine_nativeDeleteByQuery(
+ JNIEnv* env, jclass clazz, jlong native_pointer,
+ jbyteArray search_spec_bytes) {
+ icing::lib::IcingSearchEngine* icing =
+ GetIcingSearchEnginePointer(native_pointer);
+
+ icing::lib::SearchSpecProto search_spec_proto;
+ if (!ParseProtoFromJniByteArray(env, search_spec_bytes, &search_spec_proto)) {
+ ICING_LOG(ERROR) << "Failed to parse SearchSpecProto in nativeSearch";
+ return nullptr;
+ }
+ icing::lib::DeleteResultProto delete_result_proto =
+ icing->DeleteByQuery(search_spec_proto);
+
+ return SerializeProtoToJniByteArray(env, delete_result_proto);
+}
+
+JNIEXPORT jbyteArray JNICALL
Java_com_google_android_icing_IcingSearchEngine_nativePersistToDisk(
JNIEnv* env, jclass clazz, jlong native_pointer) {
icing::lib::IcingSearchEngine* icing =
diff --git a/icing/legacy/core/icing-string-util.cc b/icing/legacy/core/icing-string-util.cc
index 1954cd3..2eb64ac 100644
--- a/icing/legacy/core/icing-string-util.cc
+++ b/icing/legacy/core/icing-string-util.cc
@@ -11,13 +11,6 @@
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
-
-// Copyright 2011 Google Inc. All Rights Reserved.
-// Author: ulas@google.com (Ulas Kirazci)
-// sbanacho@google.com (Scott Banachowski)
-//
-// This is a list of IsGoogleLetter letters. It is copied from
-// google3/util/utf8/proptables/letters.txt CL 19164202.
#include "icing/legacy/core/icing-string-util.h"
#include <stdarg.h>
@@ -34,7 +27,6 @@
namespace icing {
namespace lib {
-namespace {} // namespace
uint32_t IcingStringUtil::UpdateCrc32(uint32_t crc, const char *str, int len) {
if (len > 0) {
crc = ~crc32(~crc, reinterpret_cast<const Bytef *>(str), len);
diff --git a/icing/legacy/core/icing-string-util.h b/icing/legacy/core/icing-string-util.h
index 4ea93ec..767e581 100644
--- a/icing/legacy/core/icing-string-util.h
+++ b/icing/legacy/core/icing-string-util.h
@@ -12,10 +12,6 @@
// See the License for the specific language governing permissions and
// limitations under the License.
-// Copyright 2011 Google Inc. All Rights Reserved.
-// Author: ulas@google.com (Ulas Kirazci)
-// sbanacho@google.com (Scott Banachowski)
-
#ifndef ICING_LEGACY_CORE_ICING_STRING_UTIL_H_
#define ICING_LEGACY_CORE_ICING_STRING_UTIL_H_
diff --git a/icing/legacy/index/icing-dynamic-trie.cc b/icing/legacy/index/icing-dynamic-trie.cc
index ee3d3a2..29843ba 100644
--- a/icing/legacy/index/icing-dynamic-trie.cc
+++ b/icing/legacy/index/icing-dynamic-trie.cc
@@ -96,14 +96,28 @@ using std::vector;
namespace icing {
namespace lib {
+namespace {
+constexpr uint32_t kInvalidNodeIndex = (1U << 24) - 1;
+constexpr uint32_t kInvalidNextIndex = ~0U;
+
+// Returns the number of valid nexts in the array.
+int GetValidNextsSize(IcingDynamicTrie::Next *next_array_start,
+ int next_array_length) {
+ int valid_nexts_length = 0;
+ for (; valid_nexts_length < next_array_length &&
+ next_array_start[valid_nexts_length].node_index() != kInvalidNodeIndex;
+ ++valid_nexts_length) {
+ }
+ return valid_nexts_length;
+}
+} // namespace
+
// Based on the bit field widths.
const uint32_t IcingDynamicTrie::Options::kMaxNodes = (1U << 24) - 1;
const uint32_t IcingDynamicTrie::Options::kMaxNexts = (1U << 27) - 1;
const uint32_t IcingDynamicTrie::Options::kMaxSuffixesSize = 1U << 27;
const uint32_t IcingDynamicTrie::Options::kMaxValueSize = 1U << 16;
-const uint32_t IcingDynamicTrie::kInvalidNodeIndex = (1U << 24) - 1;
-const uint32_t IcingDynamicTrie::kInvalidNextIndex = ~0U;
const uint32_t IcingDynamicTrie::kInvalidSuffixIndex = ~0U;
const int IcingDynamicTrie::kMaxNextArraySize;
@@ -891,7 +905,7 @@ bool IcingDynamicTrie::IcingDynamicTrieStorage::Header::Init(
bool IcingDynamicTrie::IcingDynamicTrieStorage::Header::SerializeToArray(
uint8_t *buf, uint32_t buf_size) const {
- uint32_t size = hdr.ByteSize();
+ uint32_t size = hdr.ByteSizeLong();
if (size + sizeof(kMagic) + sizeof(uint32_t) > buf_size) return false;
memcpy(buf, &kMagic, sizeof(kMagic));
memcpy(buf + sizeof(kMagic), &size, sizeof(uint32_t));
@@ -1502,6 +1516,53 @@ void IcingDynamicTrie::Clear() {
deleted_bitmap_->Truncate(0);
}
+bool IcingDynamicTrie::ClearSuffixAndValue(uint32_t suffix_value_index) {
+ // The size 1 below is for a '\0' between the suffix and the value.
+ size_t suffix_and_value_length =
+ strlen(this->storage_->GetSuffix(suffix_value_index)) + 1 +
+ this->value_size();
+ char *mutable_suffix_and_value = this->storage_->GetMutableSuffix(
+ suffix_value_index, suffix_and_value_length);
+
+ if (mutable_suffix_and_value == nullptr) {
+ return false;
+ }
+
+ memset(mutable_suffix_and_value, 0, suffix_and_value_length);
+ return true;
+}
+
+bool IcingDynamicTrie::ResetNext(uint32_t next_index) {
+ Next *mutable_next =
+ this->storage_->GetMutableNextArray(next_index, /*len=*/1);
+
+ if (mutable_next == nullptr) {
+ return false;
+ }
+
+ mutable_next->set_val(0);
+ mutable_next->set_node_index(kInvalidNodeIndex);
+ return true;
+}
+
+bool IcingDynamicTrie::SortNextArray(const Node *node) {
+ if (node == nullptr) {
+ // Nothing to sort, return success directly.
+ return true;
+ }
+
+ uint32_t next_array_buffer_size = 1u << node->log2_num_children();
+ Next *next_array_start = this->storage_->GetMutableNextArray(
+ node->next_index(), next_array_buffer_size);
+
+ if (next_array_start == nullptr) {
+ return false;
+ }
+
+ std::sort(next_array_start, next_array_start + next_array_buffer_size - 1);
+ return true;
+}
+
bool IcingDynamicTrie::Insert(const char *key, const void *value,
uint32_t *value_index, bool replace,
bool *pnew_key) {
@@ -1641,15 +1702,12 @@ bool IcingDynamicTrie::Insert(const char *key, const void *value,
new_leaf_node->set_log2_num_children(0);
// Figure out the real length of the existing next array.
- Next *cur_next = storage_->GetMutableNextArray(
- best_node->next_index(), 1 << best_node->log2_num_children());
- int next_len = 0;
- for (; next_len < (1 << best_node->log2_num_children()) &&
- cur_next[next_len].node_index() != kInvalidNodeIndex;
- next_len++) {
- }
+ uint32_t next_array_buffer_size = 1u << best_node->log2_num_children();
+ Next *cur_next = storage_->GetMutableNextArray(best_node->next_index(),
+ next_array_buffer_size);
+ int next_len = GetValidNextsSize(cur_next, next_array_buffer_size);
Next *new_next = cur_next;
- if (next_len == (1 << best_node->log2_num_children())) {
+  if (next_len == next_array_buffer_size) {
// Allocate a new, larger, array.
new_next = storage_->AllocNextArray(next_len + 1);
memcpy(new_next, cur_next, sizeof(Next) * next_len);
@@ -2072,7 +2130,8 @@ const IcingDynamicTrie::Next *IcingDynamicTrie::LowerBound(
}
void IcingDynamicTrie::FindBestNode(const char *key, uint32_t *best_node_index,
- int *key_offset, bool prefix) const {
+ int *key_offset, bool prefix,
+ bool utf8) const {
// Find the best node such that:
//
// - If key is NOT in the trie, key[0..key_offset) is a prefix to
@@ -2093,6 +2152,8 @@ void IcingDynamicTrie::FindBestNode(const char *key, uint32_t *best_node_index,
const Node *cur_node = storage_->GetRootNode();
const char *cur_key = key;
+ const Node *utf8_node = cur_node;
+ const char *utf8_key = cur_key;
while (!cur_node->is_leaf()) {
const Next *found = GetNextByChar(cur_node, *cur_key);
if (!found) break;
@@ -2108,12 +2169,101 @@ void IcingDynamicTrie::FindBestNode(const char *key, uint32_t *best_node_index,
break;
}
cur_key++;
+
+ if (utf8 && i18n_utils::IsLeadUtf8Byte(*cur_key)) {
+ utf8_node = cur_node;
+ utf8_key = cur_key;
+ }
+ }
+
+ if (utf8) {
+ // Rewind.
+ cur_node = utf8_node;
+ cur_key = utf8_key;
}
*best_node_index = storage_->GetNodeIndex(cur_node);
*key_offset = reinterpret_cast<const char *>(cur_key) - key;
}
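+
+// Illustrative example (an assumption-based sketch, not part of the original
+// change): if the trie already contains "foot" and "fool", then
+// FindNewBranchingPrefixLength("four") returns 2, because inserting "four"
+// would create a new branch point right after the shared prefix "fo".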
+int IcingDynamicTrie::FindNewBranchingPrefixLength(const char *key,
+ bool utf8) const {
+ if (storage_->empty()) {
+ return kNoBranchFound;
+ }
+
+ uint32_t best_node_index;
+ int key_offset;
+ FindBestNode(key, &best_node_index, &key_offset, /*prefix=*/true, utf8);
+ const Node *cur_node = storage_->GetNode(best_node_index);
+ const char *cur_key = key + key_offset;
+ if (cur_node->is_leaf()) {
+ // Prefix in the trie. Split at leaf.
+ const char *prev_suffix = storage_->GetSuffix(cur_node->next_index());
+ while (*prev_suffix != '\0' && *prev_suffix == *cur_key) {
+ prev_suffix++;
+ cur_key++;
+ }
+
+ // Equal strings? No branching.
+ if (*prev_suffix == '\0' && *cur_key == '\0') {
+ return kNoBranchFound;
+ }
+
+ if (utf8) {
+ // Rewind to utf8 boundary.
+ size_t offset = i18n_utils::SafeTruncateUtf8Length(key, cur_key - key);
+ cur_key = key + offset;
+ }
+
+ return cur_key - key;
+ } else if (cur_node->log2_num_children() == 0) {
+ // Intermediate node going from no branching to branching.
+ return cur_key - key;
+ }
+
+ // If we've reached this point, then we're already at a branch point. So there
+ // is no *new* branch point.
+ return kNoBranchFound;
+}
+
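+// Illustrative example (an assumption-based sketch, not part of the original
+// change): in a trie containing "foo", "fool", "foot" and "far",
+// FindBranchingPrefixLengths("foot") returns {1, 3}: "f" branches into
+// "fa..."/"fo..." and "foo" branches into "foo"/"fool"/"foot".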
+std::vector<int> IcingDynamicTrie::FindBranchingPrefixLengths(const char *key,
+ bool utf8) const {
+ std::vector<int> prefix_lengths;
+
+ if (storage_->empty()) {
+ return prefix_lengths;
+ }
+
+ const Node *cur_node = storage_->GetRootNode();
+ const char *cur_key = key;
+ while (*cur_key && !cur_node->is_leaf()) {
+ // Branching prefix?
+ if (cur_node->log2_num_children() > 0) {
+ int len = cur_key - key;
+ if (utf8) {
+ // Do not cut mid-utf8. Walk up to utf8 boundary.
+ len = i18n_utils::SafeTruncateUtf8Length(key, len);
+ if (prefix_lengths.empty() || len != prefix_lengths.back()) {
+ prefix_lengths.push_back(len);
+ }
+ } else {
+ prefix_lengths.push_back(len);
+ }
+ }
+
+ // Move to next.
+ const Next *found = GetNextByChar(cur_node, *cur_key);
+ if (found == nullptr) {
+ break;
+ }
+ cur_node = storage_->GetNode(found->node_index());
+
+ ++cur_key;
+ }
+ return prefix_lengths;
+}
+
void IcingDynamicTrie::GetDebugInfo(int verbosity, std::string *out) const {
Stats stats;
CollectStats(&stats);
@@ -2248,6 +2398,102 @@ bool IcingDynamicTrie::ClearDeleted(uint32_t value_index) {
return deleted_bitmap_->SetBit(idx, false);
}
+// Steps:
+// 1. Find the key in the trie.
+// 2. Remove the suffix and the value.
+// 3. Reset the nexts that point to the nodes to be removed.
+// 4. Sort any next array if needed.
+bool IcingDynamicTrie::Delete(const std::string_view key) {
+ if (!is_initialized()) {
+ ICING_LOG(ERROR) << "DynamicTrie not initialized";
+ return false;
+ }
+
+ if (storage_->empty()) {
+ // Nothing to delete.
+ return true;
+ }
+
+ // Tries to find the key in the trie, starting from the root.
+ const Node *current_node = storage_->GetRootNode();
+
+ // The node after which we start to remove data.
+ const Node *last_multichild_node = nullptr;
+
+ // While visiting the trie nodes, we store the indices of Nexts that point
+ // to all the nodes after last_multichild_node. Those nodes must be
+  // consecutive and all have only one child. Resetting those Nexts removes
+  // the key's data from the trie.
+ std::vector<uint32_t> nexts_to_reset;
+ nexts_to_reset.reserve(key.length());
+
+ // Iterates through chars in the key, finds nodes in the trie until a leaf
+ // node is reached. The max number of loops is key.length() + 1 because we
+ // start from the root.
+ for (size_t i = 0; i <= key.length(); ++i) {
+ if (current_node->is_leaf()) {
+ // Leaf node, now check the suffix.
+ if (key.substr(i) != storage_->GetSuffix(current_node->next_index())) {
+ // Key does not exist in the trie, nothing to delete.
+ return true;
+ }
+ // Otherwise, key is found.
+ break;
+ }
+
+ // Finds the next char.
+ const Next *next;
+ if (i == key.length()) {
+ // When we're at the end of the key, the next char is the termination char
+ // '\0'.
+ next = GetNextByChar(current_node, '\0');
+ } else {
+ next = GetNextByChar(current_node, key[i]);
+ }
+
+ if (next == nullptr) {
+ // Key does not exist in the trie, nothing to delete.
+ return true;
+ }
+
+    // Checks the real size of the next array.
+ uint32_t next_array_buffer_size = 1u << current_node->log2_num_children();
+ Next *next_array_start = storage_->GetMutableNextArray(
+ current_node->next_index(), next_array_buffer_size);
+ int valid_next_array_size =
+ GetValidNextsSize(next_array_start, next_array_buffer_size);
+ if (valid_next_array_size == 0) {
+ // Key does not exist in the trie, nothing to delete.
+ // This shouldn't happen, but we put a sanity check here in case something
+ // is wrong.
+ return true;
+ } else if (valid_next_array_size == 1) {
+ // Single-child branch will be deleted.
+ nexts_to_reset.push_back(storage_->GetNextArrayIndex(next));
+ } else {
+      // We've reached a node with multiple children, so none of the
+      // previously seen nodes should be removed.
+ last_multichild_node = current_node;
+ nexts_to_reset.clear();
+ nexts_to_reset.push_back(storage_->GetNextArrayIndex(next));
+ }
+
+ // Updates current_node.
+ current_node = storage_->GetNode(next->node_index());
+ }
+ // Now we've found the key in the trie.
+
+ ClearSuffixAndValue(current_node->next_index());
+
+ // Resets nexts to remove key information.
+ for (uint32_t next_index : nexts_to_reset) {
+ ResetNext(next_index);
+ }
+ SortNextArray(last_multichild_node);
+
+ return true;
+}
+
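A short usage sketch, mirroring the DeletionShouldWorkWithMultipleTrieBranches test added below, of what Delete() touches internally (assuming an initialized trie as in those tests):

uint32_t value = 1;
trie.Insert("batter", &value);
trie.Insert("battle", &value);
trie.Insert("bar", &value);

// Walking "batter", the last node with more than one valid Next is "batt"
// ('e' vs 'l'), so nexts_to_reset ends up holding only the Next for 'e':
// Delete() clears the leaf's suffix and value, resets that Next, and
// re-sorts "batt"'s next array.
trie.Delete("batter");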
bool IcingDynamicTrie::ClearPropertyForAllValues(uint32_t property_id) {
if (!is_initialized()) {
ICING_LOG(FATAL) << "DynamicTrie not initialized";
diff --git a/icing/legacy/index/icing-dynamic-trie.h b/icing/legacy/index/icing-dynamic-trie.h
index c33be96..7fe290b 100644
--- a/icing/legacy/index/icing-dynamic-trie.h
+++ b/icing/legacy/index/icing-dynamic-trie.h
@@ -288,6 +288,16 @@ class IcingDynamicTrie : public IIcingStorage {
// Empty out the trie without closing or removing.
void Clear();
+ // Clears the suffix and value at the given index. Returns true on success.
+ bool ClearSuffixAndValue(uint32_t suffix_value_index);
+
+ // Resets the next at the given index so that it points to no node.
+ // Returns true on success.
+ bool ResetNext(uint32_t next_index);
+
+ // Sorts the next array of the node. Returns true on success.
+ bool SortNextArray(const Node *node);
+
// Sync to disk.
bool Sync() override;
@@ -375,6 +385,16 @@ class IcingDynamicTrie : public IIcingStorage {
bool is_full_match() const { return value_index != kInvalidValueIndex; }
};
+ static constexpr int kNoBranchFound = -1;
+  // Returns the length of the prefix at which a new branch would be created
+  // if key were inserted. If utf8 is true, does not cut key mid-utf8. Returns
+  // kNoBranchFound if no new branch would be created.
+ int FindNewBranchingPrefixLength(const char *key, bool utf8) const;
+
+  // Finds the lengths of all prefixes of key at which the trie branches.
+  // Excludes the key itself. If utf8 is true, does not cut key mid-utf8.
+ std::vector<int> FindBranchingPrefixLengths(const char *key, bool utf8) const;
+
void GetDebugInfo(int verbosity, std::string *out) const override;
double min_free_fraction() const;
@@ -402,6 +422,10 @@ class IcingDynamicTrie : public IIcingStorage {
// Clears the deleted property for each value.
bool ClearDeleted(uint32_t value_index);
+  // Deletes the entry associated with the key. Data cannot be recovered after
+  // the deletion. Returns true on success.
+ bool Delete(std::string_view key);
+
// Clear a specific property id from all values. For each value that has this
// property cleared, also check to see if it was the only property set; if
// so, set the deleted property for the value to indicate it no longer has any
@@ -575,8 +599,6 @@ class IcingDynamicTrie : public IIcingStorage {
void GetHeader(IcingDynamicTrieHeader *hdr) const;
void SetHeader(const IcingDynamicTrieHeader &new_hdr);
- static const uint32_t kInvalidNodeIndex;
- static const uint32_t kInvalidNextIndex;
static const uint32_t kInvalidSuffixIndex;
// Stats helpers.
@@ -587,7 +609,7 @@ class IcingDynamicTrie : public IIcingStorage {
const Next *LowerBound(const Next *start, const Next *end,
uint8_t key_char) const;
void FindBestNode(const char *key, uint32_t *best_node_index, int *key_offset,
- bool prefix) const;
+ bool prefix, bool utf8 = false) const;
// For value properties. This truncates the data by clearing it, but leaving
// the storage intact.
diff --git a/icing/legacy/index/icing-dynamic-trie_test.cc b/icing/legacy/index/icing-dynamic-trie_test.cc
index 4fae52a..193765b 100644
--- a/icing/legacy/index/icing-dynamic-trie_test.cc
+++ b/icing/legacy/index/icing-dynamic-trie_test.cc
@@ -746,6 +746,222 @@ TEST_F(IcingDynamicTrieTest, Compact) {
}
}
+TEST_F(IcingDynamicTrieTest, DeletionShouldWorkWhenRootIsLeaf) {
+ IcingFilesystem filesystem;
+ IcingDynamicTrie trie(trie_files_prefix_, IcingDynamicTrie::RuntimeOptions(),
+ &filesystem);
+ ASSERT_TRUE(trie.CreateIfNotExist(IcingDynamicTrie::Options()));
+ ASSERT_TRUE(trie.Init());
+
+ // Inserts a key, the root is a leaf.
+ uint32_t value = 1;
+ ASSERT_TRUE(trie.Insert("foo", &value));
+ ASSERT_TRUE(trie.Find("foo", &value));
+
+ // Deletes the key.
+ EXPECT_TRUE(trie.Delete("foo"));
+ EXPECT_FALSE(trie.Find("foo", &value));
+}
+
+TEST_F(IcingDynamicTrieTest, DeletionShouldWorkWhenLastCharIsLeaf) {
+ IcingFilesystem filesystem;
+ IcingDynamicTrie trie(trie_files_prefix_, IcingDynamicTrie::RuntimeOptions(),
+ &filesystem);
+ ASSERT_TRUE(trie.CreateIfNotExist(IcingDynamicTrie::Options()));
+ ASSERT_TRUE(trie.Init());
+
+  // Inserts "bar" and "ba"; the trie structure looks like:
+ // root
+ // |
+ // b
+ // |
+ // a
+ // / \
+ // null r
+ uint32_t value = 1;
+ ASSERT_TRUE(trie.Insert("bar", &value));
+ ASSERT_TRUE(trie.Insert("ba", &value));
+ ASSERT_TRUE(trie.Find("bar", &value));
+ ASSERT_TRUE(trie.Find("ba", &value));
+
+ // Deletes "bar". "r" is a leaf node in the trie.
+ EXPECT_TRUE(trie.Delete("bar"));
+ EXPECT_FALSE(trie.Find("bar", &value));
+ EXPECT_TRUE(trie.Find("ba", &value));
+}
+
+TEST_F(IcingDynamicTrieTest, DeletionShouldWorkWithTerminationNode) {
+ IcingFilesystem filesystem;
+ IcingDynamicTrie trie(trie_files_prefix_, IcingDynamicTrie::RuntimeOptions(),
+ &filesystem);
+ ASSERT_TRUE(trie.CreateIfNotExist(IcingDynamicTrie::Options()));
+ ASSERT_TRUE(trie.Init());
+
+  // Inserts "bar" and "ba"; the trie structure looks like:
+ // root
+ // |
+ // b
+ // |
+ // a
+ // / \
+ // null r
+ uint32_t value = 1;
+ ASSERT_TRUE(trie.Insert("bar", &value));
+ ASSERT_TRUE(trie.Insert("ba", &value));
+ ASSERT_TRUE(trie.Find("bar", &value));
+ ASSERT_TRUE(trie.Find("ba", &value));
+
+  // Deletes "ba", which is a key with a termination node in the trie.
+ EXPECT_TRUE(trie.Delete("ba"));
+ EXPECT_FALSE(trie.Find("ba", &value));
+ EXPECT_TRUE(trie.Find("bar", &value));
+}
+
+TEST_F(IcingDynamicTrieTest, DeletionShouldWorkWithMultipleNexts) {
+ IcingFilesystem filesystem;
+ IcingDynamicTrie trie(trie_files_prefix_, IcingDynamicTrie::RuntimeOptions(),
+ &filesystem);
+ ASSERT_TRUE(trie.CreateIfNotExist(IcingDynamicTrie::Options()));
+ ASSERT_TRUE(trie.Init());
+
+  // Inserts "ba", "bb", "bc", and "bd"; the trie structure looks like:
+ // root
+ // |
+ // b
+ // / | | \
+ // a b c d
+ uint32_t value = 1;
+ ASSERT_TRUE(trie.Insert("ba", &value));
+ ASSERT_TRUE(trie.Insert("bb", &value));
+ ASSERT_TRUE(trie.Insert("bc", &value));
+ ASSERT_TRUE(trie.Insert("bd", &value));
+ ASSERT_TRUE(trie.Find("ba", &value));
+ ASSERT_TRUE(trie.Find("bb", &value));
+ ASSERT_TRUE(trie.Find("bc", &value));
+ ASSERT_TRUE(trie.Find("bd", &value));
+
+ // Deletes "bc".
+ EXPECT_TRUE(trie.Delete("bc"));
+ EXPECT_FALSE(trie.Find("bc", &value));
+ EXPECT_TRUE(trie.Find("ba", &value));
+ EXPECT_TRUE(trie.Find("bb", &value));
+ EXPECT_TRUE(trie.Find("bd", &value));
+}
+
+TEST_F(IcingDynamicTrieTest, DeletionShouldWorkWithMultipleTrieBranches) {
+ IcingFilesystem filesystem;
+ IcingDynamicTrie trie(trie_files_prefix_, IcingDynamicTrie::RuntimeOptions(),
+ &filesystem);
+ ASSERT_TRUE(trie.CreateIfNotExist(IcingDynamicTrie::Options()));
+ ASSERT_TRUE(trie.Init());
+
+  // Inserts "batter", "battle", and "bar"; the trie structure looks like:
+ // root
+ // |
+ // b
+ // |
+ // a
+ // / \
+ // t r
+ // |
+ // t
+ // / \
+ // e l
+ // | |
+ // r e
+ uint32_t value = 1;
+ ASSERT_TRUE(trie.Insert("batter", &value));
+ ASSERT_TRUE(trie.Insert("battle", &value));
+ ASSERT_TRUE(trie.Insert("bar", &value));
+ ASSERT_TRUE(trie.Find("batter", &value));
+ ASSERT_TRUE(trie.Find("battle", &value));
+ ASSERT_TRUE(trie.Find("bar", &value));
+
+ // Deletes "batter".
+ EXPECT_TRUE(trie.Delete("batter"));
+ EXPECT_FALSE(trie.Find("batter", &value));
+ EXPECT_TRUE(trie.Find("battle", &value));
+ EXPECT_TRUE(trie.Find("bar", &value));
+}
+
+TEST_F(IcingDynamicTrieTest, InsertionShouldWorkAfterDeletion) {
+ IcingFilesystem filesystem;
+ IcingDynamicTrie trie(trie_files_prefix_, IcingDynamicTrie::RuntimeOptions(),
+ &filesystem);
+ ASSERT_TRUE(trie.CreateIfNotExist(IcingDynamicTrie::Options()));
+ ASSERT_TRUE(trie.Init());
+
+ // Inserts some keys.
+ uint32_t value = 1;
+ ASSERT_TRUE(trie.Insert("bar", &value));
+ ASSERT_TRUE(trie.Insert("bed", &value));
+ ASSERT_TRUE(trie.Insert("foo", &value));
+
+ // Deletes a key
+ ASSERT_TRUE(trie.Delete("bed"));
+ ASSERT_FALSE(trie.Find("bed", &value));
+
+ // Inserts after deletion
+ EXPECT_TRUE(trie.Insert("bed", &value));
+ EXPECT_TRUE(trie.Insert("bedroom", &value));
+ EXPECT_TRUE(trie.Find("bed", &value));
+ EXPECT_TRUE(trie.Find("bedroom", &value));
+}
+
+TEST_F(IcingDynamicTrieTest, IteratorShouldWorkAfterDeletion) {
+ IcingFilesystem filesystem;
+ IcingDynamicTrie trie(trie_files_prefix_, IcingDynamicTrie::RuntimeOptions(),
+ &filesystem);
+ ASSERT_TRUE(trie.CreateIfNotExist(IcingDynamicTrie::Options()));
+ ASSERT_TRUE(trie.Init());
+
+ // Inserts some keys.
+ uint32_t value = 1;
+ ASSERT_TRUE(trie.Insert("bar", &value));
+ ASSERT_TRUE(trie.Insert("bed", &value));
+ ASSERT_TRUE(trie.Insert("foo", &value));
+
+ // Deletes a key
+ ASSERT_TRUE(trie.Delete("bed"));
+
+ // Iterates through all keys
+ IcingDynamicTrie::Iterator iterator_all(trie, "");
+ std::vector<std::string> results;
+ for (; iterator_all.IsValid(); iterator_all.Advance()) {
+ results.emplace_back(iterator_all.GetKey());
+ }
+ EXPECT_THAT(results, ElementsAre("bar", "foo"));
+
+ // Iterates through keys that start with "b"
+ IcingDynamicTrie::Iterator iterator_b(trie, "b");
+ results.clear();
+ for (; iterator_b.IsValid(); iterator_b.Advance()) {
+ results.emplace_back(iterator_b.GetKey());
+ }
+ EXPECT_THAT(results, ElementsAre("bar"));
+}
+
+TEST_F(IcingDynamicTrieTest, DeletingNonExistingKeyShouldReturnTrue) {
+ IcingFilesystem filesystem;
+ IcingDynamicTrie trie(trie_files_prefix_, IcingDynamicTrie::RuntimeOptions(),
+ &filesystem);
+ ASSERT_TRUE(trie.CreateIfNotExist(IcingDynamicTrie::Options()));
+ ASSERT_TRUE(trie.Init());
+
+ // Inserts some keys.
+ uint32_t value = 1;
+ ASSERT_TRUE(trie.Insert("bar", &value));
+ ASSERT_TRUE(trie.Insert("bed", &value));
+
+  // "ba" and "bedroom" are not keys in the trie.
+ EXPECT_TRUE(trie.Delete("ba"));
+ EXPECT_TRUE(trie.Delete("bedroom"));
+
+ // The original keys are not affected.
+ EXPECT_TRUE(trie.Find("bar", &value));
+ EXPECT_TRUE(trie.Find("bed", &value));
+}
+
} // namespace
// The tests below are accessing private methods and fields of IcingDynamicTrie
diff --git a/icing/legacy/index/icing-mock-filesystem.h b/icing/legacy/index/icing-mock-filesystem.h
index 31e012a..5a064ea 100644
--- a/icing/legacy/index/icing-mock-filesystem.h
+++ b/icing/legacy/index/icing-mock-filesystem.h
@@ -31,65 +31,78 @@ namespace lib {
class IcingMockFilesystem : public IcingFilesystem {
public:
- MOCK_CONST_METHOD1(DeleteFile, bool(const char *file_name));
+ MOCK_METHOD(bool, DeleteFile, (const char *file_name), (const, override));
- MOCK_CONST_METHOD1(DeleteDirectory, bool(const char *dir_name));
+ MOCK_METHOD(bool, DeleteDirectory, (const char *dir_name), (const, override));
- MOCK_CONST_METHOD1(DeleteDirectoryRecursively, bool(const char *dir_name));
+ MOCK_METHOD(bool, DeleteDirectoryRecursively, (const char *dir_name),
+ (const, override));
- MOCK_CONST_METHOD1(FileExists, bool(const char *file_name));
+ MOCK_METHOD(bool, FileExists, (const char *file_name), (const, override));
- MOCK_CONST_METHOD1(DirectoryExists, bool(const char *dir_name));
+ MOCK_METHOD(bool, DirectoryExists, (const char *dir_name), (const, override));
- MOCK_CONST_METHOD1(GetBasenameIndex, int(const char *file_name));
+ MOCK_METHOD(int, GetBasenameIndex, (const char *file_name),
+ (const, override));
- MOCK_CONST_METHOD1(GetBasename, std::string(const char *file_name));
+ MOCK_METHOD(std::string, GetBasename, (const char *file_name),
+ (const, override));
- MOCK_CONST_METHOD1(GetDirname, std::string(const char *file_name));
+ MOCK_METHOD(std::string, GetDirname, (const char *file_name),
+ (const, override));
- MOCK_CONST_METHOD2(ListDirectory, bool(const char *dir_name,
- std::vector<std::string> *entries));
+ MOCK_METHOD(bool, ListDirectory,
+ (const char *dir_name, std::vector<std::string> *entries),
+ (const, override));
- MOCK_CONST_METHOD2(GetMatchingFiles,
- bool(const char *glob, std::vector<std::string> *matches));
+ MOCK_METHOD(bool, GetMatchingFiles,
+ (const char *glob, std::vector<std::string> *matches),
+ (const, override));
- MOCK_CONST_METHOD1(OpenForWrite, int(const char *file_name));
+ MOCK_METHOD(int, OpenForWrite, (const char *file_name), (const, override));
- MOCK_CONST_METHOD1(OpenForAppend, int(const char *file_name));
+ MOCK_METHOD(int, OpenForAppend, (const char *file_name), (const, override));
- MOCK_CONST_METHOD1(OpenForRead, int(const char *file_name));
+ MOCK_METHOD(int, OpenForRead, (const char *file_name), (const, override));
- MOCK_CONST_METHOD1(GetFileSize, uint64_t(int fd));
+ MOCK_METHOD(uint64_t, GetFileSize, (int fd), (const, override));
- MOCK_CONST_METHOD1(GetFileSize, uint64_t(const char *filename));
+ MOCK_METHOD(uint64_t, GetFileSize, (const char *filename), (const, override));
- MOCK_CONST_METHOD2(Truncate, bool(int fd, uint64_t new_size));
+ MOCK_METHOD(bool, Truncate, (int fd, uint64_t new_size), (const, override));
- MOCK_CONST_METHOD2(Truncate, bool(const char *filename, uint64_t new_size));
+ MOCK_METHOD(bool, Truncate, (const char *filename, uint64_t new_size),
+ (const, override));
- MOCK_CONST_METHOD2(Grow, bool(int fd, uint64_t new_size));
+ MOCK_METHOD(bool, Grow, (int fd, uint64_t new_size), (const, override));
- MOCK_CONST_METHOD3(Write, bool(int fd, const void *data, size_t data_size));
- MOCK_CONST_METHOD4(PWrite, bool(int fd, off_t offset, const void *data,
- size_t data_size));
+ MOCK_METHOD(bool, Write, (int fd, const void *data, size_t data_size),
+ (const, override));
+ MOCK_METHOD(bool, PWrite,
+ (int fd, off_t offset, const void *data, size_t data_size),
+ (const, override));
- MOCK_CONST_METHOD1(DataSync, bool(int fd));
+ MOCK_METHOD(bool, DataSync, (int fd), (const, override));
- MOCK_CONST_METHOD2(RenameFile,
- bool(const char *old_name, const char *new_name));
+ MOCK_METHOD(bool, RenameFile, (const char *old_name, const char *new_name),
+ (const, override));
- MOCK_CONST_METHOD2(SwapFiles, bool(const char *one, const char *two));
+ MOCK_METHOD(bool, SwapFiles, (const char *one, const char *two),
+ (const, override));
- MOCK_CONST_METHOD1(CreateDirectory, bool(const char *dir_name));
+ MOCK_METHOD(bool, CreateDirectory, (const char *dir_name), (const, override));
- MOCK_CONST_METHOD1(CreateDirectoryRecursively, bool(const char *dir_name));
+ MOCK_METHOD(bool, CreateDirectoryRecursively, (const char *dir_name),
+ (const, override));
- MOCK_CONST_METHOD2(CopyFile, bool(const char *src, const char *dst));
+ MOCK_METHOD(bool, CopyFile, (const char *src, const char *dst),
+ (const, override));
- MOCK_CONST_METHOD4(ComputeChecksum, bool(int fd, uint32_t *checksum,
- uint64_t offset, uint64_t length));
+ MOCK_METHOD(bool, ComputeChecksum,
+ (int fd, uint32_t *checksum, uint64_t offset, uint64_t length),
+ (const, override));
- MOCK_CONST_METHOD1(GetDiskUsage, uint64_t(const char *path));
+ MOCK_METHOD(uint64_t, GetDiskUsage, (const char *path), (const, override));
};
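The MOCK_METHOD spelling, with its explicit (const, override) spec, lets the compiler check each mock against the IcingFilesystem interface. A minimal usage sketch, assuming the standard gtest/gmock headers (the test name is hypothetical):

#include "gmock/gmock.h"
#include "gtest/gtest.h"

using ::testing::_;
using ::testing::Return;

TEST(IcingMockFilesystemSketch, StubbedDeleteFile) {
  icing::lib::IcingMockFilesystem mock_filesystem;
  // Pretend every deletion succeeds.
  EXPECT_CALL(mock_filesystem, DeleteFile(_)).WillRepeatedly(Return(true));
  EXPECT_TRUE(mock_filesystem.DeleteFile("/tmp/some-file"));
}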
} // namespace lib
diff --git a/icing/query/query-processor_benchmark.cc b/icing/query/query-processor_benchmark.cc
index 000bf3a..29404d9 100644
--- a/icing/query/query-processor_benchmark.cc
+++ b/icing/query/query-processor_benchmark.cc
@@ -30,6 +30,7 @@
#include "icing/tokenization/language-segmenter-factory.h"
#include "icing/transform/normalizer-factory.h"
#include "icing/util/logging.h"
+#include "unicode/uloc.h"
// Run on a Linux workstation:
// $ blaze build -c opt --dynamic_mode=off --copt=-gmlt
@@ -107,8 +108,9 @@ void BM_QueryOneTerm(benchmark::State& state) {
}
std::unique_ptr<Index> index = CreateIndex(icing_filesystem, index_dir);
+ language_segmenter_factory::SegmenterOptions options(ULOC_US);
std::unique_ptr<LanguageSegmenter> language_segmenter =
- language_segmenter_factory::Create().ValueOrDie();
+ language_segmenter_factory::Create(std::move(options)).ValueOrDie();
std::unique_ptr<Normalizer> normalizer = CreateNormalizer();
FakeClock fake_clock;
@@ -219,8 +221,9 @@ void BM_QueryFiveTerms(benchmark::State& state) {
}
std::unique_ptr<Index> index = CreateIndex(icing_filesystem, index_dir);
+ language_segmenter_factory::SegmenterOptions options(ULOC_US);
std::unique_ptr<LanguageSegmenter> language_segmenter =
- language_segmenter_factory::Create().ValueOrDie();
+ language_segmenter_factory::Create(std::move(options)).ValueOrDie();
std::unique_ptr<Normalizer> normalizer = CreateNormalizer();
FakeClock fake_clock;
@@ -349,8 +352,9 @@ void BM_QueryDiacriticTerm(benchmark::State& state) {
}
std::unique_ptr<Index> index = CreateIndex(icing_filesystem, index_dir);
+ language_segmenter_factory::SegmenterOptions options(ULOC_US);
std::unique_ptr<LanguageSegmenter> language_segmenter =
- language_segmenter_factory::Create().ValueOrDie();
+ language_segmenter_factory::Create(std::move(options)).ValueOrDie();
std::unique_ptr<Normalizer> normalizer = CreateNormalizer();
FakeClock fake_clock;
@@ -464,8 +468,9 @@ void BM_QueryHiragana(benchmark::State& state) {
}
std::unique_ptr<Index> index = CreateIndex(icing_filesystem, index_dir);
+ language_segmenter_factory::SegmenterOptions options(ULOC_US);
std::unique_ptr<LanguageSegmenter> language_segmenter =
- language_segmenter_factory::Create().ValueOrDie();
+ language_segmenter_factory::Create(std::move(options)).ValueOrDie();
std::unique_ptr<Normalizer> normalizer = CreateNormalizer();
FakeClock fake_clock;
diff --git a/icing/result/result-retriever_test.cc b/icing/result/result-retriever_test.cc
index 36dbfd9..0d2c2c5 100644
--- a/icing/result/result-retriever_test.cc
+++ b/icing/result/result-retriever_test.cc
@@ -36,6 +36,7 @@
#include "icing/tokenization/language-segmenter-factory.h"
#include "icing/transform/normalizer-factory.h"
#include "icing/transform/normalizer.h"
+#include "unicode/uloc.h"
namespace icing {
namespace lib {
@@ -59,8 +60,10 @@ class ResultRetrieverTest : public testing::Test {
// File generated via icu_data_file rule in //icing/BUILD.
icu_data_file_helper::SetUpICUDataFile(
GetTestFilePath("icing/icu.dat")));
- ICING_ASSERT_OK_AND_ASSIGN(language_segmenter_,
- language_segmenter_factory::Create());
+ language_segmenter_factory::SegmenterOptions options(ULOC_US);
+ ICING_ASSERT_OK_AND_ASSIGN(
+ language_segmenter_,
+ language_segmenter_factory::Create(std::move(options)));
ICING_ASSERT_OK_AND_ASSIGN(schema_store_,
SchemaStore::Create(&filesystem_, test_dir_));
diff --git a/icing/result/snippet-retriever_test.cc b/icing/result/snippet-retriever_test.cc
index 3b3bf61..676ea92 100644
--- a/icing/result/snippet-retriever_test.cc
+++ b/icing/result/snippet-retriever_test.cc
@@ -40,6 +40,7 @@
#include "icing/tokenization/language-segmenter.h"
#include "icing/transform/normalizer-factory.h"
#include "icing/transform/normalizer.h"
+#include "unicode/uloc.h"
namespace icing {
namespace lib {
@@ -60,8 +61,10 @@ class SnippetRetrieverTest : public testing::Test {
// File generated via icu_data_file rule in //icing/BUILD.
icu_data_file_helper::SetUpICUDataFile(
GetTestFilePath("icing/icu.dat")));
- ICING_ASSERT_OK_AND_ASSIGN(language_segmenter_,
- language_segmenter_factory::Create());
+ language_segmenter_factory::SegmenterOptions options(ULOC_US);
+ ICING_ASSERT_OK_AND_ASSIGN(
+ language_segmenter_,
+ language_segmenter_factory::Create(std::move(options)));
// Setup the schema
ICING_ASSERT_OK_AND_ASSIGN(schema_store_,
diff --git a/icing/store/document-filter-data.h b/icing/store/document-filter-data.h
index 198bc49..3970132 100644
--- a/icing/store/document-filter-data.h
+++ b/icing/store/document-filter-data.h
@@ -25,6 +25,7 @@ namespace icing {
namespace lib {
using SchemaTypeId = int16_t;
+inline constexpr SchemaTypeId kInvalidSchemaTypeId = -1;
class DocumentFilterData {
public:
diff --git a/icing/store/document-store.cc b/icing/store/document-store.cc
index 93cebaa..7577f6b 100644
--- a/icing/store/document-store.cc
+++ b/icing/store/document-store.cc
@@ -329,8 +329,21 @@ libtextclassifier3::Status DocumentStore::RegenerateDerivedFiles() {
auto iterator = document_log_->GetIterator();
auto iterator_status = iterator.Advance();
while (iterator_status.ok()) {
- ICING_ASSIGN_OR_RETURN(DocumentWrapper document_wrapper,
- document_log_->ReadProto(iterator.GetOffset()));
+ libtextclassifier3::StatusOr<DocumentWrapper> document_wrapper_or =
+ document_log_->ReadProto(iterator.GetOffset());
+
+ if (absl_ports::IsNotFound(document_wrapper_or.status())) {
+ // The erased document still occupies 1 document id.
+ DocumentId new_document_id = document_id_mapper_->num_elements();
+ ICING_RETURN_IF_ERROR(ClearDerivedData(new_document_id));
+ iterator_status = iterator.Advance();
+ continue;
+ } else if (!document_wrapper_or.ok()) {
+ return document_wrapper_or.status();
+ }
+
+ DocumentWrapper document_wrapper =
+ std::move(document_wrapper_or).ValueOrDie();
if (document_wrapper.deleted()) {
if (!document_wrapper.document().uri().empty()) {
// Individual document deletion.
@@ -351,17 +364,22 @@ libtextclassifier3::Status DocumentStore::RegenerateDerivedFiles() {
}
} else if (!document_wrapper.document().namespace_().empty()) {
// Namespace deletion.
- ICING_RETURN_IF_ERROR(UpdateDerivedFilesNamespaceDeleted(
- document_wrapper.document().namespace_()));
-
+ ICING_ASSIGN_OR_RETURN(
+ NamespaceId namespace_id,
+ namespace_mapper_->Get(document_wrapper.document().namespace_()));
+ // Tombstone indicates it's a soft delete.
+ ICING_RETURN_IF_ERROR(BatchDelete(namespace_id, kInvalidSchemaTypeId,
+ /*soft_delete=*/true));
} else if (!document_wrapper.document().schema().empty()) {
// SchemaType deletion.
auto schema_type_id_or = schema_store_->GetSchemaTypeId(
document_wrapper.document().schema());
if (schema_type_id_or.ok()) {
- ICING_RETURN_IF_ERROR(UpdateDerivedFilesSchemaTypeDeleted(
- schema_type_id_or.ValueOrDie()));
+ // Tombstone indicates it's a soft delete.
+ ICING_RETURN_IF_ERROR(BatchDelete(kInvalidNamespaceId,
+ schema_type_id_or.ValueOrDie(),
+ /*soft_delete=*/true));
} else {
// The deleted schema type doesn't have a SchemaTypeId we can refer
// to in the FilterCache.
@@ -845,7 +863,8 @@ bool DocumentStore::DoesDocumentExist(DocumentId document_id) const {
}
libtextclassifier3::Status DocumentStore::Delete(
- const std::string_view name_space, const std::string_view uri) {
+ const std::string_view name_space, const std::string_view uri,
+ bool soft_delete) {
// Try to get the DocumentId first
auto document_id_or = GetDocumentId(name_space, uri);
if (!document_id_or.ok()) {
@@ -865,25 +884,61 @@ libtextclassifier3::Status DocumentStore::Delete(
", uri: ", uri));
}
+ if (soft_delete) {
+ return SoftDelete(name_space, uri, document_id);
+ } else {
+ uint64_t document_log_offset = file_offset_or.ValueOrDie();
+ return HardDelete(document_id, document_log_offset);
+ }
+}
+
+libtextclassifier3::Status DocumentStore::Delete(DocumentId document_id,
+ bool soft_delete) {
+ // Copy out the document to get namespace and uri.
+ ICING_ASSIGN_OR_RETURN(int64_t document_log_offset,
+ DoesDocumentExistAndGetFileOffset(document_id));
+
+ if (soft_delete) {
+ auto document_wrapper_or = document_log_->ReadProto(document_log_offset);
+ if (!document_wrapper_or.ok()) {
+ ICING_LOG(ERROR) << document_wrapper_or.status().error_message()
+ << "Failed to read from document log";
+ return document_wrapper_or.status();
+ }
+ DocumentWrapper document_wrapper =
+ std::move(document_wrapper_or).ValueOrDie();
+
+ return SoftDelete(document_wrapper.document().namespace_(),
+ document_wrapper.document().uri(), document_id);
+ } else {
+ return HardDelete(document_id, document_log_offset);
+ }
+}
+
+libtextclassifier3::Status DocumentStore::SoftDelete(
+ std::string_view name_space, std::string_view uri, DocumentId document_id) {
// Update ground truth first.
- // To delete a proto we don't directly remove it. Instead, we mark it as
- // deleted first by appending a tombstone of it and actually remove it from
- // file later in Optimize()
- // TODO(b/144458732): Implement a more robust version of ICING_RETURN_IF_ERROR
- // that can support error logging.
+  // Mark the document as deleted by appending a tombstone for it; the proto
+  // is actually removed from the file later in Optimize().
+ // TODO(b/144458732): Implement a more robust version of
+ // ICING_RETURN_IF_ERROR that can support error logging.
libtextclassifier3::Status status =
document_log_->WriteProto(CreateDocumentTombstone(name_space, uri))
.status();
if (!status.ok()) {
return absl_ports::Annotate(
- status, absl_ports::StrCat("Failed to delete Document. namespace: ",
+ status, absl_ports::StrCat("Failed to delete Document. namespace:",
name_space, ", uri: ", uri));
}
- ICING_RETURN_IF_ERROR(
- document_id_mapper_->Set(document_id_or.ValueOrDie(), kDocDeletedFlag));
+ return document_id_mapper_->Set(document_id, kDocDeletedFlag);
+}
- return libtextclassifier3::Status::OK;
+libtextclassifier3::Status DocumentStore::HardDelete(
+ DocumentId document_id, uint64_t document_log_offset) {
+ // Erases document proto.
+ ICING_RETURN_IF_ERROR(document_log_->EraseProto(document_log_offset));
+ return ClearDerivedData(document_id);
}
libtextclassifier3::StatusOr<NamespaceId> DocumentStore::GetNamespaceId(
@@ -899,7 +954,14 @@ DocumentStore::GetDocumentAssociatedScoreData(DocumentId document_id) const {
<< " from score_cache_";
return score_data_or.status();
}
- return *std::move(score_data_or).ValueOrDie();
+
+ DocumentAssociatedScoreData document_associated_score_data =
+ *std::move(score_data_or).ValueOrDie();
+ if (document_associated_score_data.document_score() < 0) {
+    // A negative / invalid score means that the score data has been deleted.
+ return absl_ports::NotFoundError("Document score data not found.");
+ }
+ return document_associated_score_data;
}
libtextclassifier3::StatusOr<DocumentFilterData>
@@ -910,135 +972,157 @@ DocumentStore::GetDocumentFilterData(DocumentId document_id) const {
<< " from filter_cache_";
return filter_data_or.status();
}
- return *std::move(filter_data_or).ValueOrDie();
+ DocumentFilterData document_filter_data =
+ *std::move(filter_data_or).ValueOrDie();
+ if (document_filter_data.namespace_id() == kInvalidNamespaceId) {
+ // An invalid namespace id means that the filter data has been deleted.
+ return absl_ports::NotFoundError("Document filter data not found.");
+ }
+ return document_filter_data;
}
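Because ClearDerivedData() (added near the end of this file) writes a negative score and kInvalidNamespaceId / kInvalidSchemaTypeId as sentinels, a hard-deleted document now surfaces as NOT_FOUND from both getters. A sketch of the expected behavior, assuming the DocumentStoreTest fixture used later in this change:

ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id,
                           doc_store->Put(DocumentProto(test_document1_)));
ICING_ASSERT_OK(doc_store->Delete(document_id, /*soft_delete=*/false));
EXPECT_THAT(doc_store->GetDocumentAssociatedScoreData(document_id),
            StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
EXPECT_THAT(doc_store->GetDocumentFilterData(document_id),
            StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));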
libtextclassifier3::Status DocumentStore::DeleteByNamespace(
- std::string_view name_space) {
+ std::string_view name_space, bool soft_delete) {
auto namespace_id_or = namespace_mapper_->Get(name_space);
if (!namespace_id_or.ok()) {
return absl_ports::Annotate(
namespace_id_or.status(),
- absl_ports::StrCat("Failed to delete by namespace. namespace: ",
- name_space));
+ absl_ports::StrCat("Failed to find namespace: ", name_space));
}
+ NamespaceId namespace_id = namespace_id_or.ValueOrDie();
- // Update ground truth first.
- // To delete an entire namespace, we append a tombstone that only contains
- // the deleted bit and the name of the deleted namespace.
- // TODO(b/144458732): Implement a more robust version of
- // ICING_RETURN_IF_ERROR that can support error logging.
- libtextclassifier3::Status status =
- document_log_->WriteProto(CreateNamespaceTombstone(name_space)).status();
- if (!status.ok()) {
- ICING_LOG(ERROR) << status.error_message()
- << "Failed to delete namespace. namespace = "
- << name_space;
- return status;
+ int num_updated_documents = 0;
+ if (soft_delete) {
+ // To delete an entire namespace, we append a tombstone that only contains
+ // the deleted bit and the name of the deleted namespace.
+ // TODO(b/144458732): Implement a more robust version of
+ // ICING_RETURN_IF_ERROR that can support error logging.
+ libtextclassifier3::Status status =
+ document_log_->WriteProto(CreateNamespaceTombstone(name_space))
+ .status();
+ if (!status.ok()) {
+ ICING_LOG(ERROR) << status.error_message()
+ << "Failed to delete namespace. namespace = "
+ << name_space;
+ return status;
+ }
}
- ICING_ASSIGN_OR_RETURN(bool updated_existing_document,
- UpdateDerivedFilesNamespaceDeleted(name_space));
- if (!updated_existing_document) {
+ ICING_ASSIGN_OR_RETURN(
+ num_updated_documents,
+ BatchDelete(namespace_id, kInvalidSchemaTypeId, soft_delete));
+
+ if (num_updated_documents <= 0) {
    // Treat the fact that no existing documents had this namespace as
    // equivalent to this namespace not existing at all.
return absl_ports::NotFoundError(
absl_ports::StrCat("Namespace '", name_space, "' doesn't exist"));
}
- return libtextclassifier3::Status::OK;
-}
-libtextclassifier3::StatusOr<bool>
-DocumentStore::UpdateDerivedFilesNamespaceDeleted(std::string_view name_space) {
- auto namespace_id_or = namespace_mapper_->Get(name_space);
- if (!namespace_id_or.ok()) {
- return namespace_id_or.status();
- }
-
- // Guaranteed to have a NamespaceId now.
- NamespaceId namespace_id = namespace_id_or.ValueOrDie();
-
- // Tracks if there were any existing documents with this namespace that we
- // will mark as deleted.
- bool updated_existing_document = false;
-
- // Traverse FilterCache and delete all docs that match namespace_id
- for (DocumentId document_id = 0; document_id < filter_cache_->num_elements();
- ++document_id) {
- // filter_cache_->Get can only fail if document_id is < 0
- // or >= filter_cache_->num_elements. So, this error SHOULD NEVER HAPPEN.
- ICING_ASSIGN_OR_RETURN(const DocumentFilterData* data,
- filter_cache_->Get(document_id));
- if (data->namespace_id() == namespace_id) {
- if (DoesDocumentExist(document_id)) {
- updated_existing_document = true;
- }
-
- // docid_mapper_->Set can only fail if document_id is < 0
- // or >= docid_mapper_->num_elements. So the only possible way to get an
- // error here would be if filter_cache_->num_elements >
- // docid_mapper_->num_elements, which SHOULD NEVER HAPPEN.
- ICING_RETURN_IF_ERROR(
- document_id_mapper_->Set(document_id, kDocDeletedFlag));
- }
- }
-
- return updated_existing_document;
+ return libtextclassifier3::Status::OK;
}
libtextclassifier3::Status DocumentStore::DeleteBySchemaType(
- std::string_view schema_type) {
+ std::string_view schema_type, bool soft_delete) {
auto schema_type_id_or = schema_store_->GetSchemaTypeId(schema_type);
if (!schema_type_id_or.ok()) {
return absl_ports::Annotate(
schema_type_id_or.status(),
- absl_ports::StrCat("Failed to delete by schema type. schema_type: ",
+ absl_ports::StrCat("Failed to find schema type. schema_type: ",
schema_type));
}
+ SchemaTypeId schema_type_id = schema_type_id_or.ValueOrDie();
- // Update ground truth first.
- // To delete an entire schema type, we append a tombstone that only contains
- // the deleted bit and the name of the deleted schema type.
- // TODO(b/144458732): Implement a more robust version of
- // ICING_RETURN_IF_ERROR that can support error logging.
- libtextclassifier3::Status status =
- document_log_->WriteProto(CreateSchemaTypeTombstone(schema_type))
- .status();
- if (!status.ok()) {
- ICING_LOG(ERROR) << status.error_message()
- << "Failed to delete schema_type. schema_type = "
- << schema_type;
- return status;
+ int num_updated_documents = 0;
+ if (soft_delete) {
+ // To soft-delete an entire schema type, we append a tombstone that only
+ // contains the deleted bit and the name of the deleted schema type.
+ // TODO(b/144458732): Implement a more robust version of
+ // ICING_RETURN_IF_ERROR that can support error logging.
+ libtextclassifier3::Status status =
+ document_log_->WriteProto(CreateSchemaTypeTombstone(schema_type))
+ .status();
+ if (!status.ok()) {
+ ICING_LOG(ERROR) << status.error_message()
+ << "Failed to delete schema_type. schema_type = "
+ << schema_type;
+ return status;
+ }
}
- // Guaranteed to have a SchemaTypeId now
- SchemaTypeId schema_type_id = schema_type_id_or.ValueOrDie();
+ ICING_ASSIGN_OR_RETURN(
+ num_updated_documents,
+ BatchDelete(kInvalidNamespaceId, schema_type_id, soft_delete));
- ICING_RETURN_IF_ERROR(UpdateDerivedFilesSchemaTypeDeleted(schema_type_id));
+ if (num_updated_documents <= 0) {
+ return absl_ports::NotFoundError(absl_ports::StrCat(
+ "No documents found with schema type '", schema_type, "'"));
+ }
return libtextclassifier3::Status::OK;
}
-libtextclassifier3::Status DocumentStore::UpdateDerivedFilesSchemaTypeDeleted(
- SchemaTypeId schema_type_id) {
- // Traverse FilterCache and delete all docs that match schema_type_id.
+libtextclassifier3::StatusOr<int> DocumentStore::BatchDelete(
+ NamespaceId namespace_id, SchemaTypeId schema_type_id, bool soft_delete) {
+  // Tracks the number of documents that matched the given namespace_id /
+  // schema_type_id and were deleted or marked as deleted.
+ int num_updated_documents = 0;
+
+ // Traverse FilterCache and delete all docs that match namespace_id and
+ // schema_type_id.
for (DocumentId document_id = 0; document_id < filter_cache_->num_elements();
++document_id) {
// filter_cache_->Get can only fail if document_id is < 0
// or >= filter_cache_->num_elements. So, this error SHOULD NEVER HAPPEN.
ICING_ASSIGN_OR_RETURN(const DocumentFilterData* data,
filter_cache_->Get(document_id));
- if (data->schema_type_id() == schema_type_id) {
+
+ // Check namespace only when the input namespace id is valid.
+ if (namespace_id != kInvalidNamespaceId &&
+ (data->namespace_id() == kInvalidNamespaceId ||
+ data->namespace_id() != namespace_id)) {
+ // The document has already been hard-deleted or isn't from the desired
+ // namespace.
+ continue;
+ }
+
+ // Check schema type only when the input schema type id is valid.
+ if (schema_type_id != kInvalidSchemaTypeId &&
+ (data->schema_type_id() == kInvalidSchemaTypeId ||
+ data->schema_type_id() != schema_type_id)) {
+ // The document has already been hard-deleted or doesn't have the
+ // desired schema type.
+ continue;
+ }
+
+    // The document has the desired namespace and schema type; it either
+    // exists or has been soft-deleted / expired.
+ if (soft_delete) {
+ if (DoesDocumentExist(document_id)) {
+ ++num_updated_documents;
+ }
+
// docid_mapper_->Set can only fail if document_id is < 0
// or >= docid_mapper_->num_elements. So the only possible way to get an
// error here would be if filter_cache_->num_elements >
// docid_mapper_->num_elements, which SHOULD NEVER HAPPEN.
ICING_RETURN_IF_ERROR(
document_id_mapper_->Set(document_id, kDocDeletedFlag));
+ } else {
+ // Hard delete.
+ libtextclassifier3::Status delete_status =
+ Delete(document_id, /*soft_delete=*/false);
+ if (absl_ports::IsNotFound(delete_status)) {
+ continue;
+ } else if (!delete_status.ok()) {
+ // Real error, pass up.
+ return delete_status;
+ }
+ ++num_updated_documents;
}
}
- return libtextclassifier3::Status::OK;
+ return num_updated_documents;
}
libtextclassifier3::Status DocumentStore::PersistToDisk() {
@@ -1328,5 +1412,26 @@ libtextclassifier3::Status DocumentStore::UpdateFilterCache(
return filter_cache_->Set(document_id, filter_data);
}
+libtextclassifier3::Status DocumentStore::ClearDerivedData(
+ DocumentId document_id) {
+ // We intentionally leave the data in key_mapper_ because locating that data
+ // requires fetching namespace and uri. Leaving data in key_mapper_ should be
+ // fine because the data is hashed.
+
+ ICING_RETURN_IF_ERROR(document_id_mapper_->Set(document_id, kDocDeletedFlag));
+
+ // Resets the score cache entry
+ ICING_RETURN_IF_ERROR(UpdateDocumentAssociatedScoreCache(
+ document_id, DocumentAssociatedScoreData(/*document_score=*/-1,
+ /*creation_timestamp_ms=*/-1)));
+
+ // Resets the filter cache entry
+ ICING_RETURN_IF_ERROR(UpdateFilterCache(
+ document_id, DocumentFilterData(kInvalidNamespaceId, kInvalidSchemaTypeId,
+ /*expiration_timestamp_ms=*/-1)));
+
+ return libtextclassifier3::Status::OK;
+}
+
} // namespace lib
} // namespace icing
diff --git a/icing/store/document-store.h b/icing/store/document-store.h
index 3f4b72f..2ac1c71 100644
--- a/icing/store/document-store.h
+++ b/icing/store/document-store.h
@@ -147,17 +147,41 @@ class DocumentStore {
// boolean whether a document exists or not
bool DoesDocumentExist(DocumentId document_id) const;
- // Deletes the document identified by the given namespace and uri
+ // Deletes the document identified by the given namespace and uri. The
+  // document proto will be marked as deleted if 'soft_delete' is true;
+  // otherwise the document proto will be erased immediately.
//
- // NOTE: Space is not reclaimed for deleted documents until Optimize() is
- // called.
+ // NOTE:
+  //   1. Soft deletion uses less CPU power, so it can be applied to
+  //      non-sensitive data.
+ // 2. Space is not reclaimed for deleted documents until Optimize() is
+ // called.
//
// Returns:
// OK on success
// NOT_FOUND if no document exists with namespace, uri
// INTERNAL_ERROR on IO error
libtextclassifier3::Status Delete(std::string_view name_space,
- std::string_view uri);
+ std::string_view uri,
+ bool soft_delete = false);
+
+ // Deletes the document identified by the given document_id. The
+  // document proto will be marked as deleted if 'soft_delete' is true;
+  // otherwise the document proto will be erased immediately.
+ //
+ // NOTE:
+  //   1. For soft deletes, prefer the other overload Delete(name_space, uri,
+  //      soft_delete); soft deletion needs the namespace and uri, so this
+  //      overload has to read the document proto back first.
+ // 2. Space is not reclaimed for deleted documents until Optimize() is
+ // called.
+ //
+ // Returns:
+ // OK on success
+ // INTERNAL_ERROR on IO error
+ // INVALID_ARGUMENT if document_id is invalid.
+ libtextclassifier3::Status Delete(DocumentId document_id,
+ bool soft_delete = false);
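A hypothetical call-site sketch contrasting the two overloads (document_store and document_id are assumed to exist):

// Soft delete: cheap; appends a tombstone, the proto is erased at Optimize().
ICING_RETURN_IF_ERROR(
    document_store->Delete("namespace", "uri", /*soft_delete=*/true));

// Hard delete: erases the proto and clears derived data immediately.
ICING_RETURN_IF_ERROR(
    document_store->Delete(document_id, /*soft_delete=*/false));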
// Returns the NamespaceId of the string namespace
//
@@ -180,6 +204,7 @@ class DocumentStore {
// DocumentAssociatedScoreData on success
// OUT_OF_RANGE if document_id is negative or exceeds previously seen
// DocumentIds
+ // NOT_FOUND if no score data is found
libtextclassifier3::StatusOr<DocumentAssociatedScoreData>
GetDocumentAssociatedScoreData(DocumentId document_id) const;
@@ -194,30 +219,43 @@ class DocumentStore {
// DocumentFilterData on success
// OUT_OF_RANGE if document_id is negative or exceeds previously seen
// DocumentIds
+ // NOT_FOUND if no filter data is found
libtextclassifier3::StatusOr<DocumentFilterData> GetDocumentFilterData(
DocumentId document_id) const;
- // Deletes all documents belonging to the given namespace.
+ // Deletes all documents belonging to the given namespace. The documents will
+  // be marked as deleted if 'soft_delete' is true; otherwise they will be
+  // erased immediately.
//
- // NOTE: Space is not reclaimed for deleted documents until Optimize() is
- // called.
+ // NOTE:
+  //   1. Soft deletion uses less CPU power, so it can be applied to
+  //      non-sensitive data.
+ // 2. Space is not reclaimed for deleted documents until Optimize() is
+ // called.
//
// Returns:
// OK on success
// NOT_FOUND if namespace doesn't exist
// INTERNAL_ERROR on IO error
- libtextclassifier3::Status DeleteByNamespace(std::string_view name_space);
+ libtextclassifier3::Status DeleteByNamespace(std::string_view name_space,
+ bool soft_delete = false);
- // Deletes all documents belonging to the given schema type
+ // Deletes all documents belonging to the given schema type. The documents
+  // will be marked as deleted if 'soft_delete' is true; otherwise they will
+  // be erased immediately.
//
- // NOTE: Space is not reclaimed for deleted documents until Optimize() is
- // called.
+ // NOTE:
+  //   1. Soft deletion uses less CPU power, so it can be applied to
+  //      non-sensitive data.
+ // 2. Space is not reclaimed for deleted documents until Optimize() is
+ // called.
//
// Returns:
// OK on success
// NOT_FOUND if schema_type doesn't exist
// INTERNAL_ERROR on IO error
- libtextclassifier3::Status DeleteBySchemaType(std::string_view schema_type);
+ libtextclassifier3::Status DeleteBySchemaType(std::string_view schema_type,
+ bool soft_delete = false);
// Syncs all the data and metadata changes to disk.
//
@@ -424,32 +462,42 @@ class DocumentStore {
// INTERNAL on I/O error
libtextclassifier3::Status UpdateHeader(const Crc32& checksum);
- // Update derived files that `name_space` has been deleted. This is primarily
- // useful if we're trying to update derived files when we've already seen a
- // namespace tombstone, and don't need to write another tombstone.
+ // Helper function to do batch deletes. Documents with the given
+ // "namespace_id" and "schema_type_id" will be deleted. If callers don't need
+ // to specify the namespace or schema type, pass in kInvalidNamespaceId or
+ // kInvalidSchemaTypeId. The document protos will be marked as deleted if
+ // 'soft_delete' is true, otherwise the document protos with their derived
+  // 'soft_delete' is true; otherwise the document protos with their derived
//
// NOTE: Space is not reclaimed in the derived files until Optimize() is
// called.
//
// Returns:
- // bool on whether an existing document was actually updated to be deleted
+  //   Number of documents that were erased or marked as deleted
// INTERNAL_ERROR on IO error
- libtextclassifier3::StatusOr<bool> UpdateDerivedFilesNamespaceDeleted(
- std::string_view name_space);
+ libtextclassifier3::StatusOr<int> BatchDelete(NamespaceId namespace_id,
+ SchemaTypeId schema_type_id,
+ bool soft_delete);
- // Update derived files that the schema type schema_type_id has been deleted.
- // This is primarily useful if we're trying to update derived files when we've
- // already seen a schema type tombstone, and don't need to write another
- // tombstone.
+ // Marks the document identified by the given name_space, uri and document_id
+ // as deleted, to be removed later during Optimize().
//
- // NOTE: Space is not reclaimed in the derived files until Optimize() is
- // called.
+ // Returns:
+ // OK on success
+ // INTERNAL_ERROR on IO error
+ libtextclassifier3::Status SoftDelete(std::string_view name_space,
+ std::string_view uri,
+ DocumentId document_id);
+
+ // Erases the document at the given document_log_offset from the document_log
+ // and clears the derived data identified by the given document_id. The space
+ // will be reclaimed later during Optimize().
//
// Returns:
// OK on success
// INTERNAL_ERROR on IO error
- libtextclassifier3::Status UpdateDerivedFilesSchemaTypeDeleted(
- SchemaTypeId schema_type_id);
+ libtextclassifier3::Status HardDelete(DocumentId document_id,
+ uint64_t document_log_offset);
// Helper method to find a DocumentId that is associated with the given
// namespace and uri.
@@ -488,6 +536,9 @@ class DocumentStore {
// Updates the entry in the filter cache for document_id.
libtextclassifier3::Status UpdateFilterCache(
DocumentId document_id, const DocumentFilterData& filter_data);
+
+ // Helper method to clear the derived data of a document
+ libtextclassifier3::Status ClearDerivedData(DocumentId document_id);
};
} // namespace lib
diff --git a/icing/store/document-store_test.cc b/icing/store/document-store_test.cc
index ad56b9a..f857481 100644
--- a/icing/store/document-store_test.cc
+++ b/icing/store/document-store_test.cc
@@ -60,9 +60,6 @@ class DocumentStoreTest : public ::testing::Test {
: test_dir_(GetTestTempDir() + "/icing"),
document_store_dir_(test_dir_ + "/document_store"),
schema_store_dir_(test_dir_ + "/schema_store") {
- filesystem_.CreateDirectoryRecursively(test_dir_.c_str());
- filesystem_.CreateDirectoryRecursively(document_store_dir_.c_str());
- filesystem_.CreateDirectoryRecursively(schema_store_dir_.c_str());
test_document1_ =
DocumentBuilder()
.SetKey("icing", "email/1")
@@ -88,6 +85,11 @@ class DocumentStoreTest : public ::testing::Test {
}
void SetUp() override {
+ filesystem_.DeleteDirectoryRecursively(test_dir_.c_str());
+ filesystem_.CreateDirectoryRecursively(test_dir_.c_str());
+ filesystem_.CreateDirectoryRecursively(document_store_dir_.c_str());
+ filesystem_.CreateDirectoryRecursively(schema_store_dir_.c_str());
+
SchemaProto schema;
auto type_config = schema.add_types();
type_config->set_schema_type("email");
@@ -270,7 +272,7 @@ TEST_F(DocumentStoreTest, IsDocumentExisting) {
IsFalse());
}
-TEST_F(DocumentStoreTest, GetDeletedDocumentNotFound) {
+TEST_F(DocumentStoreTest, GetSoftDeletedDocumentNotFound) {
ICING_ASSERT_OK_AND_ASSIGN(
std::unique_ptr<DocumentStore> document_store,
DocumentStore::Create(&filesystem_, document_store_dir_, &fake_clock_,
@@ -281,7 +283,26 @@ TEST_F(DocumentStoreTest, GetDeletedDocumentNotFound) {
IsOkAndHolds(EqualsProto(test_document1_)));
ICING_EXPECT_OK(document_store->Delete(test_document1_.namespace_(),
- test_document1_.uri()));
+ test_document1_.uri(),
+ /*soft_delete=*/true));
+ EXPECT_THAT(
+ document_store->Get(test_document1_.namespace_(), test_document1_.uri()),
+ StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
+}
+
+TEST_F(DocumentStoreTest, GetHardDeletedDocumentNotFound) {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<DocumentStore> document_store,
+ DocumentStore::Create(&filesystem_, document_store_dir_, &fake_clock_,
+ schema_store_.get()));
+ ICING_EXPECT_OK(document_store->Put(DocumentProto(test_document1_)));
+ EXPECT_THAT(
+ document_store->Get(test_document1_.namespace_(), test_document1_.uri()),
+ IsOkAndHolds(EqualsProto(test_document1_)));
+
+ ICING_EXPECT_OK(document_store->Delete(test_document1_.namespace_(),
+ test_document1_.uri(),
+ /*soft_delete=*/false));
EXPECT_THAT(
document_store->Get(test_document1_.namespace_(), test_document1_.uri()),
StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
@@ -343,20 +364,6 @@ TEST_F(DocumentStoreTest, GetInvalidDocumentId) {
StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
}
-TEST_F(DocumentStoreTest, DeleteOk) {
- ICING_ASSERT_OK_AND_ASSIGN(
- std::unique_ptr<DocumentStore> doc_store,
- DocumentStore::Create(&filesystem_, document_store_dir_, &fake_clock_,
- schema_store_.get()));
-
- // Get() after Delete() returns NOT_FOUND
- ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id,
- doc_store->Put(DocumentProto(test_document1_)));
- EXPECT_THAT(doc_store->Delete("icing", "email/1"), IsOk());
- EXPECT_THAT(doc_store->Get(document_id),
- StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
-}
-
TEST_F(DocumentStoreTest, DeleteNonexistentDocumentNotFound) {
ICING_ASSERT_OK_AND_ASSIGN(
std::unique_ptr<DocumentStore> document_store,
@@ -394,7 +401,7 @@ TEST_F(DocumentStoreTest, DeleteAlreadyDeletedDocumentNotFound) {
StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
}
-TEST_F(DocumentStoreTest, DeleteByNamespaceOk) {
+TEST_F(DocumentStoreTest, SoftDeleteByNamespaceOk) {
ICING_ASSERT_OK_AND_ASSIGN(
std::unique_ptr<DocumentStore> doc_store,
DocumentStore::Create(&filesystem_, document_store_dir_, &fake_clock_,
@@ -422,7 +429,8 @@ TEST_F(DocumentStoreTest, DeleteByNamespaceOk) {
// DELETE namespace.1. document1 and document 4 should be deleted. document2
// and document3 should still be retrievable.
- ICING_EXPECT_OK(doc_store->DeleteByNamespace("namespace.1"));
+ ICING_EXPECT_OK(
+ doc_store->DeleteByNamespace("namespace.1", /*soft_delete=*/true));
EXPECT_THAT(doc_store->Get(document1.namespace_(), document1.uri()),
StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
EXPECT_THAT(doc_store->Get(document2.namespace_(), document2.uri()),
@@ -433,7 +441,67 @@ TEST_F(DocumentStoreTest, DeleteByNamespaceOk) {
StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
}
-TEST_F(DocumentStoreTest, DeleteByNamespaceNonexistentNamespaceNotFound) {
+TEST_F(DocumentStoreTest, HardDeleteByNamespaceOk) {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<DocumentStore> doc_store,
+ DocumentStore::Create(&filesystem_, document_store_dir_, &fake_clock_,
+ schema_store_.get()));
+
+ DocumentProto document1 = test_document1_;
+ document1.set_namespace_("namespace.1");
+ document1.set_uri("uri1");
+ ICING_ASSERT_OK(doc_store->Put(document1));
+
+ DocumentProto document2 = test_document1_;
+ document2.set_namespace_("namespace.2");
+ document2.set_uri("uri1");
+ ICING_ASSERT_OK(doc_store->Put(document2));
+
+ DocumentProto document3 = test_document1_;
+ document3.set_namespace_("namespace.3");
+ document3.set_uri("uri1");
+ ICING_ASSERT_OK(doc_store->Put(document3));
+
+ DocumentProto document4 = test_document1_;
+ document4.set_namespace_("namespace.1");
+ document4.set_uri("uri2");
+ ICING_ASSERT_OK(doc_store->Put(document4));
+
+ // DELETE namespace.1. document1 and document 4 should be deleted. document2
+ // and document3 should still be retrievable.
+ ICING_EXPECT_OK(
+ doc_store->DeleteByNamespace("namespace.1", /*soft_delete=*/false));
+ EXPECT_THAT(doc_store->Get(document1.namespace_(), document1.uri()),
+ StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
+ EXPECT_THAT(doc_store->Get(document2.namespace_(), document2.uri()),
+ IsOkAndHolds(EqualsProto(document2)));
+ EXPECT_THAT(doc_store->Get(document3.namespace_(), document3.uri()),
+ IsOkAndHolds(EqualsProto(document3)));
+ EXPECT_THAT(doc_store->Get(document4.namespace_(), document4.uri()),
+ StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
+}
+
+TEST_F(DocumentStoreTest, SoftDeleteByNamespaceNonexistentNamespaceNotFound) {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<DocumentStore> doc_store,
+ DocumentStore::Create(&filesystem_, document_store_dir_, &fake_clock_,
+ schema_store_.get()));
+
+ // Validates that deleting something non-existing won't append anything to
+ // ground truth
+ int64_t ground_truth_size_before = filesystem_.GetFileSize(
+ absl_ports::StrCat(document_store_dir_, "/document_log").c_str());
+
+ EXPECT_THAT(doc_store->DeleteByNamespace("nonexistent_namespace",
+ /*soft_delete=*/true),
+ StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
+
+ int64_t ground_truth_size_after = filesystem_.GetFileSize(
+ absl_ports::StrCat(document_store_dir_, "/document_log").c_str());
+ EXPECT_THAT(ground_truth_size_before, Eq(ground_truth_size_after));
+}
+
+TEST_F(DocumentStoreTest, HardDeleteByNamespaceNonexistentNamespaceNotFound) {
ICING_ASSERT_OK_AND_ASSIGN(
std::unique_ptr<DocumentStore> doc_store,
DocumentStore::Create(&filesystem_, document_store_dir_, &fake_clock_,
@@ -444,7 +512,8 @@ TEST_F(DocumentStoreTest, DeleteByNamespaceNonexistentNamespaceNotFound) {
int64_t ground_truth_size_before = filesystem_.GetFileSize(
absl_ports::StrCat(document_store_dir_, "/document_log").c_str());
- EXPECT_THAT(doc_store->DeleteByNamespace("nonexistent_namespace"),
+ EXPECT_THAT(doc_store->DeleteByNamespace("nonexistent_namespace",
+ /*soft_delete=*/false),
StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
int64_t ground_truth_size_after = filesystem_.GetFileSize(
@@ -452,7 +521,7 @@ TEST_F(DocumentStoreTest, DeleteByNamespaceNonexistentNamespaceNotFound) {
EXPECT_THAT(ground_truth_size_before, Eq(ground_truth_size_after));
}
-TEST_F(DocumentStoreTest, DeleteByNamespaceNoExistingDocumentsNotFound) {
+TEST_F(DocumentStoreTest, SoftDeleteByNamespaceNoExistingDocumentsNotFound) {
ICING_ASSERT_OK_AND_ASSIGN(
std::unique_ptr<DocumentStore> document_store,
DocumentStore::Create(&filesystem_, document_store_dir_, &fake_clock_,
@@ -464,7 +533,25 @@ TEST_F(DocumentStoreTest, DeleteByNamespaceNoExistingDocumentsNotFound) {
// At this point, there are no existing documents with the namespace, even
// though Icing's derived files know about this namespace. We should still
// return NOT_FOUND since nothing existing has this namespace.
- EXPECT_THAT(document_store->DeleteByNamespace(test_document1_.namespace_()),
+ EXPECT_THAT(document_store->DeleteByNamespace(test_document1_.namespace_(),
+ /*soft_delete=*/true),
+ StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
+}
+
+TEST_F(DocumentStoreTest, HardDeleteByNamespaceNoExistingDocumentsNotFound) {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<DocumentStore> document_store,
+ DocumentStore::Create(&filesystem_, document_store_dir_, &fake_clock_,
+ schema_store_.get()));
+ ICING_EXPECT_OK(document_store->Put(test_document1_));
+ ICING_EXPECT_OK(document_store->Delete(test_document1_.namespace_(),
+ test_document1_.uri()));
+
+ // At this point, there are no existing documents with the namespace, even
+ // though Icing's derived files know about this namespace. We should still
+ // return NOT_FOUND since nothing existing has this namespace.
+ EXPECT_THAT(document_store->DeleteByNamespace(test_document1_.namespace_(),
+ /*soft_delete=*/false),
StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
}
@@ -536,7 +623,7 @@ TEST_F(DocumentStoreTest, DeleteByNamespaceRecoversOk) {
StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
}
-TEST_F(DocumentStoreTest, DeleteBySchemaTypeOk) {
+TEST_F(DocumentStoreTest, SoftDeleteBySchemaTypeOk) {
SchemaProto schema;
auto type_config = schema.add_types();
type_config->set_schema_type("email");
@@ -593,7 +680,8 @@ TEST_F(DocumentStoreTest, DeleteBySchemaTypeOk) {
// Delete the "email" type and ensure that it works across both
// email_document's namespaces. And that other documents aren't affected.
- ICING_EXPECT_OK(document_store->DeleteBySchemaType("email"));
+ ICING_EXPECT_OK(
+ document_store->DeleteBySchemaType("email", /*soft_delete=*/true));
EXPECT_THAT(document_store->Get(email_1_document_id),
StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
EXPECT_THAT(document_store->Get(email_2_document_id),
@@ -604,7 +692,8 @@ TEST_F(DocumentStoreTest, DeleteBySchemaTypeOk) {
IsOkAndHolds(EqualsProto(person_document)));
// Delete the "message" type and check that other documents aren't affected
- ICING_EXPECT_OK(document_store->DeleteBySchemaType("message"));
+ ICING_EXPECT_OK(
+ document_store->DeleteBySchemaType("message", /*soft_delete=*/true));
EXPECT_THAT(document_store->Get(email_1_document_id),
StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
EXPECT_THAT(document_store->Get(email_2_document_id),
@@ -615,7 +704,109 @@ TEST_F(DocumentStoreTest, DeleteBySchemaTypeOk) {
IsOkAndHolds(EqualsProto(person_document)));
}
-TEST_F(DocumentStoreTest, DeleteBySchemaTypeNonexistentSchemaTypeNotFound) {
+TEST_F(DocumentStoreTest, HardDeleteBySchemaTypeOk) {
+ SchemaProto schema;
+ auto type_config = schema.add_types();
+ type_config->set_schema_type("email");
+ type_config = schema.add_types();
+ type_config->set_schema_type("message");
+ type_config = schema.add_types();
+ type_config->set_schema_type("person");
+
+ std::string schema_store_dir = schema_store_dir_ + "_custom";
+ filesystem_.DeleteDirectoryRecursively(schema_store_dir.c_str());
+ filesystem_.CreateDirectoryRecursively(schema_store_dir.c_str());
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<SchemaStore> schema_store,
+ SchemaStore::Create(&filesystem_, schema_store_dir));
+
+ ICING_ASSERT_OK(schema_store->SetSchema(schema));
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<DocumentStore> document_store,
+ DocumentStore::Create(&filesystem_, document_store_dir_, &fake_clock_,
+ schema_store.get()));
+
+ DocumentProto email_document_1 = DocumentBuilder()
+ .SetKey("namespace1", "1")
+ .SetSchema("email")
+ .SetCreationTimestampMs(1)
+ .Build();
+ ICING_ASSERT_OK_AND_ASSIGN(DocumentId email_1_document_id,
+ document_store->Put(email_document_1));
+
+ DocumentProto email_document_2 = DocumentBuilder()
+ .SetKey("namespace2", "2")
+ .SetSchema("email")
+ .SetCreationTimestampMs(1)
+ .Build();
+ ICING_ASSERT_OK_AND_ASSIGN(DocumentId email_2_document_id,
+ document_store->Put(email_document_2));
+
+ DocumentProto message_document = DocumentBuilder()
+ .SetKey("namespace", "3")
+ .SetSchema("message")
+ .SetCreationTimestampMs(1)
+ .Build();
+ ICING_ASSERT_OK_AND_ASSIGN(DocumentId message_document_id,
+ document_store->Put(message_document));
+
+ DocumentProto person_document = DocumentBuilder()
+ .SetKey("namespace", "4")
+ .SetSchema("person")
+ .SetCreationTimestampMs(1)
+ .Build();
+ ICING_ASSERT_OK_AND_ASSIGN(DocumentId person_document_id,
+ document_store->Put(person_document));
+
+ // Delete the "email" type and ensure that it works across both
+ // email_document's namespaces. And that other documents aren't affected.
+ ICING_EXPECT_OK(
+ document_store->DeleteBySchemaType("email", /*soft_delete=*/false));
+ EXPECT_THAT(document_store->Get(email_1_document_id),
+ StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
+ EXPECT_THAT(document_store->Get(email_2_document_id),
+ StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
+ EXPECT_THAT(document_store->Get(message_document_id),
+ IsOkAndHolds(EqualsProto(message_document)));
+ EXPECT_THAT(document_store->Get(person_document_id),
+ IsOkAndHolds(EqualsProto(person_document)));
+
+ // Delete the "message" type and check that other documents aren't affected
+ ICING_EXPECT_OK(
+ document_store->DeleteBySchemaType("message", /*soft_delete=*/false));
+ EXPECT_THAT(document_store->Get(email_1_document_id),
+ StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
+ EXPECT_THAT(document_store->Get(email_2_document_id),
+ StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
+ EXPECT_THAT(document_store->Get(message_document_id),
+ StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
+ EXPECT_THAT(document_store->Get(person_document_id),
+ IsOkAndHolds(EqualsProto(person_document)));
+}
+
+TEST_F(DocumentStoreTest, SoftDeleteBySchemaTypeNonexistentSchemaTypeNotFound) {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<DocumentStore> document_store,
+ DocumentStore::Create(&filesystem_, document_store_dir_, &fake_clock_,
+ schema_store_.get()));
+
+ // Validates that deleting something non-existing won't append anything to
+ // ground truth
+ int64_t ground_truth_size_before = filesystem_.GetFileSize(
+ absl_ports::StrCat(document_store_dir_, "/document_log").c_str());
+
+ EXPECT_THAT(document_store->DeleteBySchemaType("nonexistent_type",
+ /*soft_delete=*/true),
+ StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
+
+ int64_t ground_truth_size_after = filesystem_.GetFileSize(
+ absl_ports::StrCat(document_store_dir_, "/document_log").c_str());
+
+ EXPECT_THAT(ground_truth_size_before, Eq(ground_truth_size_after));
+}
+
+TEST_F(DocumentStoreTest, HardDeleteBySchemaTypeNonexistentSchemaTypeNotFound) {
ICING_ASSERT_OK_AND_ASSIGN(
std::unique_ptr<DocumentStore> document_store,
DocumentStore::Create(&filesystem_, document_store_dir_, &fake_clock_,
@@ -626,7 +817,8 @@ TEST_F(DocumentStoreTest, DeleteBySchemaTypeNonexistentSchemaTypeNotFound) {
int64_t ground_truth_size_before = filesystem_.GetFileSize(
absl_ports::StrCat(document_store_dir_, "/document_log").c_str());
- EXPECT_THAT(document_store->DeleteBySchemaType("nonexistent_type"),
+ EXPECT_THAT(document_store->DeleteBySchemaType("nonexistent_type",
+ /*soft_delete=*/false),
StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
int64_t ground_truth_size_after = filesystem_.GetFileSize(
@@ -635,7 +827,21 @@ TEST_F(DocumentStoreTest, DeleteBySchemaTypeNonexistentSchemaTypeNotFound) {
EXPECT_THAT(ground_truth_size_before, Eq(ground_truth_size_after));
}
-TEST_F(DocumentStoreTest, DeleteBySchemaTypeNoExistingDocumentsOk) {
+TEST_F(DocumentStoreTest, SoftDeleteBySchemaTypeNoExistingDocumentsNotFound) {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<DocumentStore> document_store,
+ DocumentStore::Create(&filesystem_, document_store_dir_, &fake_clock_,
+ schema_store_.get()));
+ ICING_EXPECT_OK(document_store->Put(test_document1_));
+ ICING_EXPECT_OK(document_store->Delete(test_document1_.namespace_(),
+ test_document1_.uri()));
+
+ EXPECT_THAT(document_store->DeleteBySchemaType(test_document1_.schema(),
+ /*soft_delete=*/true),
+ StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
+}
+
+TEST_F(DocumentStoreTest, HardDeleteBySchemaTypeNoExistingDocumentsNotFound) {
ICING_ASSERT_OK_AND_ASSIGN(
std::unique_ptr<DocumentStore> document_store,
DocumentStore::Create(&filesystem_, document_store_dir_, &fake_clock_,
@@ -644,10 +850,9 @@ TEST_F(DocumentStoreTest, DeleteBySchemaTypeNoExistingDocumentsOk) {
ICING_EXPECT_OK(document_store->Delete(test_document1_.namespace_(),
test_document1_.uri()));
- // At this point, there are no existing documents with the schema type, but we
- // still return OK because the SchemaStore is the ground truth on schemas and
- // knows about the type
- ICING_EXPECT_OK(document_store->DeleteBySchemaType(test_document1_.schema()));
+ EXPECT_THAT(document_store->DeleteBySchemaType(test_document1_.schema(),
+ /*soft_delete=*/false),
+ StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
}
TEST_F(DocumentStoreTest, DeleteBySchemaTypeRecoversOk) {
@@ -1177,7 +1382,7 @@ TEST_F(DocumentStoreTest, NonexistentNamespaceNotFound) {
StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
}
-TEST_F(DocumentStoreTest, FilterCacheHoldsDeletedDocumentData) {
+TEST_F(DocumentStoreTest, SoftDeletionDoesNotClearFilterCache) {
ICING_ASSERT_OK_AND_ASSIGN(
std::unique_ptr<DocumentStore> doc_store,
DocumentStore::Create(&filesystem_, document_store_dir_, &fake_clock_,
@@ -1193,14 +1398,71 @@ TEST_F(DocumentStoreTest, FilterCacheHoldsDeletedDocumentData) {
/*schema_type_id=*/0,
/*expiration_timestamp_ms=*/document1_expiration_timestamp_)));
- // FilterCache doesn't care if the document has been deleted
- ICING_ASSERT_OK(doc_store->Delete("icing", "email/1"));
+ ICING_ASSERT_OK(doc_store->Delete("icing", "email/1", /*soft_delete=*/true));
+  // The entry of the soft-deleted document is retained in the filter cache.
+ EXPECT_THAT(doc_store->GetDocumentFilterData(document_id).status(), IsOk());
+}
+
+TEST_F(DocumentStoreTest, HardDeleteClearsFilterCache) {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<DocumentStore> doc_store,
+ DocumentStore::Create(&filesystem_, document_store_dir_, &fake_clock_,
+ schema_store_.get()));
+
+ ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id,
+ doc_store->Put(test_document1_));
+
EXPECT_THAT(
doc_store->GetDocumentFilterData(document_id),
IsOkAndHolds(DocumentFilterData(
/*namespace_id=*/0,
/*schema_type_id=*/0,
/*expiration_timestamp_ms=*/document1_expiration_timestamp_)));
+
+ ICING_ASSERT_OK(doc_store->Delete("icing", "email/1", /*soft_delete=*/false));
+ // Associated entry of the deleted document is removed.
+ EXPECT_THAT(doc_store->GetDocumentFilterData(document_id),
+ StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
+}
+
+TEST_F(DocumentStoreTest, SoftDeletionDoesNotClearScoreCache) {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<DocumentStore> doc_store,
+ DocumentStore::Create(&filesystem_, document_store_dir_, &fake_clock_,
+ schema_store_.get()));
+
+ ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id,
+ doc_store->Put(test_document1_));
+
+ EXPECT_THAT(doc_store->GetDocumentAssociatedScoreData(document_id),
+ IsOkAndHolds(DocumentAssociatedScoreData(
+ /*document_score=*/document1_score_,
+ /*creation_timestamp_ms=*/document1_creation_timestamp_)));
+
+ ICING_ASSERT_OK(doc_store->Delete("icing", "email/1", /*soft_delete=*/true));
+  // The entry of the soft-deleted document is retained in the score cache.
+ EXPECT_THAT(doc_store->GetDocumentAssociatedScoreData(document_id).status(),
+ IsOk());
+}
+
+TEST_F(DocumentStoreTest, HardDeleteClearsScoreCache) {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<DocumentStore> doc_store,
+ DocumentStore::Create(&filesystem_, document_store_dir_, &fake_clock_,
+ schema_store_.get()));
+
+ ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id,
+ doc_store->Put(test_document1_));
+
+ EXPECT_THAT(doc_store->GetDocumentAssociatedScoreData(document_id),
+ IsOkAndHolds(DocumentAssociatedScoreData(
+ /*document_score=*/document1_score_,
+ /*creation_timestamp_ms=*/document1_creation_timestamp_)));
+
+ ICING_ASSERT_OK(doc_store->Delete("icing", "email/1", /*soft_delete=*/false));
+ // Associated entry of the deleted document is removed.
+ EXPECT_THAT(doc_store->GetDocumentAssociatedScoreData(document_id),
+ StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
}
TEST_F(DocumentStoreTest,
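The tests above pin down the new delete contract: a soft delete hides the document but leaves derived data (the filter cache and score cache entries) in place, while a hard delete erases that data as well. A minimal self-contained analogue of this contract, in which ToyStore and its members are illustrative stand-ins rather than icing APIs:

    // Illustrative analogue only; ToyStore is not an icing class.
    #include <optional>
    #include <string>
    #include <unordered_map>
    #include <unordered_set>

    class ToyStore {
     public:
      void Put(int doc_id, std::string filter_data) {
        live_.insert(doc_id);
        filter_cache_[doc_id] = std::move(filter_data);
      }
      // Mirrors Delete(ns, uri, soft_delete): a soft delete only marks the
      // document as gone; a hard delete also clears the derived entry.
      void Delete(int doc_id, bool soft_delete) {
        live_.erase(doc_id);
        if (!soft_delete) {
          filter_cache_.erase(doc_id);
        }
      }
      // Like GetDocumentFilterData(): still present after a soft delete,
      // absent (NOT_FOUND in the real store) after a hard delete.
      std::optional<std::string> GetFilterData(int doc_id) const {
        auto it = filter_cache_.find(doc_id);
        if (it == filter_cache_.end()) return std::nullopt;
        return it->second;
      }

     private:
      std::unordered_set<int> live_;
      std::unordered_map<int, std::string> filter_cache_;
    };

Under this reading, SoftDeletionDoesNotClearFilterCache expects the lookup to keep succeeding, and HardDeleteClearsFilterCache expects NOT_FOUND.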
diff --git a/icing/store/key-mapper.h b/icing/store/key-mapper.h
index 4571df2..23c7b69 100644
--- a/icing/store/key-mapper.h
+++ b/icing/store/key-mapper.h
@@ -84,6 +84,9 @@ class KeyMapper {
// Returns any encountered IO errors.
libtextclassifier3::StatusOr<T> Get(std::string_view key) const;
+ // Deletes data related to the given key. Returns true on success.
+ bool Delete(std::string_view key);
+
// Returns a map of values to keys. Empty map if the mapper is empty.
std::unordered_map<T, std::string> GetValuesToKeys() const;
@@ -255,6 +258,11 @@ libtextclassifier3::StatusOr<T> KeyMapper<T>::Get(std::string_view key) const {
}
template <typename T>
+bool KeyMapper<T>::Delete(std::string_view key) {
+ return trie_.Delete(key);
+}
+
+template <typename T>
std::unordered_map<T, std::string> KeyMapper<T>::GetValuesToKeys() const {
std::unordered_map<T, std::string> values_to_keys;
for (IcingDynamicTrie::Iterator itr(trie_, /*prefix=*/""); itr.IsValid();
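A short, hedged sketch of the new Delete() in use; the header only promises a bool for success, so the failure branch below just logs. The key_mapper variable and the key format are assumptions for illustration:

    // Sketch; assumes key_mapper is an existing KeyMapper<DocumentId>.
    if (!key_mapper->Delete("namespace#uri")) {
      ICING_LOG(ERROR) << "Failed to delete key from KeyMapper";
    }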
diff --git a/icing/store/namespace-id.h b/icing/store/namespace-id.h
index 4225be3..374e7a8 100644
--- a/icing/store/namespace-id.h
+++ b/icing/store/namespace-id.h
@@ -22,6 +22,7 @@ namespace lib {
// Id of unique namespace in DocumentProto. Generated in DocumentStore.
using NamespaceId = int16_t;
+inline constexpr NamespaceId kInvalidNamespaceId = -1;
} // namespace lib
} // namespace icing
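A hedged sketch of how a sentinel like kInvalidNamespaceId is typically consumed; GetNamespaceIdOrInvalid is an illustrative helper, not an icing API:

    #include <string>
    #include <unordered_map>

    // Illustrative only: return kInvalidNamespaceId for an unknown namespace
    // so callers can branch on the sentinel instead of unpacking a StatusOr.
    NamespaceId GetNamespaceIdOrInvalid(
        const std::unordered_map<std::string, NamespaceId>& ids,
        const std::string& name_space) {
      auto it = ids.find(name_space);
      return it == ids.end() ? kInvalidNamespaceId : it->second;
    }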
diff --git a/icing/store/usage-store.cc b/icing/store/usage-store.cc
new file mode 100644
index 0000000..911c45a
--- /dev/null
+++ b/icing/store/usage-store.cc
@@ -0,0 +1,193 @@
+// Copyright (C) 2019 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "icing/store/usage-store.h"
+
+#include "icing/file/file-backed-vector.h"
+#include "icing/proto/usage.pb.h"
+#include "icing/store/document-id.h"
+
+namespace icing {
+namespace lib {
+
+namespace {
+std::string MakeUsageScoreCacheFilename(const std::string& base_dir) {
+ return absl_ports::StrCat(base_dir, "/usage-scores");
+}
+} // namespace
+
+libtextclassifier3::StatusOr<std::unique_ptr<UsageStore>> UsageStore::Create(
+ const Filesystem* filesystem, const std::string& base_dir) {
+ ICING_RETURN_ERROR_IF_NULL(filesystem);
+
+ auto usage_score_cache_or = FileBackedVector<UsageScores>::Create(
+ *filesystem, MakeUsageScoreCacheFilename(base_dir),
+ MemoryMappedFile::READ_WRITE_AUTO_SYNC);
+
+ if (!usage_score_cache_or.ok()) {
+    ICING_LOG(ERROR) << "Failed to initialize usage_score_cache: "
+                     << usage_score_cache_or.status().error_message();
+ return usage_score_cache_or.status();
+ }
+
+ return std::unique_ptr<UsageStore>(new UsageStore(
+ std::move(usage_score_cache_or).ValueOrDie(), *filesystem, base_dir));
+}
+
+libtextclassifier3::Status UsageStore::AddUsageReport(const UsageReport& report,
+ DocumentId document_id) {
+ if (!IsDocumentIdValid(document_id)) {
+ return absl_ports::InvalidArgumentError(IcingStringUtil::StringPrintf(
+ "Document id %d is invalid.", document_id));
+ }
+
+ auto usage_scores_or = usage_score_cache_->Get(document_id);
+
+  // OutOfRange means that the mapper hasn't seen this document id before;
+  // it's not an error here.
+ UsageScores usage_scores;
+ if (usage_scores_or.ok()) {
+ usage_scores = *std::move(usage_scores_or).ValueOrDie();
+ } else if (!absl_ports::IsOutOfRange(usage_scores_or.status())) {
+ // Real error
+ return usage_scores_or.status();
+ }
+
+  // Update the last-used timestamps and type counts. The counts won't be
+  // incremented if they are already at their maximum values. The timestamp
+  // from UsageReport is in milliseconds, so we convert it to seconds.
+ int64_t report_timestamp_s = report.usage_timestamp_ms() / 1000;
+
+ switch (report.usage_type()) {
+ case UsageReport::USAGE_TYPE1:
+ if (report_timestamp_s > std::numeric_limits<uint32_t>::max()) {
+ usage_scores.usage_type1_last_used_timestamp_s =
+ std::numeric_limits<uint32_t>::max();
+ } else if (report_timestamp_s >
+ usage_scores.usage_type1_last_used_timestamp_s) {
+ usage_scores.usage_type1_last_used_timestamp_s = report_timestamp_s;
+ }
+
+ if (usage_scores.usage_type1_count < std::numeric_limits<int>::max()) {
+ ++usage_scores.usage_type1_count;
+ }
+ break;
+ case UsageReport::USAGE_TYPE2:
+ if (report_timestamp_s > std::numeric_limits<uint32_t>::max()) {
+ usage_scores.usage_type2_last_used_timestamp_s =
+ std::numeric_limits<uint32_t>::max();
+ } else if (report_timestamp_s >
+ usage_scores.usage_type2_last_used_timestamp_s) {
+ usage_scores.usage_type2_last_used_timestamp_s = report_timestamp_s;
+ }
+
+ if (usage_scores.usage_type2_count < std::numeric_limits<int>::max()) {
+ ++usage_scores.usage_type2_count;
+ }
+ break;
+ case UsageReport::USAGE_TYPE3:
+ if (report_timestamp_s > std::numeric_limits<uint32_t>::max()) {
+ usage_scores.usage_type3_last_used_timestamp_s =
+ std::numeric_limits<uint32_t>::max();
+ } else if (report_timestamp_s >
+ usage_scores.usage_type3_last_used_timestamp_s) {
+ usage_scores.usage_type3_last_used_timestamp_s = report_timestamp_s;
+ }
+
+ if (usage_scores.usage_type3_count < std::numeric_limits<int>::max()) {
+ ++usage_scores.usage_type3_count;
+ }
+ }
+
+ // Write updated usage scores to file.
+ ICING_RETURN_IF_ERROR(usage_score_cache_->Set(document_id, usage_scores));
+
+ return libtextclassifier3::Status::OK;
+}
+
+libtextclassifier3::Status UsageStore::DeleteUsageScores(
+ DocumentId document_id) {
+ if (!IsDocumentIdValid(document_id)) {
+ return absl_ports::InvalidArgumentError(IcingStringUtil::StringPrintf(
+ "Document id %d is invalid.", document_id));
+ }
+
+ // Clear all the scores of the document.
+ ICING_RETURN_IF_ERROR(usage_score_cache_->Set(document_id, UsageScores()));
+
+ return libtextclassifier3::Status::OK;
+}
+
+libtextclassifier3::StatusOr<UsageStore::UsageScores>
+UsageStore::GetUsageScores(DocumentId document_id) {
+ if (!IsDocumentIdValid(document_id)) {
+ return absl_ports::InvalidArgumentError(IcingStringUtil::StringPrintf(
+ "Document id %d is invalid.", document_id));
+ }
+
+ auto usage_scores_or = usage_score_cache_->Get(document_id);
+ if (absl_ports::IsOutOfRange(usage_scores_or.status())) {
+ // No usage scores found. Return the default scores.
+ return UsageScores();
+ } else if (!usage_scores_or.ok()) {
+ // Pass up any other errors.
+ return usage_scores_or.status();
+ }
+
+ return *std::move(usage_scores_or).ValueOrDie();
+}
+
+libtextclassifier3::Status UsageStore::SetUsageScores(
+ DocumentId document_id, UsageScores usage_scores) {
+ if (!IsDocumentIdValid(document_id)) {
+ return absl_ports::InvalidArgumentError(IcingStringUtil::StringPrintf(
+ "Document id %d is invalid.", document_id));
+ }
+
+ ICING_RETURN_IF_ERROR(usage_score_cache_->Set(document_id, usage_scores));
+
+ return libtextclassifier3::Status::OK;
+}
+
+libtextclassifier3::Status UsageStore::PersistToDisk() {
+ ICING_RETURN_IF_ERROR(usage_score_cache_->PersistToDisk());
+ return libtextclassifier3::Status::OK;
+}
+
+libtextclassifier3::Status UsageStore::Reset() {
+ // We delete all the scores by deleting the whole file.
+  libtextclassifier3::Status status = FileBackedVector<UsageScores>::Delete(
+      filesystem_, MakeUsageScoreCacheFilename(base_dir_));
+ if (!status.ok()) {
+    ICING_LOG(ERROR) << "Failed to delete usage_score_cache: "
+                     << status.error_message();
+ return status;
+ }
+
+ // Create a new usage_score_cache
+ auto usage_score_cache_or = FileBackedVector<UsageScores>::Create(
+ filesystem_, MakeUsageScoreCacheFilename(base_dir_),
+ MemoryMappedFile::READ_WRITE_AUTO_SYNC);
+ if (!usage_score_cache_or.ok()) {
+    ICING_LOG(ERROR) << "Failed to re-create usage_score_cache: "
+                     << usage_score_cache_or.status().error_message();
+ return usage_score_cache_or.status();
+ }
+ usage_score_cache_ = std::move(usage_score_cache_or).ValueOrDie();
+
+ return libtextclassifier3::Status::OK;
+}
+
+} // namespace lib
+} // namespace icing
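The clamp in AddUsageReport above is easy to restate with concrete arithmetic: the report timestamp arrives in milliseconds, is divided by 1000, and any result that still exceeds the uint32 range saturates at std::numeric_limits<uint32_t>::max(); the per-type counts saturate at int max the same way. A standalone restatement of that logic, for clarity only:

    #include <cstdint>
    #include <limits>

    // Same saturation rules as AddUsageReport, pulled out for readability.
    uint32_t ClampTimestampSeconds(int64_t report_timestamp_ms) {
      int64_t seconds = report_timestamp_ms / 1000;
      if (seconds > std::numeric_limits<uint32_t>::max()) {
        return std::numeric_limits<uint32_t>::max();
      }
      return static_cast<uint32_t>(seconds);
    }

    int SaturatingIncrement(int count) {
      return count < std::numeric_limits<int>::max() ? count + 1 : count;
    }

The real method additionally keeps each stored timestamp monotone, writing the new seconds value only when it is larger than the one already on file.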
diff --git a/icing/store/usage-store.h b/icing/store/usage-store.h
new file mode 100644
index 0000000..9a8c286
--- /dev/null
+++ b/icing/store/usage-store.h
@@ -0,0 +1,160 @@
+// Copyright (C) 2019 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef ICING_STORE_USAGE_STORE_H_
+#define ICING_STORE_USAGE_STORE_H_
+
+#include <cstdint>
+#include <memory>
+#include <string>
+
+#include "icing/file/file-backed-vector.h"
+#include "icing/file/filesystem.h"
+#include "icing/proto/usage.pb.h"
+#include "icing/store/document-id.h"
+#include "icing/text_classifier/lib3/utils/base/statusor.h"
+
+namespace icing {
+namespace lib {
+
+// A storage class that maintains scores that are calculated based on usage
+// reports.
+class UsageStore {
+ public:
+ // Factory function to create a UsageStore instance. The base directory is
+ // used to persist usage scores. If a usage store was previously created with
+ // this directory, it will reload the files saved by the last instance.
+ //
+ // TODO(b/169594617): consider returning StatusOr<UsageStore>
+ //
+ // Returns:
+ // A UsageStore on success
+ // FAILED_PRECONDITION on any null pointer input
+ // INTERNAL_ERROR on I/O error
+ static libtextclassifier3::StatusOr<std::unique_ptr<UsageStore>> Create(
+ const Filesystem* filesystem, const std::string& base_dir);
+
+ // The scores here reflect the timestamps and usage types defined in
+ // usage.proto.
+ struct UsageScores {
+ // The latest timestamp in seconds reported with custom usage type 1.
+ uint32_t usage_type1_last_used_timestamp_s = 0;
+
+ // The latest timestamp in seconds reported with custom usage type 2.
+ uint32_t usage_type2_last_used_timestamp_s = 0;
+
+ // The latest timestamp in seconds reported with custom usage type 3.
+ uint32_t usage_type3_last_used_timestamp_s = 0;
+
+ // Count of reports with custom usage type 1
+ int usage_type1_count = 0;
+
+ // Count of reports with custom usage type 2
+ int usage_type2_count = 0;
+
+ // Count of reports with custom usage type 3
+ int usage_type3_count = 0;
+
+ bool operator==(const UsageScores& other) const {
+ return usage_type1_last_used_timestamp_s ==
+ other.usage_type1_last_used_timestamp_s &&
+ usage_type2_last_used_timestamp_s ==
+ other.usage_type2_last_used_timestamp_s &&
+ usage_type3_last_used_timestamp_s ==
+ other.usage_type3_last_used_timestamp_s &&
+ usage_type1_count == other.usage_type1_count &&
+ usage_type2_count == other.usage_type2_count &&
+ usage_type3_count == other.usage_type3_count;
+ }
+ };
+
+ // Adds one usage report. The corresponding usage scores of the specified
+ // document will be updated.
+ //
+  // Note: changes are written to disk automatically; callers can also call
+  // PersistToDisk() to flush changes immediately.
+ //
+ // Returns:
+ // OK on success
+ // INVALID_ARGUMENT if document_id is invalid
+ // INTERNAL_ERROR on I/O errors.
+ libtextclassifier3::Status AddUsageReport(const UsageReport& report,
+ DocumentId document_id);
+
+ // Deletes the usage scores of a document.
+ //
+  // Note: changes are written to disk automatically; callers can also call
+  // PersistToDisk() to flush changes immediately.
+ //
+ // Returns:
+ // OK on success
+ // INVALID_ARGUMENT if document_id is invalid
+ // INTERNAL_ERROR on I/O errors
+ libtextclassifier3::Status DeleteUsageScores(DocumentId document_id);
+
+ // Gets the usage scores of a document.
+ //
+ // Returns:
+ // UsageScores on success
+ // INVALID_ARGUMENT if document_id is invalid
+ // NOT_FOUND if no scores are found for the document
+ // INTERNAL_ERROR on I/O errors
+ //
+ // TODO(b/169433395): return a pointer instead of an object.
+ libtextclassifier3::StatusOr<UsageScores> GetUsageScores(
+ DocumentId document_id);
+
+ // Sets the usage scores of a document.
+ //
+  // Note: changes are written to disk automatically; callers can also call
+  // PersistToDisk() to flush changes immediately.
+ //
+ // Returns:
+ // OK on success
+ // INVALID_ARGUMENT if document_id is invalid
+ // INTERNAL_ERROR on I/O errors
+ libtextclassifier3::Status SetUsageScores(DocumentId document_id,
+ UsageScores usage_scores);
+
+ // Syncs data to disk.
+ //
+ // Returns:
+ // OK on success
+ // INTERNAL on I/O error
+ libtextclassifier3::Status PersistToDisk();
+
+  // Deletes all usage data and re-initializes the storage.
+ //
+ // Returns:
+ // OK on success
+ // INTERNAL_ERROR on I/O error
+ libtextclassifier3::Status Reset();
+
+ private:
+ explicit UsageStore(std::unique_ptr<FileBackedVector<UsageScores>>
+ document_id_to_scores_mapper,
+ const Filesystem& filesystem, std::string base_dir)
+ : filesystem_(filesystem),
+ base_dir_(std::move(base_dir)),
+ usage_score_cache_(std::move(document_id_to_scores_mapper)) {}
+
+ const Filesystem& filesystem_;
+
+ // Base directory where the files are located.
+ const std::string base_dir_;
+
+ // Used to store the usage scores of documents.
+ std::unique_ptr<FileBackedVector<UsageScores>> usage_score_cache_;
+};
+
+} // namespace lib
+} // namespace icing
+
+#endif // ICING_STORE_USAGE_STORE_H_
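Putting the declared contract together, a hedged end-to-end sketch that uses only calls visible in this header; RecordUsage is an illustrative free function, not part of icing:

    // Sketch only. Creates (or reloads) a UsageStore under `dir`, applies one
    // report, and flushes. Errors propagate as statuses.
    libtextclassifier3::Status RecordUsage(const Filesystem* filesystem,
                                           const std::string& dir,
                                           const UsageReport& report,
                                           DocumentId document_id) {
      auto usage_store_or = UsageStore::Create(filesystem, dir);
      if (!usage_store_or.ok()) {
        return usage_store_or.status();
      }
      std::unique_ptr<UsageStore> usage_store =
          std::move(usage_store_or).ValueOrDie();
      ICING_RETURN_IF_ERROR(usage_store->AddUsageReport(report, document_id));
      return usage_store->PersistToDisk();
    }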
diff --git a/icing/store/usage-store_test.cc b/icing/store/usage-store_test.cc
new file mode 100644
index 0000000..39985f0
--- /dev/null
+++ b/icing/store/usage-store_test.cc
@@ -0,0 +1,389 @@
+// Copyright (C) 2019 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "icing/store/usage-store.h"
+
+#include "gmock/gmock.h"
+#include "gtest/gtest.h"
+#include "icing/testing/common-matchers.h"
+#include "icing/testing/tmp-directory.h"
+
+namespace icing {
+namespace lib {
+
+namespace {
+using ::testing::Eq;
+using ::testing::Not;
+
+class UsageStoreTest : public testing::Test {
+ protected:
+ UsageStoreTest() : test_dir_(GetTestTempDir() + "/usage-store-test") {}
+
+ void SetUp() override {
+ filesystem_.CreateDirectoryRecursively(test_dir_.c_str());
+ }
+
+ void TearDown() override {
+ filesystem_.DeleteDirectoryRecursively(test_dir_.c_str());
+ }
+
+ const Filesystem filesystem_;
+ const std::string test_dir_;
+};
+
+UsageReport CreateUsageReport(std::string name_space, std::string uri,
+                              int64_t timestamp_ms,
+ UsageReport::UsageType usage_type) {
+ UsageReport usage_report;
+ usage_report.set_document_namespace(name_space);
+ usage_report.set_document_uri(uri);
+ usage_report.set_usage_timestamp_ms(timestamp_ms);
+ usage_report.set_usage_type(usage_type);
+ return usage_report;
+}
+
+TEST_F(UsageStoreTest, CreationShouldSucceed) {
+ EXPECT_THAT(UsageStore::Create(&filesystem_, test_dir_), IsOk());
+}
+
+TEST_F(UsageStoreTest, CreationShouldFailOnNullPointer) {
+ EXPECT_THAT(UsageStore::Create(nullptr, test_dir_),
+ StatusIs(libtextclassifier3::StatusCode::FAILED_PRECONDITION));
+}
+
+TEST_F(UsageStoreTest, UsageScoresShouldBeComparable) {
+ UsageStore::UsageScores scores1;
+ UsageStore::UsageScores scores2;
+ EXPECT_THAT(scores1, Eq(scores2));
+
+ // operator== should compare usage_type1_last_used_timestamp_s.
+ ++scores1.usage_type1_last_used_timestamp_s;
+ EXPECT_THAT(scores1, Not(Eq(scores2)));
+ ++scores2.usage_type1_last_used_timestamp_s;
+ EXPECT_THAT(scores1, Eq(scores2));
+
+ // operator== should compare usage_type2_last_used_timestamp_s.
+ ++scores1.usage_type2_last_used_timestamp_s;
+ EXPECT_THAT(scores1, Not(Eq(scores2)));
+ ++scores2.usage_type2_last_used_timestamp_s;
+ EXPECT_THAT(scores1, Eq(scores2));
+
+ // operator== should compare usage_type3_last_used_timestamp_s.
+ ++scores1.usage_type3_last_used_timestamp_s;
+ EXPECT_THAT(scores1, Not(Eq(scores2)));
+ ++scores2.usage_type3_last_used_timestamp_s;
+ EXPECT_THAT(scores1, Eq(scores2));
+
+ // operator== should compare usage_type1_count.
+ ++scores1.usage_type1_count;
+ EXPECT_THAT(scores1, Not(Eq(scores2)));
+ ++scores2.usage_type1_count;
+ EXPECT_THAT(scores1, Eq(scores2));
+
+ // operator== should compare usage_type2_count.
+ ++scores1.usage_type2_count;
+ EXPECT_THAT(scores1, Not(Eq(scores2)));
+ ++scores2.usage_type2_count;
+ EXPECT_THAT(scores1, Eq(scores2));
+
+ // operator== should compare usage_type3_count.
+ ++scores1.usage_type3_count;
+ EXPECT_THAT(scores1, Not(Eq(scores2)));
+ ++scores2.usage_type3_count;
+ EXPECT_THAT(scores1, Eq(scores2));
+}
+
+TEST_F(UsageStoreTest, InvalidDocumentIdShouldReturnError) {
+ ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<UsageStore> usage_store,
+ UsageStore::Create(&filesystem_, test_dir_));
+
+ DocumentId invalid_document_id = -1;
+
+ EXPECT_THAT(usage_store->AddUsageReport(UsageReport(), invalid_document_id),
+ StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
+
+ EXPECT_THAT(usage_store->DeleteUsageScores(invalid_document_id),
+ StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
+
+ EXPECT_THAT(usage_store->GetUsageScores(invalid_document_id),
+ StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
+
+ EXPECT_THAT(usage_store->SetUsageScores(invalid_document_id,
+ UsageStore::UsageScores()),
+ StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
+}
+
+TEST_F(UsageStoreTest, AddUsageReportShouldUpdateLastUsedTimestamp) {
+ // Create 3 reports with different timestamps.
+ UsageReport usage_report_time1 = CreateUsageReport(
+ "namespace", "uri", /*timestamp_ms=*/1000, UsageReport::USAGE_TYPE1);
+ UsageReport usage_report_time5 = CreateUsageReport(
+ "namespace", "uri", /*timestamp_ms=*/5000, UsageReport::USAGE_TYPE1);
+ UsageReport usage_report_time10 = CreateUsageReport(
+ "namespace", "uri", /*timestamp_ms=*/10000, UsageReport::USAGE_TYPE1);
+
+ ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<UsageStore> usage_store,
+ UsageStore::Create(&filesystem_, test_dir_));
+
+ // Report a usage with timestamp 5.
+ usage_store->AddUsageReport(usage_report_time5, /*document_id=*/1);
+ UsageStore::UsageScores expected_scores;
+ expected_scores.usage_type1_last_used_timestamp_s = 5;
+ expected_scores.usage_type1_count = 1;
+ expected_scores.usage_type2_count = 0;
+ expected_scores.usage_type3_count = 0;
+ EXPECT_THAT(usage_store->GetUsageScores(/*document_id=*/1),
+ IsOkAndHolds(expected_scores));
+
+ // Report a usage with timestamp 1. The timestamp won't be updated.
+ usage_store->AddUsageReport(usage_report_time1, /*document_id=*/1);
+ ++expected_scores.usage_type1_count;
+ EXPECT_THAT(usage_store->GetUsageScores(/*document_id=*/1),
+ IsOkAndHolds(expected_scores));
+
+ // Report a usage with timestamp 10. The timestamp should be updated.
+ usage_store->AddUsageReport(usage_report_time10, /*document_id=*/1);
+ expected_scores.usage_type1_last_used_timestamp_s = 10;
+ ++expected_scores.usage_type1_count;
+ EXPECT_THAT(usage_store->GetUsageScores(/*document_id=*/1),
+ IsOkAndHolds(expected_scores));
+}
+
+TEST_F(UsageStoreTest, AddUsageReportShouldUpdateCounts) {
+ // Create 3 reports with different usage types.
+ UsageReport usage_report_type1 = CreateUsageReport(
+ "namespace", "uri", /*timestamp_ms=*/0, UsageReport::USAGE_TYPE1);
+ UsageReport usage_report_type2 = CreateUsageReport(
+ "namespace", "uri", /*timestamp_ms=*/0, UsageReport::USAGE_TYPE2);
+ UsageReport usage_report_type3 = CreateUsageReport(
+ "namespace", "uri", /*timestamp_ms=*/0, UsageReport::USAGE_TYPE3);
+
+ ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<UsageStore> usage_store,
+ UsageStore::Create(&filesystem_, test_dir_));
+
+ // Report a usage with type 1.
+ usage_store->AddUsageReport(usage_report_type1, /*document_id=*/1);
+ UsageStore::UsageScores expected_scores;
+ expected_scores.usage_type1_count = 1;
+ expected_scores.usage_type2_count = 0;
+ expected_scores.usage_type3_count = 0;
+ EXPECT_THAT(usage_store->GetUsageScores(/*document_id=*/1),
+ IsOkAndHolds(expected_scores));
+ // Report another usage with type 1.
+ usage_store->AddUsageReport(usage_report_type1, /*document_id=*/1);
+ ++expected_scores.usage_type1_count;
+ EXPECT_THAT(usage_store->GetUsageScores(/*document_id=*/1),
+ IsOkAndHolds(expected_scores));
+
+ // Report a usage with type 2.
+ usage_store->AddUsageReport(usage_report_type2, /*document_id=*/1);
+ ++expected_scores.usage_type2_count;
+ EXPECT_THAT(usage_store->GetUsageScores(/*document_id=*/1),
+ IsOkAndHolds(expected_scores));
+ // Report another usage with type 2.
+ usage_store->AddUsageReport(usage_report_type2, /*document_id=*/1);
+ ++expected_scores.usage_type2_count;
+ EXPECT_THAT(usage_store->GetUsageScores(/*document_id=*/1),
+ IsOkAndHolds(expected_scores));
+
+ // Report a usage with type 3.
+ usage_store->AddUsageReport(usage_report_type3, /*document_id=*/1);
+ ++expected_scores.usage_type3_count;
+ EXPECT_THAT(usage_store->GetUsageScores(/*document_id=*/1),
+ IsOkAndHolds(expected_scores));
+ // Report another usage with type 3.
+ usage_store->AddUsageReport(usage_report_type3, /*document_id=*/1);
+ ++expected_scores.usage_type3_count;
+ EXPECT_THAT(usage_store->GetUsageScores(/*document_id=*/1),
+ IsOkAndHolds(expected_scores));
+}
+
+TEST_F(UsageStoreTest, GetNonExistingDocumentShouldReturnDefaultScores) {
+ ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<UsageStore> usage_store,
+ UsageStore::Create(&filesystem_, test_dir_));
+
+ EXPECT_THAT(usage_store->GetUsageScores(/*document_id=*/1),
+ IsOkAndHolds(UsageStore::UsageScores()));
+}
+
+TEST_F(UsageStoreTest, SetAndGetUsageScores) {
+ ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<UsageStore> usage_store,
+ UsageStore::Create(&filesystem_, test_dir_));
+
+ // Create usage scores with some random numbers.
+ UsageStore::UsageScores scores;
+ scores.usage_type1_last_used_timestamp_s = 7;
+ scores.usage_type2_last_used_timestamp_s = 9;
+ scores.usage_type3_last_used_timestamp_s = 11;
+ scores.usage_type1_count = 3;
+ scores.usage_type2_count = 4;
+ scores.usage_type3_count = 9;
+
+ // Verify that set and get results are consistent.
+ ICING_EXPECT_OK(usage_store->SetUsageScores(/*document_id=*/1, scores));
+ EXPECT_THAT(usage_store->GetUsageScores(/*document_id=*/1),
+ IsOkAndHolds(scores));
+}
+
+TEST_F(UsageStoreTest, ImplicitlyInitializedScoresShouldBeZero) {
+ ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<UsageStore> usage_store,
+ UsageStore::Create(&filesystem_, test_dir_));
+
+ // Explicitly set scores for document 2.
+ ICING_ASSERT_OK(usage_store->SetUsageScores(/*document_id=*/2,
+ UsageStore::UsageScores()));
+
+ // Now the scores of document 1 have been implicitly initialized. The scores
+ // should all be 0.
+ UsageStore::UsageScores expected_scores;
+ expected_scores.usage_type1_last_used_timestamp_s = 0;
+ expected_scores.usage_type2_last_used_timestamp_s = 0;
+ expected_scores.usage_type3_last_used_timestamp_s = 0;
+ expected_scores.usage_type1_count = 0;
+ expected_scores.usage_type2_count = 0;
+ expected_scores.usage_type3_count = 0;
+ EXPECT_THAT(usage_store->GetUsageScores(/*document_id=*/1),
+ IsOkAndHolds(expected_scores));
+}
+
+TEST_F(UsageStoreTest, DeleteUsageScores) {
+ ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<UsageStore> usage_store,
+ UsageStore::Create(&filesystem_, test_dir_));
+
+ // Create usage scores with some random numbers.
+ UsageStore::UsageScores scores;
+ scores.usage_type1_last_used_timestamp_s = 7;
+ scores.usage_type2_last_used_timestamp_s = 9;
+ scores.usage_type3_last_used_timestamp_s = 11;
+ scores.usage_type1_count = 3;
+ scores.usage_type2_count = 4;
+ scores.usage_type3_count = 9;
+ ICING_EXPECT_OK(usage_store->SetUsageScores(/*document_id=*/1, scores));
+
+  // Delete the usage scores of document 1; all of its scores should be 0.
+ UsageStore::UsageScores expected_scores;
+ expected_scores.usage_type1_last_used_timestamp_s = 0;
+ expected_scores.usage_type2_last_used_timestamp_s = 0;
+ expected_scores.usage_type3_last_used_timestamp_s = 0;
+ expected_scores.usage_type1_count = 0;
+ expected_scores.usage_type2_count = 0;
+ expected_scores.usage_type3_count = 0;
+ ICING_EXPECT_OK(usage_store->DeleteUsageScores(/*document_id=*/1));
+ EXPECT_THAT(usage_store->GetUsageScores(/*document_id=*/1),
+ IsOkAndHolds(expected_scores));
+}
+
+TEST_F(UsageStoreTest, PersistToDisk) {
+ ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<UsageStore> usage_store,
+ UsageStore::Create(&filesystem_, test_dir_));
+
+ // Create usage scores with some random numbers.
+ UsageStore::UsageScores scores;
+ scores.usage_type1_last_used_timestamp_s = 7;
+ scores.usage_type2_last_used_timestamp_s = 9;
+ scores.usage_type3_last_used_timestamp_s = 11;
+ scores.usage_type1_count = 3;
+ scores.usage_type2_count = 4;
+ scores.usage_type3_count = 9;
+ ICING_EXPECT_OK(usage_store->SetUsageScores(/*document_id=*/1, scores));
+
+ EXPECT_THAT(usage_store->PersistToDisk(), IsOk());
+}
+
+TEST_F(UsageStoreTest, Reset) {
+ ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<UsageStore> usage_store,
+ UsageStore::Create(&filesystem_, test_dir_));
+
+ // Create usage scores with some random numbers.
+ UsageStore::UsageScores scores;
+ scores.usage_type1_last_used_timestamp_s = 7;
+ scores.usage_type2_last_used_timestamp_s = 9;
+ scores.usage_type3_last_used_timestamp_s = 11;
+ scores.usage_type1_count = 3;
+ scores.usage_type2_count = 4;
+ scores.usage_type3_count = 9;
+
+ // Set scores for document 1 and document 2.
+ ICING_EXPECT_OK(usage_store->SetUsageScores(/*document_id=*/1, scores));
+ ICING_EXPECT_OK(usage_store->SetUsageScores(/*document_id=*/2, scores));
+
+ EXPECT_THAT(usage_store->Reset(), IsOk());
+
+ // After resetting, all the scores are cleared.
+ UsageStore::UsageScores expected_scores;
+ expected_scores.usage_type1_last_used_timestamp_s = 0;
+ expected_scores.usage_type2_last_used_timestamp_s = 0;
+ expected_scores.usage_type3_last_used_timestamp_s = 0;
+ expected_scores.usage_type1_count = 0;
+ expected_scores.usage_type2_count = 0;
+ expected_scores.usage_type3_count = 0;
+ EXPECT_THAT(usage_store->GetUsageScores(/*document_id=*/1),
+ IsOkAndHolds(expected_scores));
+ EXPECT_THAT(usage_store->GetUsageScores(/*document_id=*/2),
+ IsOkAndHolds(expected_scores));
+}
+
+TEST_F(UsageStoreTest, TimestampInSecondsShouldNotOverflow) {
+ // Create a report with the max value of timestamps.
+ UsageReport usage_report = CreateUsageReport(
+      "namespace", "uri", /*timestamp_ms=*/std::numeric_limits<int64_t>::max(),
+ UsageReport::USAGE_TYPE1);
+
+ ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<UsageStore> usage_store,
+ UsageStore::Create(&filesystem_, test_dir_));
+
+ // The stored timestamp in seconds should be the max value of uint32.
+ usage_store->AddUsageReport(usage_report, /*document_id=*/1);
+ UsageStore::UsageScores expected_scores;
+ expected_scores.usage_type1_last_used_timestamp_s =
+ std::numeric_limits<uint32_t>::max();
+ expected_scores.usage_type1_count = 1;
+ expected_scores.usage_type2_count = 0;
+ expected_scores.usage_type3_count = 0;
+ EXPECT_THAT(usage_store->GetUsageScores(/*document_id=*/1),
+ IsOkAndHolds(expected_scores));
+}
+
+TEST_F(UsageStoreTest, CountsShouldNotOverflow) {
+ ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<UsageStore> usage_store,
+ UsageStore::Create(&filesystem_, test_dir_));
+
+ // Create usage scores with the max value of int.
+ UsageStore::UsageScores scores;
+ scores.usage_type1_last_used_timestamp_s = 0;
+ scores.usage_type2_last_used_timestamp_s = 0;
+ scores.usage_type3_last_used_timestamp_s = 0;
+ scores.usage_type1_count = std::numeric_limits<int>::max();
+ scores.usage_type2_count = 0;
+ scores.usage_type3_count = 0;
+
+ ICING_ASSERT_OK(usage_store->SetUsageScores(/*document_id=*/1, scores));
+ ASSERT_THAT(usage_store->GetUsageScores(/*document_id=*/1),
+ IsOkAndHolds(scores));
+
+ // Report another usage with type 1.
+ UsageReport usage_report = CreateUsageReport(
+ "namespace", "uri", /*timestamp_ms=*/0, UsageReport::USAGE_TYPE1);
+ usage_store->AddUsageReport(usage_report, /*document_id=*/1);
+
+ // usage_type1_count should not change because it's already the max value.
+ EXPECT_THAT(usage_store->GetUsageScores(/*document_id=*/1),
+ IsOkAndHolds(scores));
+}
+
+} // namespace
+
+} // namespace lib
+} // namespace icing
diff --git a/icing/tokenization/icu/icu-language-segmenter-factory.cc b/icing/tokenization/icu/icu-language-segmenter-factory.cc
index 0ef1824..9213fbe 100644
--- a/icing/tokenization/icu/icu-language-segmenter-factory.cc
+++ b/icing/tokenization/icu/icu-language-segmenter-factory.cc
@@ -15,6 +15,7 @@
#include "icing/tokenization/icu/icu-language-segmenter.h"
#include "icing/tokenization/language-segmenter-factory.h"
#include "icing/util/logging.h"
+#include "unicode/uloc.h"
namespace icing {
namespace lib {
diff --git a/icing/tokenization/icu/icu-language-segmenter_test.cc b/icing/tokenization/icu/icu-language-segmenter_test.cc
index 31c2726..d0b90d1 100644
--- a/icing/tokenization/icu/icu-language-segmenter_test.cc
+++ b/icing/tokenization/icu/icu-language-segmenter_test.cc
@@ -409,6 +409,71 @@ TEST_P(IcuLanguageSegmenterAllLocalesTest, NotCopyStrings) {
EXPECT_THAT(word2_address, Eq(word2_result_address));
}
+TEST_P(IcuLanguageSegmenterAllLocalesTest, NewIteratorResetToStart) {
+ ICING_ASSERT_OK_AND_ASSIGN(auto segmenter,
+ language_segmenter_factory::Create(GetOptions()));
+ constexpr std::string_view kText = "How are you你好吗お元気ですか";
+ ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<LanguageSegmenter::Iterator> itr,
+ segmenter->Segment(kText));
+
+ // String: "How are you你好吗お元気ですか"
+ // ^ ^^ ^^ ^ ^ ^ ^ ^ ^
+  // Bytes:  0  3 4  7 8  11 17 20 23 29 35
+ EXPECT_THAT(itr->ResetToStart(), IsOkAndHolds(Eq(0)));
+ EXPECT_THAT(itr->GetTerm(), Eq("How"));
+}
+
+TEST_P(IcuLanguageSegmenterAllLocalesTest, IteratorOneAdvanceResetToStart) {
+ ICING_ASSERT_OK_AND_ASSIGN(auto segmenter,
+ language_segmenter_factory::Create(GetOptions()));
+ constexpr std::string_view kText = "How are you你好吗お元気ですか";
+ ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<LanguageSegmenter::Iterator> itr,
+ segmenter->Segment(kText));
+
+ // String: "How are you你好吗お元気ですか"
+ // ^ ^^ ^^ ^ ^ ^ ^ ^ ^
+  // Bytes:  0  3 4  7 8  11 17 20 23 29 35
+ ASSERT_TRUE(itr->Advance()); // itr points to 'How'
+ EXPECT_THAT(itr->ResetToStart(), IsOkAndHolds(Eq(0)));
+ EXPECT_THAT(itr->GetTerm(), Eq("How"));
+}
+
+TEST_P(IcuLanguageSegmenterAllLocalesTest,
+ IteratorMultipleAdvancesResetToStart) {
+ ICING_ASSERT_OK_AND_ASSIGN(auto segmenter,
+ language_segmenter_factory::Create(GetOptions()));
+ constexpr std::string_view kText = "How are you你好吗お元気ですか";
+ ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<LanguageSegmenter::Iterator> itr,
+ segmenter->Segment(kText));
+
+ // String: "How are you你好吗お元気ですか"
+ // ^ ^^ ^^ ^ ^ ^ ^ ^ ^
+  // Bytes:  0  3 4  7 8  11 17 20 23 29 35
+ ASSERT_TRUE(itr->Advance());
+ ASSERT_TRUE(itr->Advance());
+ ASSERT_TRUE(itr->Advance());
+ ASSERT_TRUE(itr->Advance()); // itr points to ' '
+ EXPECT_THAT(itr->ResetToStart(), IsOkAndHolds(Eq(0)));
+ EXPECT_THAT(itr->GetTerm(), Eq("How"));
+}
+
+TEST_P(IcuLanguageSegmenterAllLocalesTest, IteratorDoneResetToStart) {
+ ICING_ASSERT_OK_AND_ASSIGN(auto segmenter,
+ language_segmenter_factory::Create(GetOptions()));
+ constexpr std::string_view kText = "How are you你好吗お元気ですか";
+ ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<LanguageSegmenter::Iterator> itr,
+ segmenter->Segment(kText));
+
+ // String: "How are you你好吗お元気ですか"
+ // ^ ^^ ^^ ^ ^ ^ ^ ^ ^
+  // Bytes:  0  3 4  7 8  11 17 20 23 29 35
+ while (itr->Advance()) {
+ // Do nothing.
+ }
+ EXPECT_THAT(itr->ResetToStart(), IsOkAndHolds(Eq(0)));
+ EXPECT_THAT(itr->GetTerm(), Eq("How"));
+}
+
TEST_P(IcuLanguageSegmenterAllLocalesTest, ResetToTermAfterOutOfBounds) {
ICING_ASSERT_OK_AND_ASSIGN(auto segmenter,
language_segmenter_factory::Create(GetOptions()));
@@ -992,6 +1057,19 @@ TEST_P(IcuLanguageSegmenterAllLocalesTest, ThaiResetToTermBefore) {
EXPECT_THAT(itr->GetTerm(), Eq("ไป"));
}
+TEST_P(IcuLanguageSegmenterAllLocalesTest, QuerySyntax) {
+ ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter,
+ language_segmenter_factory::Create(GetOptions()));
+  // Validates that query syntax characters are segmented as their own terms.
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::vector<std::string_view> terms,
+ language_segmenter->GetAllTerms(
+ "(-term1 OR term2) AND property1.subproperty2:term3"));
+ EXPECT_THAT(terms, ElementsAre("(", "-", "term1", " ", "OR", " ", "term2",
+ ")", " ", "AND", " ", "property1", ".",
+ "subproperty2", ":", "term3"));
+}
+
INSTANTIATE_TEST_SUITE_P(
LocaleName, IcuLanguageSegmenterAllLocalesTest,
testing::Values(ULOC_US, ULOC_UK, ULOC_CANADA, ULOC_CANADA_FRENCH,
diff --git a/icing/tokenization/language-segmenter-factory.h b/icing/tokenization/language-segmenter-factory.h
index ce50d0b..e60c168 100644
--- a/icing/tokenization/language-segmenter-factory.h
+++ b/icing/tokenization/language-segmenter-factory.h
@@ -18,11 +18,14 @@
#include <memory>
#include <string_view>
+#ifdef __ANDROID__
#include "icing/jni/jni-cache.h"
+#else // __ANDROID__
+class JniCache; // forward declaration to let non-Android builds work.
+#endif // __ANDROID__
+
#include "icing/text_classifier/lib3/utils/base/statusor.h"
#include "icing/tokenization/language-segmenter.h"
-#include "icing/util/i18n-utils.h"
-#include "unicode/uloc.h"
namespace icing {
namespace lib {
@@ -30,7 +33,7 @@ namespace lib {
namespace language_segmenter_factory {
struct SegmenterOptions {
- explicit SegmenterOptions(std::string locale = ULOC_US,
+ explicit SegmenterOptions(std::string locale,
const JniCache* jni_cache = nullptr)
: locale(std::move(locale)), jni_cache(jni_cache) {}
@@ -46,7 +49,7 @@ struct SegmenterOptions {
// A LanguageSegmenter on success
// INVALID_ARGUMENT if locale string is invalid
libtextclassifier3::StatusOr<std::unique_ptr<LanguageSegmenter>> Create(
- SegmenterOptions options = SegmenterOptions());
+ SegmenterOptions options);
} // namespace language_segmenter_factory
diff --git a/icing/tokenization/language-segmenter-iterator_test.cc b/icing/tokenization/language-segmenter-iterator_test.cc
index c7b068d..a1b031a 100644
--- a/icing/tokenization/language-segmenter-iterator_test.cc
+++ b/icing/tokenization/language-segmenter-iterator_test.cc
@@ -43,8 +43,10 @@ class LanguageSegmenterIteratorTest : public testing::Test {
};
TEST_F(LanguageSegmenterIteratorTest, AdvanceAndGetTerm) {
- ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter,
- language_segmenter_factory::Create());
+ language_segmenter_factory::SegmenterOptions options(ULOC_US);
+ ICING_ASSERT_OK_AND_ASSIGN(
+ auto language_segmenter,
+ language_segmenter_factory::Create(std::move(options)));
ICING_ASSERT_OK_AND_ASSIGN(auto iterator,
language_segmenter->Segment("foo bar"));
@@ -62,8 +64,10 @@ TEST_F(LanguageSegmenterIteratorTest, AdvanceAndGetTerm) {
TEST_F(LanguageSegmenterIteratorTest,
ResetToTermStartingAfterWithOffsetInText) {
- ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter,
- language_segmenter_factory::Create());
+ language_segmenter_factory::SegmenterOptions options(ULOC_US);
+ ICING_ASSERT_OK_AND_ASSIGN(
+ auto language_segmenter,
+ language_segmenter_factory::Create(std::move(options)));
ICING_ASSERT_OK_AND_ASSIGN(auto iterator,
language_segmenter->Segment("foo bar"));
@@ -77,8 +81,10 @@ TEST_F(LanguageSegmenterIteratorTest,
TEST_F(LanguageSegmenterIteratorTest,
ResetToTermStartingAfterWithNegativeOffsetNotOk) {
- ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter,
- language_segmenter_factory::Create());
+ language_segmenter_factory::SegmenterOptions options(ULOC_US);
+ ICING_ASSERT_OK_AND_ASSIGN(
+ auto language_segmenter,
+ language_segmenter_factory::Create(std::move(options)));
ICING_ASSERT_OK_AND_ASSIGN(auto iterator,
language_segmenter->Segment("foo bar"));
@@ -95,8 +101,10 @@ TEST_F(LanguageSegmenterIteratorTest,
TEST_F(LanguageSegmenterIteratorTest,
ResetToTermStartingAfterWithTextLengthOffsetInvalidArgument) {
std::string text = "foo bar";
- ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter,
- language_segmenter_factory::Create());
+ language_segmenter_factory::SegmenterOptions options(ULOC_US);
+ ICING_ASSERT_OK_AND_ASSIGN(
+ auto language_segmenter,
+ language_segmenter_factory::Create(std::move(options)));
ICING_ASSERT_OK_AND_ASSIGN(auto iterator, language_segmenter->Segment(text));
EXPECT_THAT(iterator->ResetToTermStartingAfter(/*offset=*/text.size()),
@@ -106,8 +114,10 @@ TEST_F(LanguageSegmenterIteratorTest,
TEST_F(LanguageSegmenterIteratorTest,
ResetToTermStartingAfterWithOffsetPastTextLengthInvalidArgument) {
std::string text = "foo bar";
- ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter,
- language_segmenter_factory::Create());
+ language_segmenter_factory::SegmenterOptions options(ULOC_US);
+ ICING_ASSERT_OK_AND_ASSIGN(
+ auto language_segmenter,
+ language_segmenter_factory::Create(std::move(options)));
ICING_ASSERT_OK_AND_ASSIGN(auto iterator, language_segmenter->Segment(text));
EXPECT_THAT(iterator->ResetToTermStartingAfter(/*offset=*/100),
@@ -115,8 +125,10 @@ TEST_F(LanguageSegmenterIteratorTest,
}
TEST_F(LanguageSegmenterIteratorTest, ResetToTermEndingBeforeWithOffsetInText) {
- ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter,
- language_segmenter_factory::Create());
+ language_segmenter_factory::SegmenterOptions options(ULOC_US);
+ ICING_ASSERT_OK_AND_ASSIGN(
+ auto language_segmenter,
+ language_segmenter_factory::Create(std::move(options)));
ICING_ASSERT_OK_AND_ASSIGN(auto iterator,
language_segmenter->Segment("foo bar"));
@@ -130,8 +142,10 @@ TEST_F(LanguageSegmenterIteratorTest, ResetToTermEndingBeforeWithOffsetInText) {
TEST_F(LanguageSegmenterIteratorTest,
ResetToTermEndingBeforeWithZeroNotFound) {
- ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter,
- language_segmenter_factory::Create());
+ language_segmenter_factory::SegmenterOptions options(ULOC_US);
+ ICING_ASSERT_OK_AND_ASSIGN(
+ auto language_segmenter,
+ language_segmenter_factory::Create(std::move(options)));
ICING_ASSERT_OK_AND_ASSIGN(auto iterator,
language_segmenter->Segment("foo bar"));
@@ -142,8 +156,10 @@ TEST_F(LanguageSegmenterIteratorTest,
TEST_F(LanguageSegmenterIteratorTest,
ResetToTermEndingBeforeWithNegativeOffsetInvalidArgument) {
- ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter,
- language_segmenter_factory::Create());
+ language_segmenter_factory::SegmenterOptions options(ULOC_US);
+ ICING_ASSERT_OK_AND_ASSIGN(
+ auto language_segmenter,
+ language_segmenter_factory::Create(std::move(options)));
ICING_ASSERT_OK_AND_ASSIGN(auto iterator,
language_segmenter->Segment("foo bar"));
@@ -157,8 +173,10 @@ TEST_F(LanguageSegmenterIteratorTest,
TEST_F(LanguageSegmenterIteratorTest,
ResetToTermEndingBeforeWithOffsetPastTextEndInvalidArgument) {
std::string text = "foo bar";
- ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter,
- language_segmenter_factory::Create());
+ language_segmenter_factory::SegmenterOptions options(ULOC_US);
+ ICING_ASSERT_OK_AND_ASSIGN(
+ auto language_segmenter,
+ language_segmenter_factory::Create(std::move(options)));
ICING_ASSERT_OK_AND_ASSIGN(auto iterator, language_segmenter->Segment(text));
EXPECT_THAT(iterator->ResetToTermEndingBefore(/*offset=*/text.length()),
diff --git a/icing/tokenization/language-segmenter_benchmark.cc b/icing/tokenization/language-segmenter_benchmark.cc
index 49ddfca..bd86169 100644
--- a/icing/tokenization/language-segmenter_benchmark.cc
+++ b/icing/tokenization/language-segmenter_benchmark.cc
@@ -20,6 +20,7 @@
#include "icing/tokenization/language-segmenter-factory.h"
#include "icing/tokenization/language-segmenter.h"
#include "icing/transform/normalizer.h"
+#include "unicode/uloc.h"
// Run on a Linux workstation:
// $ blaze build -c opt --dynamic_mode=off --copt=-gmlt
@@ -59,8 +60,9 @@ void BM_SegmentNoSpace(benchmark::State& state) {
GetTestFilePath("icing/icu.dat")));
}
+ language_segmenter_factory::SegmenterOptions options(ULOC_US);
std::unique_ptr<LanguageSegmenter> language_segmenter =
- language_segmenter_factory::Create().ValueOrDie();
+ language_segmenter_factory::Create(std::move(options)).ValueOrDie();
std::string input_string(state.range(0), 'A');
@@ -95,8 +97,9 @@ void BM_SegmentWithSpaces(benchmark::State& state) {
GetTestFilePath("icing/icu.dat")));
}
+ language_segmenter_factory::SegmenterOptions options(ULOC_US);
std::unique_ptr<LanguageSegmenter> language_segmenter =
- language_segmenter_factory::Create().ValueOrDie();
+ language_segmenter_factory::Create(std::move(options)).ValueOrDie();
std::string input_string(state.range(0), 'A');
for (int i = 1; i < input_string.length(); i += 2) {
@@ -134,8 +137,9 @@ void BM_SegmentCJK(benchmark::State& state) {
GetTestFilePath("icing/icu.dat")));
}
+ language_segmenter_factory::SegmenterOptions options(ULOC_US);
std::unique_ptr<LanguageSegmenter> language_segmenter =
- language_segmenter_factory::Create().ValueOrDie();
+ language_segmenter_factory::Create(std::move(options)).ValueOrDie();
std::string input_string;
while (input_string.length() < state.range(0)) {
diff --git a/icing/tokenization/plain-tokenizer_test.cc b/icing/tokenization/plain-tokenizer_test.cc
index f2fc678..d9db75a 100644
--- a/icing/tokenization/plain-tokenizer_test.cc
+++ b/icing/tokenization/plain-tokenizer_test.cc
@@ -24,6 +24,7 @@
#include "icing/testing/test-data.h"
#include "icing/tokenization/language-segmenter-factory.h"
#include "icing/tokenization/tokenizer-factory.h"
+#include "unicode/uloc.h"
namespace icing {
namespace lib {
@@ -49,8 +50,10 @@ TEST_F(PlainTokenizerTest, CreationWithNullPointerShouldFail) {
}
TEST_F(PlainTokenizerTest, Simple) {
- ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter,
- language_segmenter_factory::Create());
+ language_segmenter_factory::SegmenterOptions options(ULOC_US);
+ ICING_ASSERT_OK_AND_ASSIGN(
+ auto language_segmenter,
+ language_segmenter_factory::Create(std::move(options)));
ICING_ASSERT_OK_AND_ASSIGN(
std::unique_ptr<Tokenizer> plain_tokenizer,
tokenizer_factory::CreateIndexingTokenizer(
@@ -81,8 +84,10 @@ TEST_F(PlainTokenizerTest, Simple) {
}
TEST_F(PlainTokenizerTest, Whitespace) {
- ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter,
- language_segmenter_factory::Create());
+ language_segmenter_factory::SegmenterOptions options(ULOC_US);
+ ICING_ASSERT_OK_AND_ASSIGN(
+ auto language_segmenter,
+ language_segmenter_factory::Create(std::move(options)));
ICING_ASSERT_OK_AND_ASSIGN(
std::unique_ptr<Tokenizer> plain_tokenizer,
tokenizer_factory::CreateIndexingTokenizer(
@@ -107,8 +112,10 @@ TEST_F(PlainTokenizerTest, Whitespace) {
}
TEST_F(PlainTokenizerTest, Punctuation) {
- ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter,
- language_segmenter_factory::Create());
+ language_segmenter_factory::SegmenterOptions options(ULOC_US);
+ ICING_ASSERT_OK_AND_ASSIGN(
+ auto language_segmenter,
+ language_segmenter_factory::Create(std::move(options)));
ICING_ASSERT_OK_AND_ASSIGN(
std::unique_ptr<Tokenizer> plain_tokenizer,
tokenizer_factory::CreateIndexingTokenizer(
@@ -136,8 +143,10 @@ TEST_F(PlainTokenizerTest, Punctuation) {
}
TEST_F(PlainTokenizerTest, SpecialCharacters) {
- ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter,
- language_segmenter_factory::Create());
+ language_segmenter_factory::SegmenterOptions options(ULOC_US);
+ ICING_ASSERT_OK_AND_ASSIGN(
+ auto language_segmenter,
+ language_segmenter_factory::Create(std::move(options)));
ICING_ASSERT_OK_AND_ASSIGN(
std::unique_ptr<Tokenizer> plain_tokenizer,
tokenizer_factory::CreateIndexingTokenizer(
@@ -157,8 +166,10 @@ TEST_F(PlainTokenizerTest, SpecialCharacters) {
}
TEST_F(PlainTokenizerTest, CJKT) {
- ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter,
- language_segmenter_factory::Create());
+ language_segmenter_factory::SegmenterOptions options(ULOC_US);
+ ICING_ASSERT_OK_AND_ASSIGN(
+ auto language_segmenter,
+ language_segmenter_factory::Create(std::move(options)));
ICING_ASSERT_OK_AND_ASSIGN(
std::unique_ptr<Tokenizer> plain_tokenizer,
tokenizer_factory::CreateIndexingTokenizer(
@@ -209,8 +220,10 @@ TEST_F(PlainTokenizerTest, CJKT) {
}
TEST_F(PlainTokenizerTest, ResetToTokenAfterSimple) {
- ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter,
- language_segmenter_factory::Create());
+ language_segmenter_factory::SegmenterOptions options(ULOC_US);
+ ICING_ASSERT_OK_AND_ASSIGN(
+ auto language_segmenter,
+ language_segmenter_factory::Create(std::move(options)));
ICING_ASSERT_OK_AND_ASSIGN(
std::unique_ptr<Tokenizer> plain_tokenizer,
tokenizer_factory::CreateIndexingTokenizer(
@@ -226,8 +239,10 @@ TEST_F(PlainTokenizerTest, ResetToTokenAfterSimple) {
}
TEST_F(PlainTokenizerTest, ResetToTokenBeforeSimple) {
- ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter,
- language_segmenter_factory::Create());
+ language_segmenter_factory::SegmenterOptions options(ULOC_US);
+ ICING_ASSERT_OK_AND_ASSIGN(
+ auto language_segmenter,
+ language_segmenter_factory::Create(std::move(options)));
ICING_ASSERT_OK_AND_ASSIGN(
std::unique_ptr<Tokenizer> plain_tokenizer,
tokenizer_factory::CreateIndexingTokenizer(
@@ -243,8 +258,10 @@ TEST_F(PlainTokenizerTest, ResetToTokenBeforeSimple) {
}
TEST_F(PlainTokenizerTest, ResetToTokenAfter) {
- ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter,
- language_segmenter_factory::Create());
+ language_segmenter_factory::SegmenterOptions options(ULOC_US);
+ ICING_ASSERT_OK_AND_ASSIGN(
+ auto language_segmenter,
+ language_segmenter_factory::Create(std::move(options)));
ICING_ASSERT_OK_AND_ASSIGN(
std::unique_ptr<Tokenizer> plain_tokenizer,
tokenizer_factory::CreateIndexingTokenizer(
@@ -291,8 +308,10 @@ TEST_F(PlainTokenizerTest, ResetToTokenAfter) {
}
TEST_F(PlainTokenizerTest, ResetToTokenBefore) {
- ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter,
- language_segmenter_factory::Create());
+ language_segmenter_factory::SegmenterOptions options(ULOC_US);
+ ICING_ASSERT_OK_AND_ASSIGN(
+ auto language_segmenter,
+ language_segmenter_factory::Create(std::move(options)));
ICING_ASSERT_OK_AND_ASSIGN(
std::unique_ptr<Tokenizer> plain_tokenizer,
tokenizer_factory::CreateIndexingTokenizer(
diff --git a/icing/tokenization/raw-query-tokenizer_test.cc b/icing/tokenization/raw-query-tokenizer_test.cc
index 351f7c1..9b71e8a 100644
--- a/icing/tokenization/raw-query-tokenizer_test.cc
+++ b/icing/tokenization/raw-query-tokenizer_test.cc
@@ -22,6 +22,7 @@
#include "icing/tokenization/language-segmenter-factory.h"
#include "icing/tokenization/tokenizer-factory.h"
#include "icing/tokenization/tokenizer.h"
+#include "unicode/uloc.h"
namespace icing {
namespace lib {
@@ -46,8 +47,10 @@ TEST_F(RawQueryTokenizerTest, CreationWithNullPointerShouldFail) {
}
TEST_F(RawQueryTokenizerTest, Simple) {
- ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter,
- language_segmenter_factory::Create());
+ language_segmenter_factory::SegmenterOptions options(ULOC_US);
+ ICING_ASSERT_OK_AND_ASSIGN(
+ auto language_segmenter,
+ language_segmenter_factory::Create(std::move(options)));
ICING_ASSERT_OK_AND_ASSIGN(
std::unique_ptr<Tokenizer> raw_query_tokenizer,
tokenizer_factory::CreateQueryTokenizer(tokenizer_factory::RAW_QUERY,
@@ -59,8 +62,10 @@ TEST_F(RawQueryTokenizerTest, Simple) {
}
TEST_F(RawQueryTokenizerTest, Parentheses) {
- ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter,
- language_segmenter_factory::Create());
+ language_segmenter_factory::SegmenterOptions options(ULOC_US);
+ ICING_ASSERT_OK_AND_ASSIGN(
+ auto language_segmenter,
+ language_segmenter_factory::Create(std::move(options)));
ICING_ASSERT_OK_AND_ASSIGN(
std::unique_ptr<Tokenizer> raw_query_tokenizer,
tokenizer_factory::CreateQueryTokenizer(tokenizer_factory::RAW_QUERY,
@@ -159,8 +164,10 @@ TEST_F(RawQueryTokenizerTest, Parentheses) {
}
TEST_F(RawQueryTokenizerTest, Exclustion) {
- ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter,
- language_segmenter_factory::Create());
+ language_segmenter_factory::SegmenterOptions options(ULOC_US);
+ ICING_ASSERT_OK_AND_ASSIGN(
+ auto language_segmenter,
+ language_segmenter_factory::Create(std::move(options)));
ICING_ASSERT_OK_AND_ASSIGN(
std::unique_ptr<Tokenizer> raw_query_tokenizer,
tokenizer_factory::CreateQueryTokenizer(tokenizer_factory::RAW_QUERY,
@@ -226,8 +233,10 @@ TEST_F(RawQueryTokenizerTest, Exclustion) {
}
TEST_F(RawQueryTokenizerTest, PropertyRestriction) {
- ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter,
- language_segmenter_factory::Create());
+ language_segmenter_factory::SegmenterOptions options(ULOC_US);
+ ICING_ASSERT_OK_AND_ASSIGN(
+ auto language_segmenter,
+ language_segmenter_factory::Create(std::move(options)));
ICING_ASSERT_OK_AND_ASSIGN(
std::unique_ptr<Tokenizer> raw_query_tokenizer,
tokenizer_factory::CreateQueryTokenizer(tokenizer_factory::RAW_QUERY,
@@ -314,8 +323,10 @@ TEST_F(RawQueryTokenizerTest, PropertyRestriction) {
}
TEST_F(RawQueryTokenizerTest, OR) {
- ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter,
- language_segmenter_factory::Create());
+ language_segmenter_factory::SegmenterOptions options(ULOC_US);
+ ICING_ASSERT_OK_AND_ASSIGN(
+ auto language_segmenter,
+ language_segmenter_factory::Create(std::move(options)));
ICING_ASSERT_OK_AND_ASSIGN(
std::unique_ptr<Tokenizer> raw_query_tokenizer,
tokenizer_factory::CreateQueryTokenizer(tokenizer_factory::RAW_QUERY,
@@ -435,8 +446,10 @@ TEST_F(RawQueryTokenizerTest, OR) {
// CJKT are treated the same way by language segmenter and raw tokenizer, so
// here we test Chinese and Japanese to represent CJKT.
TEST_F(RawQueryTokenizerTest, CJKT) {
- ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter,
- language_segmenter_factory::Create());
+ language_segmenter_factory::SegmenterOptions options(ULOC_US);
+ ICING_ASSERT_OK_AND_ASSIGN(
+ auto language_segmenter,
+ language_segmenter_factory::Create(std::move(options)));
ICING_ASSERT_OK_AND_ASSIGN(
std::unique_ptr<Tokenizer> raw_query_tokenizer,
tokenizer_factory::CreateQueryTokenizer(tokenizer_factory::RAW_QUERY,
@@ -488,8 +501,10 @@ TEST_F(RawQueryTokenizerTest, CJKT) {
// Raw tokenizer identifies all characters that it doesn't know as OTHER type,
// so we can choose comma "," to represent all OTHER characters.
TEST_F(RawQueryTokenizerTest, OtherChars) {
- ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter,
- language_segmenter_factory::Create());
+ language_segmenter_factory::SegmenterOptions options(ULOC_US);
+ ICING_ASSERT_OK_AND_ASSIGN(
+ auto language_segmenter,
+ language_segmenter_factory::Create(std::move(options)));
ICING_ASSERT_OK_AND_ASSIGN(
std::unique_ptr<Tokenizer> raw_query_tokenizer,
tokenizer_factory::CreateQueryTokenizer(tokenizer_factory::RAW_QUERY,
@@ -533,8 +548,10 @@ TEST_F(RawQueryTokenizerTest, OtherChars) {
}
TEST_F(RawQueryTokenizerTest, Mix) {
- ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter,
- language_segmenter_factory::Create());
+ language_segmenter_factory::SegmenterOptions options(ULOC_US);
+ ICING_ASSERT_OK_AND_ASSIGN(
+ auto language_segmenter,
+ language_segmenter_factory::Create(std::move(options)));
ICING_ASSERT_OK_AND_ASSIGN(
std::unique_ptr<Tokenizer> raw_query_tokenizer,
tokenizer_factory::CreateQueryTokenizer(tokenizer_factory::RAW_QUERY,
diff --git a/icing/tokenization/reverse_jni/reverse-jni-language-segmenter-factory.cc b/icing/tokenization/reverse_jni/reverse-jni-language-segmenter-factory.cc
index f79bc68..db973f3 100644
--- a/icing/tokenization/reverse_jni/reverse-jni-language-segmenter-factory.cc
+++ b/icing/tokenization/reverse_jni/reverse-jni-language-segmenter-factory.cc
@@ -12,10 +12,12 @@
// See the License for the specific language governing permissions and
// limitations under the License.
+#include "icing/jni/jni-cache.h"
#include "icing/absl_ports/canonical_errors.h"
#include "icing/tokenization/language-segmenter-factory.h"
#include "icing/tokenization/reverse_jni/reverse-jni-language-segmenter.h"
#include "icing/util/logging.h"
+#include "unicode/uloc.h"
namespace icing {
namespace lib {
diff --git a/icing/tokenization/reverse_jni/reverse-jni-language-segmenter-test.cc b/icing/tokenization/reverse_jni/reverse-jni-language-segmenter-test.cc
index a01d944..4b50231 100644
--- a/icing/tokenization/reverse_jni/reverse-jni-language-segmenter-test.cc
+++ b/icing/tokenization/reverse_jni/reverse-jni-language-segmenter-test.cc
@@ -443,6 +443,74 @@ TEST_P(ReverseJniLanguageSegmenterTest, NotCopyStrings) {
EXPECT_THAT(word2_address, Eq(word2_result_address));
}
+TEST_P(ReverseJniLanguageSegmenterTest, NewIteratorResetToStart) {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ auto segmenter, language_segmenter_factory::Create(
+ GetSegmenterOptions(GetLocale(), jni_cache_.get())));
+ constexpr std::string_view kText = "How are you你好吗お元気ですか";
+ ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<LanguageSegmenter::Iterator> itr,
+ segmenter->Segment(kText));
+
+ // String: "How are you你好吗お元気ですか"
+ // ^ ^^ ^^ ^ ^ ^ ^ ^ ^
+ // Bytes: 0 3 4 7 8 11 17 20 23 29 35
+ EXPECT_THAT(itr->ResetToStart(), IsOkAndHolds(Eq(0)));
+ EXPECT_THAT(itr->GetTerm(), Eq("How"));
+}
+
+TEST_P(ReverseJniLanguageSegmenterTest, IteratorOneAdvanceResetToStart) {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ auto segmenter, language_segmenter_factory::Create(
+ GetSegmenterOptions(GetLocale(), jni_cache_.get())));
+ constexpr std::string_view kText = "How are you你好吗お元気ですか";
+ ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<LanguageSegmenter::Iterator> itr,
+ segmenter->Segment(kText));
+
+ // String: "How are you你好吗お元気ですか"
+ // ^ ^^ ^^ ^ ^ ^ ^ ^ ^
+ // Bytes: 0 3 4 7 8 11 17 20 23 29 35
+ ASSERT_TRUE(itr->Advance()); // itr points to 'How'
+ EXPECT_THAT(itr->ResetToStart(), IsOkAndHolds(Eq(0)));
+ EXPECT_THAT(itr->GetTerm(), Eq("How"));
+}
+
+TEST_P(ReverseJniLanguageSegmenterTest, IteratorMultipleAdvancesResetToStart) {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ auto segmenter, language_segmenter_factory::Create(
+ GetSegmenterOptions(GetLocale(), jni_cache_.get())));
+ constexpr std::string_view kText = "How are you你好吗お元気ですか";
+ ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<LanguageSegmenter::Iterator> itr,
+ segmenter->Segment(kText));
+
+ // String: "How are you你好吗お元気ですか"
+ // ^ ^^ ^^ ^ ^ ^ ^ ^ ^
+ // Bytes: 0 3 4 7 8 11 17 20 23 29 35
+ ASSERT_TRUE(itr->Advance());
+ ASSERT_TRUE(itr->Advance());
+ ASSERT_TRUE(itr->Advance());
+ ASSERT_TRUE(itr->Advance()); // itr points to ' '
+ EXPECT_THAT(itr->ResetToStart(), IsOkAndHolds(Eq(0)));
+ EXPECT_THAT(itr->GetTerm(), Eq("How"));
+}
+
+TEST_P(ReverseJniLanguageSegmenterTest, IteratorDoneResetToStart) {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ auto segmenter, language_segmenter_factory::Create(
+ GetSegmenterOptions(GetLocale(), jni_cache_.get())));
+ constexpr std::string_view kText = "How are you你好吗お元気ですか";
+ ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<LanguageSegmenter::Iterator> itr,
+ segmenter->Segment(kText));
+
+ // String: "How are you你好吗お元気ですか"
+ // ^ ^^ ^^ ^ ^ ^ ^ ^ ^
+ // Bytes: 0 3 4 7 8 11 17 20 23 29 35
+ while (itr->Advance()) {
+ // Do nothing.
+ }
+ EXPECT_THAT(itr->ResetToStart(), IsOkAndHolds(Eq(0)));
+ EXPECT_THAT(itr->GetTerm(), Eq("How"));
+}
+
TEST_P(ReverseJniLanguageSegmenterTest, ResetToTermAfterOutOfBounds) {
ICING_ASSERT_OK_AND_ASSIGN(
auto segmenter, language_segmenter_factory::Create(
@@ -1060,6 +1128,21 @@ TEST_P(ReverseJniLanguageSegmenterTest, ThaiResetToTermBefore) {
EXPECT_THAT(itr->GetTerm(), Eq("ไป"));
}
+TEST_P(ReverseJniLanguageSegmenterTest, QuerySyntax) {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ auto language_segmenter,
+ language_segmenter_factory::Create(
+ GetSegmenterOptions(GetLocale(), jni_cache_.get())));
+ // Validates that query syntax characters are returned as their own terms
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::vector<std::string_view> terms,
+ language_segmenter->GetAllTerms(
+ "(-term1 OR term2) AND property1.subproperty2:term3"));
+ EXPECT_THAT(terms, ElementsAre("(", "-", "term1", " ", "OR", " ", "term2",
+ ")", " ", "AND", " ", "property1", ".",
+ "subproperty2", ":", "term3"));
+}
+
INSTANTIATE_TEST_SUITE_P(
LocaleName, ReverseJniLanguageSegmenterTest,
testing::Values(ULOC_US, ULOC_UK, ULOC_CANADA, ULOC_CANADA_FRENCH,
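The four new ResetToStart tests pin down a single contract: no matter where the iterator currently points (freshly created, advanced once, advanced several times, or exhausted), ResetToStart() rewinds to the first term and returns its byte offset. A condensed sketch of that contract, using the iterator API exactly as the tests above exercise it:

    ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<LanguageSegmenter::Iterator> itr,
                               segmenter->Segment("How are you"));
    while (itr->Advance()) {
      // Exhaust the iterator entirely.
    }
    // Even from the exhausted state, ResetToStart rewinds to offset 0 and
    // makes the first term observable again.
    EXPECT_THAT(itr->ResetToStart(), IsOkAndHolds(Eq(0)));
    EXPECT_THAT(itr->GetTerm(), Eq("How"));
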
diff --git a/icing/tokenization/reverse_jni/reverse-jni-language-segmenter.cc b/icing/tokenization/reverse_jni/reverse-jni-language-segmenter.cc
index 2256022..bb26364 100644
--- a/icing/tokenization/reverse_jni/reverse-jni-language-segmenter.cc
+++ b/icing/tokenization/reverse_jni/reverse-jni-language-segmenter.cc
@@ -24,164 +24,13 @@
#include "icing/absl_ports/canonical_errors.h"
#include "icing/legacy/core/icing-string-util.h"
#include "icing/tokenization/language-segmenter.h"
+#include "icing/util/character-iterator.h"
#include "icing/util/i18n-utils.h"
#include "icing/util/status-macros.h"
namespace icing {
namespace lib {
-namespace {
-
-// Returns the lead byte of the UTF-8 character that includes the byte at
-// current_byte_index within it.
-int GetUTF8StartPosition(std::string_view text, int current_byte_index) {
- while (!i18n_utils::IsLeadUtf8Byte(text[current_byte_index])) {
- --current_byte_index;
- }
- return current_byte_index;
-}
-
-class CharacterIterator {
- public:
- explicit CharacterIterator(std::string_view text)
- : CharacterIterator(text, 0, 0) {}
- CharacterIterator(std::string_view text, int utf8_index, int utf16_index)
- : text_(text), utf8_index_(utf8_index), utf16_index_(utf16_index) {}
-
- // Moves from current position to the character that includes the specified
- // UTF-8 index.
- // REQUIRES: desired_utf8_index <= text_.length()
- // desired_utf8_index is allowed to point one index past the end, but no
- // further.
- bool AdvanceToUtf8(int desired_utf8_index) {
- if (desired_utf8_index > text_.length()) {
- // Enforce the requirement.
- return false;
- }
- // Need to work forwards.
- while (utf8_index_ < desired_utf8_index) {
- UChar32 uchar32 =
- i18n_utils::GetUChar32At(text_.data(), text_.length(), utf8_index_);
- if (uchar32 == i18n_utils::kInvalidUChar32) {
- // Unable to retrieve a valid UTF-32 character at the previous position.
- return false;
- }
- int utf8_length = i18n_utils::GetUtf8Length(uchar32);
- if (utf8_index_ + utf8_length > desired_utf8_index) {
- // Ah! Don't go too far!
- break;
- }
- utf8_index_ += utf8_length;
- utf16_index_ += i18n_utils::GetUtf16Length(uchar32);
- }
- return true;
- }
-
- // Moves from current position to the character that includes the specified
- // UTF-8 index.
- // REQUIRES: 0 <= desired_utf8_index
- bool RewindToUtf8(int desired_utf8_index) {
- if (desired_utf8_index < 0) {
- // Enforce the requirement.
- return false;
- }
- // Need to work backwards.
- while (utf8_index_ > desired_utf8_index) {
- --utf8_index_;
- utf8_index_ = GetUTF8StartPosition(text_, utf8_index_);
- if (utf8_index_ < 0) {
- // Somehow, there wasn't a single UTF-8 lead byte at
- // requested_byte_index or an earlier byte.
- return false;
- }
- // We've found the start of a unicode char!
- UChar32 uchar32 =
- i18n_utils::GetUChar32At(text_.data(), text_.length(), utf8_index_);
- if (uchar32 == i18n_utils::kInvalidUChar32) {
- // Unable to retrieve a valid UTF-32 character at the previous position.
- return false;
- }
- utf16_index_ -= i18n_utils::GetUtf16Length(uchar32);
- }
- return true;
- }
-
- // Advances current position to desired_utf16_index.
- // REQUIRES: desired_utf16_index <= text_.utf16_length()
- // desired_utf16_index is allowed to point one index past the end, but no
- // further.
- bool AdvanceToUtf16(int desired_utf16_index) {
- while (utf16_index_ < desired_utf16_index) {
- UChar32 uchar32 =
- i18n_utils::GetUChar32At(text_.data(), text_.length(), utf8_index_);
- if (uchar32 == i18n_utils::kInvalidUChar32) {
- // Unable to retrieve a valid UTF-32 character at the previous position.
- return false;
- }
- int utf16_length = i18n_utils::GetUtf16Length(uchar32);
- if (utf16_index_ + utf16_length > desired_utf16_index) {
- // Ah! Don't go too far!
- break;
- }
- int utf8_length = i18n_utils::GetUtf8Length(uchar32);
- if (utf8_index_ + utf8_length > text_.length()) {
- // Enforce the requirement.
- return false;
- }
- utf8_index_ += utf8_length;
- utf16_index_ += utf16_length;
- }
- return true;
- }
-
- // Rewinds current position to desired_utf16_index.
- // REQUIRES: 0 <= desired_utf16_index
- bool RewindToUtf16(int desired_utf16_index) {
- if (desired_utf16_index < 0) {
- return false;
- }
- while (utf16_index_ > desired_utf16_index) {
- --utf8_index_;
- utf8_index_ = GetUTF8StartPosition(text_, utf8_index_);
- // We've found the start of a unicode char!
- UChar32 uchar32 =
- i18n_utils::GetUChar32At(text_.data(), text_.length(), utf8_index_);
- if (uchar32 == i18n_utils::kInvalidUChar32) {
- // Unable to retrieve a valid UTF-32 character at the previous position.
- return false;
- }
- utf16_index_ -= i18n_utils::GetUtf16Length(uchar32);
- }
- return true;
- }
-
- bool IsValidCharacter() const {
- // Rule 1: all ASCII terms will be returned.
- // We know it's a ASCII term by checking the first char.
- if (i18n_utils::IsAscii(text_[utf8_index_])) {
- return true;
- }
-
- // Rule 2: for non-ASCII terms, only the alphabetic terms are returned.
- // We know it's an alphabetic term by checking the first unicode character.
- if (i18n_utils::IsAlphabeticAt(text_, utf8_index_)) {
- return true;
- }
-
- return false;
- }
-
- int utf8_index() const { return utf8_index_; }
- int utf16_index() const { return utf16_index_; }
-
- private:
- std::string_view text_;
- int utf8_index_;
- int utf16_index_;
-};
-
-} // namespace
-
class ReverseJniLanguageSegmenterIterator : public LanguageSegmenter::Iterator {
public:
explicit ReverseJniLanguageSegmenterIterator(
@@ -229,7 +78,7 @@ class ReverseJniLanguageSegmenterIterator : public LanguageSegmenter::Iterator {
// Check if the current term is valid. We consider any term valid if its
// first character is valid. If it's not valid, then we need to advance to
// the next term.
- if (term_start_.IsValidCharacter()) {
+ if (IsValidTerm()) {
return true;
}
return Advance();
@@ -382,8 +231,7 @@ class ReverseJniLanguageSegmenterIterator : public LanguageSegmenter::Iterator {
// 4. The start and end indices point to a segment, but we need to ensure
// that this segment is 1) valid and 2) ends before offset. Otherwise, we'll
// need a segment prior to this one.
- if (term_end_exclusive_.utf8_index() > offset ||
- !term_start_.IsValidCharacter()) {
+ if (term_end_exclusive_.utf8_index() > offset || !IsValidTerm()) {
return ResetToTermEndingBefore(term_start_.utf8_index());
}
return term_start_.utf8_index();
@@ -414,6 +262,21 @@ class ReverseJniLanguageSegmenterIterator : public LanguageSegmenter::Iterator {
/*utf16_index=*/ReverseJniBreakIterator::kDone);
}
+ bool IsValidTerm() const {
+ // Rule 1: all ASCII terms will be returned.
+ // We know it's an ASCII term by checking the first char.
+ if (i18n_utils::IsAscii(text_[term_start_.utf8_index()])) {
+ return true;
+ }
+
+ // Rule 2: for non-ASCII terms, only the alphabetic terms are returned.
+ // We know it's an alphabetic term by checking the first unicode character.
+ if (i18n_utils::IsAlphabeticAt(text_, term_start_.utf8_index())) {
+ return true;
+ }
+ return false;
+ }
+
// All of ReverseJniBreakIterator's functions return UTF-16 boundaries. So
// this class needs to maintain state to convert between UTF-16 and UTF-8.
std::unique_ptr<ReverseJniBreakIterator> break_iterator_;
diff --git a/icing/tokenization/simple/space-language-segmenter_test.cc b/icing/tokenization/simple/space-language-segmenter_test.cc
index 8ed38b2..6c5e3f6 100644
--- a/icing/tokenization/simple/space-language-segmenter_test.cc
+++ b/icing/tokenization/simple/space-language-segmenter_test.cc
@@ -18,6 +18,7 @@
#include "icing/testing/common-matchers.h"
#include "icing/tokenization/language-segmenter-factory.h"
#include "icing/tokenization/language-segmenter.h"
+#include "unicode/uloc.h"
namespace icing {
namespace lib {
@@ -28,21 +29,27 @@ using ::testing::Eq;
using ::testing::IsEmpty;
TEST(SpaceLanguageSegmenterTest, EmptyText) {
- ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter,
- language_segmenter_factory::Create());
+ language_segmenter_factory::SegmenterOptions options(ULOC_US);
+ ICING_ASSERT_OK_AND_ASSIGN(
+ auto language_segmenter,
+ language_segmenter_factory::Create(std::move(options)));
EXPECT_THAT(language_segmenter->GetAllTerms(""), IsOkAndHolds(IsEmpty()));
}
TEST(SpaceLanguageSegmenterTest, SimpleText) {
- ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter,
- language_segmenter_factory::Create());
+ language_segmenter_factory::SegmenterOptions options(ULOC_US);
+ ICING_ASSERT_OK_AND_ASSIGN(
+ auto language_segmenter,
+ language_segmenter_factory::Create(std::move(options)));
EXPECT_THAT(language_segmenter->GetAllTerms("Hello World"),
IsOkAndHolds(ElementsAre("Hello", " ", "World")));
}
TEST(SpaceLanguageSegmenterTest, Punctuation) {
- ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter,
- language_segmenter_factory::Create());
+ language_segmenter_factory::SegmenterOptions options(ULOC_US);
+ ICING_ASSERT_OK_AND_ASSIGN(
+ auto language_segmenter,
+ language_segmenter_factory::Create(std::move(options)));
EXPECT_THAT(language_segmenter->GetAllTerms("Hello, World!!!"),
IsOkAndHolds(ElementsAre("Hello,", " ", "World!!!")));
@@ -55,8 +62,10 @@ TEST(SpaceLanguageSegmenterTest, Punctuation) {
}
TEST(SpaceLanguageSegmenterTest, Alphanumeric) {
- ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter,
- language_segmenter_factory::Create());
+ language_segmenter_factory::SegmenterOptions options(ULOC_US);
+ ICING_ASSERT_OK_AND_ASSIGN(
+ auto language_segmenter,
+ language_segmenter_factory::Create(std::move(options)));
// Alphanumeric terms are allowed
EXPECT_THAT(language_segmenter->GetAllTerms("Se7en A4 3a"),
@@ -64,8 +73,10 @@ TEST(SpaceLanguageSegmenterTest, Alphanumeric) {
}
TEST(SpaceLanguageSegmenterTest, Number) {
- ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter,
- language_segmenter_factory::Create());
+ language_segmenter_factory::SegmenterOptions options(ULOC_US);
+ ICING_ASSERT_OK_AND_ASSIGN(
+ auto language_segmenter,
+ language_segmenter_factory::Create(std::move(options)));
// Alphanumeric terms are allowed
EXPECT_THAT(
@@ -80,8 +91,10 @@ TEST(SpaceLanguageSegmenterTest, Number) {
}
TEST(SpaceLanguageSegmenterTest, ContinuousWhitespaces) {
- ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter,
- language_segmenter_factory::Create());
+ language_segmenter_factory::SegmenterOptions options(ULOC_US);
+ ICING_ASSERT_OK_AND_ASSIGN(
+ auto language_segmenter,
+ language_segmenter_factory::Create(std::move(options)));
// Multiple continuous whitespaces are treated as one.
const int kNumSeparators = 256;
@@ -92,8 +105,10 @@ TEST(SpaceLanguageSegmenterTest, ContinuousWhitespaces) {
}
TEST(SpaceLanguageSegmenterTest, NotCopyStrings) {
- ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter,
- language_segmenter_factory::Create());
+ language_segmenter_factory::SegmenterOptions options(ULOC_US);
+ ICING_ASSERT_OK_AND_ASSIGN(
+ auto language_segmenter,
+ language_segmenter_factory::Create(std::move(options)));
// Validates that the input strings are not copied
const std::string text = "Hello World";
const char* word1_address = text.c_str();
diff --git a/icing/util/character-iterator.cc b/icing/util/character-iterator.cc
new file mode 100644
index 0000000..3707f95
--- /dev/null
+++ b/icing/util/character-iterator.cc
@@ -0,0 +1,127 @@
+// Copyright (C) 2019 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "icing/util/character-iterator.h"
+
+namespace icing {
+namespace lib {
+
+namespace {
+
+// Returns the index of the lead byte of the UTF-8 character that includes
+// the byte at current_byte_index within it.
+int GetUTF8StartPosition(std::string_view text, int current_byte_index) {
+ while (!i18n_utils::IsLeadUtf8Byte(text[current_byte_index])) {
+ --current_byte_index;
+ }
+ return current_byte_index;
+}
+
+} // namespace
+
+bool CharacterIterator::AdvanceToUtf8(int desired_utf8_index) {
+ if (desired_utf8_index > text_.length()) {
+ // Enforce the requirement.
+ return false;
+ }
+ // Need to work forwards.
+ while (utf8_index_ < desired_utf8_index) {
+ UChar32 uchar32 =
+ i18n_utils::GetUChar32At(text_.data(), text_.length(), utf8_index_);
+ if (uchar32 == i18n_utils::kInvalidUChar32) {
+ // Unable to retrieve a valid UTF-32 character at the current position.
+ return false;
+ }
+ int utf8_length = i18n_utils::GetUtf8Length(uchar32);
+ if (utf8_index_ + utf8_length > desired_utf8_index) {
+ // Ah! Don't go too far!
+ break;
+ }
+ utf8_index_ += utf8_length;
+ utf16_index_ += i18n_utils::GetUtf16Length(uchar32);
+ }
+ return true;
+}
+
+bool CharacterIterator::RewindToUtf8(int desired_utf8_index) {
+ if (desired_utf8_index < 0) {
+ // Enforce the requirement.
+ return false;
+ }
+ // Need to work backwards.
+ while (utf8_index_ > desired_utf8_index) {
+ --utf8_index_;
+ utf8_index_ = GetUTF8StartPosition(text_, utf8_index_);
+ if (utf8_index_ < 0) {
+ // Somehow, there wasn't a single UTF-8 lead byte at the requested
+ // byte index or an earlier byte.
+ return false;
+ }
+ // We've found the start of a unicode char!
+ UChar32 uchar32 =
+ i18n_utils::GetUChar32At(text_.data(), text_.length(), utf8_index_);
+ if (uchar32 == i18n_utils::kInvalidUChar32) {
+ // Unable to retrieve a valid UTF-32 character at the previous position.
+ return false;
+ }
+ utf16_index_ -= i18n_utils::GetUtf16Length(uchar32);
+ }
+ return true;
+}
+
+bool CharacterIterator::AdvanceToUtf16(int desired_utf16_index) {
+ while (utf16_index_ < desired_utf16_index) {
+ UChar32 uchar32 =
+ i18n_utils::GetUChar32At(text_.data(), text_.length(), utf8_index_);
+ if (uchar32 == i18n_utils::kInvalidUChar32) {
+ // Unable to retrieve a valid UTF-32 character at the current position.
+ return false;
+ }
+ int utf16_length = i18n_utils::GetUtf16Length(uchar32);
+ if (utf16_index_ + utf16_length > desired_utf16_index) {
+ // Ah! Don't go too far!
+ break;
+ }
+ int utf8_length = i18n_utils::GetUtf8Length(uchar32);
+ if (utf8_index_ + utf8_length > text_.length()) {
+ // Enforce the requirement.
+ return false;
+ }
+ utf8_index_ += utf8_length;
+ utf16_index_ += utf16_length;
+ }
+ return true;
+}
+
+bool CharacterIterator::RewindToUtf16(int desired_utf16_index) {
+ if (desired_utf16_index < 0) {
+ return false;
+ }
+ while (utf16_index_ > desired_utf16_index) {
+ --utf8_index_;
+ utf8_index_ = GetUTF8StartPosition(text_, utf8_index_);
+ // We've found the start of a unicode char!
+ UChar32 uchar32 =
+ i18n_utils::GetUChar32At(text_.data(), text_.length(), utf8_index_);
+ if (uchar32 == i18n_utils::kInvalidUChar32) {
+ // Unable to retrieve a valid UTF-32 character at the previous position.
+ return false;
+ }
+ utf16_index_ -= i18n_utils::GetUtf16Length(uchar32);
+ }
+ return true;
+}
+
+} // namespace lib
+} // namespace icing
diff --git a/icing/util/character-iterator.h b/icing/util/character-iterator.h
new file mode 100644
index 0000000..22de6c5
--- /dev/null
+++ b/icing/util/character-iterator.h
@@ -0,0 +1,70 @@
+// Copyright (C) 2019 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef ICING_UTIL_CHARACTER_ITERATOR_H_
+#define ICING_UTIL_CHARACTER_ITERATOR_H_
+
+#include "icing/util/i18n-utils.h"
+
+namespace icing {
+namespace lib {
+
+class CharacterIterator {
+ public:
+ explicit CharacterIterator(std::string_view text)
+ : CharacterIterator(text, 0, 0) {}
+
+ CharacterIterator(std::string_view text, int utf8_index, int utf16_index)
+ : text_(text), utf8_index_(utf8_index), utf16_index_(utf16_index) {}
+
+ // Moves from current position to the character that includes the specified
+ // UTF-8 index.
+ // REQUIRES: desired_utf8_index <= text_.length()
+ // desired_utf8_index is allowed to point one index past the end, but no
+ // further.
+ bool AdvanceToUtf8(int desired_utf8_index);
+
+ // Moves from current position to the character that includes the specified
+ // UTF-8 index.
+ // REQUIRES: 0 <= desired_utf8_index
+ bool RewindToUtf8(int desired_utf8_index);
+
+ // Advances current position to desired_utf16_index.
+ // REQUIRES: desired_utf16_index <= text_.utf16_length()
+ // desired_utf16_index is allowed to point one index past the end, but no
+ // further.
+ bool AdvanceToUtf16(int desired_utf16_index);
+
+ // Rewinds current position to desired_utf16_index.
+ // REQUIRES: 0 <= desired_utf16_index
+ bool RewindToUtf16(int desired_utf16_index);
+
+ int utf8_index() const { return utf8_index_; }
+ int utf16_index() const { return utf16_index_; }
+
+ bool operator==(const CharacterIterator& rhs) const {
+ return text_ == rhs.text_ && utf8_index_ == rhs.utf8_index_ &&
+ utf16_index_ == rhs.utf16_index_;
+ }
+
+ private:
+ std::string_view text_;
+ int utf8_index_;
+ int utf16_index_;
+};
+
+} // namespace lib
+} // namespace icing
+
+#endif // ICING_UTIL_CHARACTER_ITERATOR_H_
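CharacterIterator keeps a UTF-8 byte index and a UTF-16 code-unit index in lockstep, which is exactly what the reverse-JNI segmenter needs to map ReverseJniBreakIterator's UTF-16 boundaries back to UTF-8 offsets. A small usage sketch against the header above ("你" is three UTF-8 bytes but a single UTF-16 code unit):

    #include "icing/util/character-iterator.h"

    std::string_view text = "a你b";    // UTF-8 byte layout: a=0, 你=1..3, b=4
    CharacterIterator iterator(text);  // starts at utf8_index=0, utf16_index=0
    if (iterator.AdvanceToUtf8(/*desired_utf8_index=*/4)) {
      // 'a' is one UTF-16 code unit and 你 is one more, so byte index 4
      // corresponds to UTF-16 index 2.
      int utf16_index = iterator.utf16_index();  // == 2
    }
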
diff --git a/icing/util/i18n-utils.cc b/icing/util/i18n-utils.cc
index 9cf992f..d6754d5 100644
--- a/icing/util/i18n-utils.cc
+++ b/icing/util/i18n-utils.cc
@@ -99,16 +99,17 @@ void SafeTruncateUtf8(std::string* str, int truncate_to_length) {
return;
}
- while (truncate_to_length > 0) {
- if (IsLeadUtf8Byte(str->at(truncate_to_length))) {
- str->resize(truncate_to_length);
- return;
+ str->resize(SafeTruncateUtf8Length(str->c_str(), truncate_to_length));
+}
+
+int SafeTruncateUtf8Length(const char* str, int desired_length) {
+ while (desired_length > 0) {
+ if (IsLeadUtf8Byte(str[desired_length])) {
+ break;
}
- truncate_to_length--;
+ --desired_length;
}
-
- // Truncates to an empty string
- str->resize(0);
+ return desired_length;
}
bool IsAscii(char c) { return U8_IS_SINGLE((uint8_t)c); }
diff --git a/icing/util/i18n-utils.h b/icing/util/i18n-utils.h
index e103bab..82ae828 100644
--- a/icing/util/i18n-utils.h
+++ b/icing/util/i18n-utils.h
@@ -50,6 +50,13 @@ libtextclassifier3::StatusOr<std::u16string> Utf8ToUtf16(
// Returns the char at the given position.
UChar32 GetUChar32At(const char* data, int length, int position);
+// Returns the safe position to truncate a UTF8 string at so that multi-byte
+// UTF8 characters are not cut in the middle. The returned value will always be
+// 0 <= val <= desired_length.
+//
+// REQUIRES: 0 <= desired_length < strlen(str)
+int SafeTruncateUtf8Length(const char* str, int desired_length);
+
// Safely truncates a UTF8 string so that multi-byte UTF8 characters are not cut
// in the middle. The string will be truncated in place.
void SafeTruncateUtf8(std::string* str, int truncate_to_length);
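SafeTruncateUtf8 now delegates to the new SafeTruncateUtf8Length, which backs up from the requested length to the nearest UTF-8 lead byte so that a multi-byte character is never cut in half. A small sketch of what that buys, matching the declaration above ("好" occupies byte indices 5 through 7):

    std::string str = "hello好";  // 8 bytes total; "好" spans indices 5..7
    // A naive resize(6) would split "好" mid-character. The safe length backs
    // up to the lead byte at index 5 instead.
    int safe_length =
        i18n_utils::SafeTruncateUtf8Length(str.c_str(), /*desired_length=*/6);
    str.resize(safe_length);  // safe_length == 5, so str == "hello"
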
diff --git a/java/src/com/google/android/icing/IcingSearchEngine.java b/java/src/com/google/android/icing/IcingSearchEngine.java
index 7be631c..76fa33d 100644
--- a/java/src/com/google/android/icing/IcingSearchEngine.java
+++ b/java/src/com/google/android/icing/IcingSearchEngine.java
@@ -328,6 +328,27 @@ public final class IcingSearchEngine {
}
@NonNull
+ public DeleteResultProto deleteByQuery(@NonNull SearchSpecProto searchSpec) {
+ byte[] deleteResultBytes = nativeDeleteByQuery(nativePointer, searchSpec.toByteArray());
+ if (deleteResultBytes == null) {
+ Log.e(TAG, "Received null DeleteResultProto from native.");
+ return DeleteResultProto.newBuilder()
+ .setStatus(StatusProto.newBuilder().setCode(StatusProto.Code.INTERNAL))
+ .build();
+ }
+
+ try {
+ return DeleteResultProto.parseFrom(
+ deleteResultBytes, EXTENSION_REGISTRY_LITE);
+ } catch (InvalidProtocolBufferException e) {
+ Log.e(TAG, "Error parsing DeleteResultProto.", e);
+ return DeleteResultProto.newBuilder()
+ .setStatus(StatusProto.newBuilder().setCode(StatusProto.Code.INTERNAL))
+ .build();
+ }
+ }
+
+ @NonNull
public PersistToDiskResultProto persistToDisk() {
byte[] persistToDiskResultBytes = nativePersistToDisk(nativePointer);
if (persistToDiskResultBytes == null) {
@@ -438,6 +459,8 @@ public final class IcingSearchEngine {
private static native byte[] nativeDeleteBySchemaType(long nativePointer, String schemaType);
+ private static native byte[] nativeDeleteByQuery(long nativePointer, byte[] searchSpecBytes);
+
private static native byte[] nativePersistToDisk(long nativePointer);
private static native byte[] nativeOptimize(long nativePointer);
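deleteByQuery() forwards to the new nativeDeleteByQuery entry point and, like the other wrappers in this file, maps a null or unparseable native response to an INTERNAL status. On the C++ side, this CL's icing-search-engine.h changes add the corresponding engine method; a hedged sketch, assuming the C++ method is named DeleteByQuery to mirror the JNI binding:

    SearchSpecProto search_spec;
    search_spec.set_query("foo");
    search_spec.set_term_match_type(TermMatchType::PREFIX);

    // Assumed C++ counterpart of the Java deleteByQuery() above.
    DeleteResultProto result = icing.DeleteByQuery(search_spec);
    if (result.status().code() != StatusProto::OK) {
      // No documents matched, or the delete failed; inspect result.status().
    }
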
diff --git a/java/tests/instrumentation/src/com/google/android/icing/IcingSearchEngineTest.java b/java/tests/instrumentation/src/com/google/android/icing/IcingSearchEngineTest.java
index d907d4e..fb77d6e 100644
--- a/java/tests/instrumentation/src/com/google/android/icing/IcingSearchEngineTest.java
+++ b/java/tests/instrumentation/src/com/google/android/icing/IcingSearchEngineTest.java
@@ -335,6 +335,58 @@ public final class IcingSearchEngineTest {
assertThat(getResultProto.getStatus().getCode()).isEqualTo(StatusProto.Code.NOT_FOUND);
}
+
+ @Test
+ public void testDeleteByQuery() throws Exception {
+ IcingSearchEngineOptions options =
+ IcingSearchEngineOptions.newBuilder().setBaseDir(filesDir).build();
+ IcingSearchEngine icing = new IcingSearchEngine(options);
+ assertThat(icing.initialize().getStatus().getCode()).isEqualTo(StatusProto.Code.OK);
+
+ SchemaTypeConfigProto emailTypeConfig = createEmailTypeConfig();
+ SchemaProto schema = SchemaProto.newBuilder().addTypes(emailTypeConfig).build();
+ assertThat(
+ icing
+ .setSchema(schema, /*ignoreErrorsAndDeleteDocuments=*/ false)
+ .getStatus()
+ .getCode())
+ .isEqualTo(StatusProto.Code.OK);
+
+ DocumentProto emailDocument1 =
+ createEmailDocument("namespace", "uri1").toBuilder()
+ .addProperties(PropertyProto.newBuilder().setName("subject").addStringValues("foo"))
+ .build();
+ assertThat(icing.put(emailDocument1).getStatus().getCode()).isEqualTo(StatusProto.Code.OK);
+ DocumentProto emailDocument2 =
+ createEmailDocument("namespace", "uri2").toBuilder()
+ .addProperties(PropertyProto.newBuilder().setName("subject").addStringValues("bar"))
+ .build();
+ assertThat(icing.put(emailDocument2).getStatus().getCode()).isEqualTo(StatusProto.Code.OK);
+
+ SearchSpecProto searchSpec =
+ SearchSpecProto.newBuilder()
+ .setQuery("foo")
+ .setTermMatchType(TermMatchType.Code.PREFIX)
+ .build();
+
+ SearchResultProto searchResultProto =
+ icing.search(
+ searchSpec,
+ ScoringSpecProto.getDefaultInstance(),
+ ResultSpecProto.getDefaultInstance());
+ assertThat(searchResultProto.getStatus().getCode()).isEqualTo(StatusProto.Code.OK);
+ assertThat(searchResultProto.getResultsCount()).isEqualTo(1);
+ assertThat(searchResultProto.getResults(0).getDocument()).isEqualTo(emailDocument1);
+
+ DeleteResultProto deleteResultProto = icing.deleteByQuery(searchSpec);
+ assertThat(deleteResultProto.getStatus().getCode()).isEqualTo(StatusProto.Code.OK);
+
+ GetResultProto getResultProto = icing.get("namespace", "uri1");
+ assertThat(getResultProto.getStatus().getCode()).isEqualTo(StatusProto.Code.NOT_FOUND);
+ getResultProto = icing.get("namespace", "uri2");
+ assertThat(getResultProto.getStatus().getCode()).isEqualTo(StatusProto.Code.OK);
+ }
+
@Test
public void testPersistToDisk() throws Exception {
IcingSearchEngineOptions options =
diff --git a/proto/icing/proto/status.proto b/proto/icing/proto/status.proto
index 2733a15..08677b0 100644
--- a/proto/icing/proto/status.proto
+++ b/proto/icing/proto/status.proto
@@ -24,7 +24,7 @@ option objc_class_prefix = "ICNG";
// Canonical status to indicate the results of API calls.
// Next tag: 3
message StatusProto {
- // Next tag: 9
+ // Next tag: 10
enum Code {
// A default for all other use-cases. Should never be used in practice. This
// may happen if there are backwards-compatibility issues.
@@ -62,6 +62,12 @@ message StatusProto {
// make some space on the underlying filesystem.
OUT_OF_SPACE = 8;
+ // An operation is invalid because the resource already exists and can't be
+ // replaced. For example, this status is used when a SchemaProto contains
+ // multiple definitions of the same type or multiple properties with the
+ // same name within a type.
+ ALREADY_EXISTS = 9;
+
// Any future status codes.
}
optional Code code = 1;
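ALREADY_EXISTS is documented above for irreplaceable-resource conflicts, with duplicate definitions inside a single SchemaProto as the motivating case. A hedged sketch of reacting to it when applying a schema, assuming SetSchema surfaces the code through its result proto's status the way the other engine calls do:

    SetSchemaResultProto result = icing.SetSchema(schema);
    if (result.status().code() == StatusProto::ALREADY_EXISTS) {
      // The schema declared the same type twice, or one type declared two
      // properties with the same name. Fix the SchemaProto; retrying the
      // same call cannot succeed.
    }
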
diff --git a/proto/icing/proto/usage.proto b/proto/icing/proto/usage.proto
new file mode 100644
index 0000000..81243f0
--- /dev/null
+++ b/proto/icing/proto/usage.proto
@@ -0,0 +1,53 @@
+// Copyright 2019 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+syntax = "proto2";
+
+package icing.lib;
+
+option java_package = "com.google.android.icing.proto";
+option java_multiple_files = true;
+
+option objc_class_prefix = "ICNG";
+
+// Representation of a usage report that is generated from the client and sent
+// to Icing.
+// Next tag: 5
+// LINT.IfChange
+message UsageReport {
+ // Namespace of the document.
+ optional string document_namespace = 1;
+
+ // Uri of the document.
+ optional string document_uri = 2;
+
+ // Timestamp in milliseconds of when the usage happens.
+ optional int64 usage_timestamp_ms = 3;
+
+ // Next tag: 3
+ enum UsageType {
+ // A custom usage type that clients can assign a meaning to. UsageReports of
+ // the same type are combined to provide usage counts that clients may use
+ // in scoring.
+ USAGE_TYPE1 = 0;
+
+ // Same as above.
+ USAGE_TYPE2 = 1;
+
+ // Same as above.
+ USAGE_TYPE3 = 2;
+ }
+ optional UsageType usage_type = 4;
+}
+// LINT.ThenChange(//depot/google3/icing/store/usage-store.h:UsageScores)
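
The new UsageReport ties a client-observed use of a document (namespace plus uri) to a timestamp and one of three client-defined usage types, which are counted separately per the comments above. A minimal sketch of populating one with the proto2-generated C++ setters (field and enum names exactly as declared above; the example values are arbitrary):

    UsageReport usage_report;
    usage_report.set_document_namespace("namespace");
    usage_report.set_document_uri("uri1");
    usage_report.set_usage_timestamp_ms(1000);
    usage_report.set_usage_type(UsageReport::USAGE_TYPE1);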