diff options
77 files changed, 8363 insertions, 665 deletions
diff --git a/build.gradle b/build.gradle index 3901078..6d13dc2 100644 --- a/build.gradle +++ b/build.gradle @@ -21,7 +21,7 @@ import static androidx.build.dependencies.DependenciesKt.* buildscript { dependencies { - classpath('gradle.plugin.com.google.protobuf:protobuf-gradle-plugin:0.8.8') + classpath('gradle.plugin.com.google.protobuf:protobuf-gradle-plugin:0.8.13') classpath('org.anarres.jarjar:jarjar-gradle:1.0.1') } } @@ -88,7 +88,11 @@ android.libraryVariants.all { variant -> def suffix = variantName.capitalize() def jarjarTask = tasks.create("jarjar${suffix}", JarjarTask) { destinationName "icing-java-${variantName}-jarjar.jar" - from 'com.google.protobuf:protobuf-javalite:3.10.0' + + + dependsOn protoLiteJarWithoutProtoFiles + from files(protoLiteJarWithoutProtoFiles.archiveFile.get().getAsFile()) + from files(variant.javaCompileProvider.get().destinationDir) dependsOn variant.javaCompileProvider.get() classRename 'com.google.protobuf.**', 'com.google.android.icing.protobuf.@1' @@ -101,3 +105,20 @@ android.libraryVariants.all { variant -> builtBy jarjarTask } } + +// The proto-lite dependency includes .proto files, which are not used by icing. When apps depend on +// appsearch as well as proto-lite directly, these files conflict since jarjar only renames the java +// classes. Remove them here since they are unused. +tasks.register("protoLiteJarWithoutProtoFiles", Jar){ + // Get proto lite dependency as a jar file: + def jarFile = configurations.detachedConfiguration( + dependencies.create('com.google.protobuf:protobuf-javalite:3.10.0')).getSingleFile() + + // Expand the jar and remove any .proto files. 
+ from(zipTree(jarFile)) { + exclude("**/*.proto") + } + + into 'icing-proto-lite-dep-stripped' +} + diff --git a/icing/absl_ports/annotate.cc b/icing/absl_ports/annotate.cc index d283e13..dfe5566 100644 --- a/icing/absl_ports/annotate.cc +++ b/icing/absl_ports/annotate.cc @@ -33,7 +33,7 @@ libtextclassifier3::Status Annotate(const libtextclassifier3::Status& s, std::string new_msg = (!s.error_message().empty()) - ? absl_ports::StrCat(s.error_message(), kErrorSeparator, msg) + ? absl_ports::StrCat(msg, kErrorSeparator, s.error_message()) : std::string(msg); return libtextclassifier3::Status(s.CanonicalCode(), new_msg); } diff --git a/icing/file/file-backed-proto-log.h b/icing/file/file-backed-proto-log.h index 62943b8..95511ac 100644 --- a/icing/file/file-backed-proto-log.h +++ b/icing/file/file-backed-proto-log.h @@ -78,6 +78,23 @@ namespace icing { namespace lib { +namespace { + +bool IsEmptyBuffer(const char* buffer, int size) { + return std::all_of(buffer, buffer + size, + [](const char byte) { return byte == 0; }); +} + +// Helper function to get stored proto size from the metadata. +// Metadata format: 8 bits magic + 24 bits size +int GetProtoSize(int metadata) { return metadata & 0x00FFFFFF; } + +// Helper function to get stored proto magic from the metadata. +// Metadata format: 8 bits magic + 24 bits size +uint8_t GetProtoMagic(int metadata) { return metadata >> 24; } + +} // namespace + template <typename ProtoT> class FileBackedProtoLog { public: @@ -206,10 +223,19 @@ class FileBackedProtoLog { // // Returns: // A proto on success + // NOT_FOUND if the proto at the given offset has been erased // OUT_OF_RANGE_ERROR if file_offset exceeds file size // INTERNAL_ERROR on IO error libtextclassifier3::StatusOr<ProtoT> ReadProto(int64_t file_offset) const; + // Erases the data of a proto located at file_offset from the file. 
+ // + // Returns: + // OK on success + // OUT_OF_RANGE_ERROR if file_offset exceeds file size + // INTERNAL_ERROR on IO error + libtextclassifier3::Status EraseProto(int64_t file_offset); + // Calculates and returns the disk usage in bytes. Rounds up to the nearest // block size. // @@ -239,7 +265,7 @@ class FileBackedProtoLog { Iterator(const Filesystem& filesystem, const std::string& file_path, int64_t initial_offset); - // Advances to the position of next proto. + // Advances to the position of next proto whether it has been erased or not. // // Returns: // OK on success @@ -716,10 +742,15 @@ libtextclassifier3::StatusOr<ProtoT> FileBackedProtoLog<ProtoT>::ReadProto( int metadata, ReadProtoMetadata(&mmapped_file, file_offset, file_size)); // Copy out however many bytes it says the proto is - int stored_size = metadata & 0x00FFFFFF; + int stored_size = GetProtoSize(metadata); ICING_RETURN_IF_ERROR( mmapped_file.Remap(file_offset + sizeof(metadata), stored_size)); + + if (IsEmptyBuffer(mmapped_file.region(), mmapped_file.region_size())) { + return absl_ports::NotFoundError("The proto data has been erased."); + } + google::protobuf::io::ArrayInputStream proto_stream( mmapped_file.mutable_region(), stored_size); @@ -736,6 +767,62 @@ libtextclassifier3::StatusOr<ProtoT> FileBackedProtoLog<ProtoT>::ReadProto( } template <typename ProtoT> +libtextclassifier3::Status FileBackedProtoLog<ProtoT>::EraseProto( + int64_t file_offset) { + int64_t file_size = filesystem_->GetFileSize(fd_.get()); + if (file_offset >= file_size) { + // file_size points to the next byte to write at, so subtract one to get the + // inclusive, actual size of file. 
+ return absl_ports::OutOfRangeError(IcingStringUtil::StringPrintf( + "Trying to erase data at a location, %lld, " + "out of range of the file size, %lld", + static_cast<long long>(file_offset), + static_cast<long long>(file_size - 1))); + } + + MemoryMappedFile mmapped_file( + *filesystem_, file_path_, + MemoryMappedFile::Strategy::READ_WRITE_AUTO_SYNC); + + // Read out the metadata + ICING_ASSIGN_OR_RETURN( + int metadata, ReadProtoMetadata(&mmapped_file, file_offset, file_size)); + + ICING_RETURN_IF_ERROR(mmapped_file.Remap(file_offset + sizeof(metadata), + GetProtoSize(metadata))); + + // We need to update the crc checksum if the erased area is before the rewind + // position. + if (file_offset + sizeof(metadata) < header_->rewind_offset) { + // We need to calculate [original string xor 0s]. + // The xored string is the same as the original string because 0 xor 0 = 0, + // 1 xor 0 = 1. + const std::string_view xored_str(mmapped_file.region(), + mmapped_file.region_size()); + + Crc32 crc(header_->log_checksum); + ICING_ASSIGN_OR_RETURN( + uint32_t new_crc, + crc.UpdateWithXor( + xored_str, + /*full_data_size=*/header_->rewind_offset - sizeof(Header), + /*position=*/file_offset + sizeof(metadata) - sizeof(Header))); + + header_->log_checksum = new_crc; + header_->header_checksum = header_->CalculateHeaderChecksum(); + + if (!filesystem_->PWrite(fd_.get(), /*offset=*/0, header_.get(), + sizeof(Header))) { + return absl_ports::InternalError( + absl_ports::StrCat("Failed to update header to: ", file_path_)); + } + } + + memset(mmapped_file.mutable_region(), '\0', mmapped_file.region_size()); + return libtextclassifier3::Status::OK; +} + +template <typename ProtoT> libtextclassifier3::StatusOr<int64_t> FileBackedProtoLog<ProtoT>::GetDiskUsage() const { int64_t size = filesystem_->GetDiskUsage(file_path_.c_str()); @@ -781,8 +868,7 @@ libtextclassifier3::Status FileBackedProtoLog<ProtoT>::Iterator::Advance() { ICING_ASSIGN_OR_RETURN( int metadata, 
ReadProtoMetadata(&mmapped_file_, current_offset_, file_size_)); - int proto_size = metadata & 0x00FFFFFF; - current_offset_ += sizeof(metadata) + proto_size; + current_offset_ += sizeof(metadata) + GetProtoSize(metadata); } if (current_offset_ < file_size_) { @@ -829,7 +915,7 @@ libtextclassifier3::StatusOr<int> FileBackedProtoLog<ProtoT>::ReadProtoMetadata( ICING_RETURN_IF_ERROR(mmapped_file->Remap(file_offset, metadata_size)); memcpy(&metadata, mmapped_file->region(), metadata_size); // Checks magic number - uint8_t stored_k_proto_magic = metadata >> 24; + uint8_t stored_k_proto_magic = GetProtoMagic(metadata); if (stored_k_proto_magic != kProtoMagic) { return absl_ports::InternalError(IcingStringUtil::StringPrintf( "Failed to read kProtoMagic, expected %d, actual %d", kProtoMagic, @@ -842,7 +928,7 @@ template <typename ProtoT> libtextclassifier3::Status FileBackedProtoLog<ProtoT>::PersistToDisk() { int64_t file_size = filesystem_->GetFileSize(file_path_.c_str()); if (file_size == header_->rewind_offset) { - // No changes made, don't need to update the checksum. + // No new protos appended, don't need to update the checksum. 
return libtextclassifier3::Status::OK; } diff --git a/icing/file/file-backed-proto-log_test.cc b/icing/file/file-backed-proto-log_test.cc index 3a9060d..fad5248 100644 --- a/icing/file/file-backed-proto-log_test.cc +++ b/icing/file/file-backed-proto-log_test.cc @@ -48,7 +48,10 @@ class FileBackedProtoLogTest : public ::testing::Test { // https://stackoverflow.com/a/47368753 FileBackedProtoLogTest() {} - void SetUp() override { file_path_ = GetTestTempDir() + "/proto_log"; } + void SetUp() override { + file_path_ = GetTestTempDir() + "/proto_log"; + filesystem_.DeleteFile(file_path_.c_str()); + } void TearDown() override { filesystem_.DeleteFile(file_path_.c_str()); } @@ -93,7 +96,7 @@ TEST_F(FileBackedProtoLogTest, WriteProtoTooLarge) { FileBackedProtoLog<DocumentProto>::Options(compress_, max_proto_size))); auto proto_log = std::move(create_result.proto_log); - EXPECT_FALSE(create_result.data_loss); + ASSERT_FALSE(create_result.data_loss); DocumentProto document = DocumentBuilder().SetKey("namespace", "uri").Build(); @@ -110,7 +113,7 @@ TEST_F(FileBackedProtoLogTest, ReadProtoWrongKProtoMagic) { FileBackedProtoLog<DocumentProto>::Options(compress_, max_proto_size_))); auto proto_log = std::move(create_result.proto_log); - EXPECT_FALSE(create_result.data_loss); + ASSERT_FALSE(create_result.data_loss); // Write a proto DocumentProto document = DocumentBuilder().SetKey("namespace", "uri").Build(); @@ -144,7 +147,7 @@ TEST_F(FileBackedProtoLogTest, ReadWriteUncompressedProto) { FileBackedProtoLog<DocumentProto>::Options( /*compress_in=*/false, max_proto_size_))); auto proto_log = std::move(create_result.proto_log); - EXPECT_FALSE(create_result.data_loss); + ASSERT_FALSE(create_result.data_loss); // Write the first proto DocumentProto document1 = @@ -191,7 +194,7 @@ TEST_F(FileBackedProtoLogTest, ReadWriteUncompressedProto) { FileBackedProtoLog<DocumentProto>::Options( /*compress_in=*/false, max_proto_size_))); auto recreated_proto_log = 
std::move(create_result.proto_log); - EXPECT_FALSE(create_result.data_loss); + ASSERT_FALSE(create_result.data_loss); // Write a third proto DocumentProto document3 = @@ -213,7 +216,7 @@ TEST_F(FileBackedProtoLogTest, ReadWriteCompressedProto) { FileBackedProtoLog<DocumentProto>::Options( /*compress_in=*/true, max_proto_size_))); auto proto_log = std::move(create_result.proto_log); - EXPECT_FALSE(create_result.data_loss); + ASSERT_FALSE(create_result.data_loss); // Write the first proto DocumentProto document1 = @@ -260,7 +263,7 @@ TEST_F(FileBackedProtoLogTest, ReadWriteCompressedProto) { FileBackedProtoLog<DocumentProto>::Options( /*compress_in=*/true, max_proto_size_))); auto recreated_proto_log = std::move(create_result.proto_log); - EXPECT_FALSE(create_result.data_loss); + ASSERT_FALSE(create_result.data_loss); // Write a third proto DocumentProto document3 = @@ -360,7 +363,7 @@ TEST_F(FileBackedProtoLogTest, PersistToDisk) { FileBackedProtoLog<DocumentProto>::Options(compress_, max_proto_size_))); auto proto_log = std::move(create_result.proto_log); - EXPECT_FALSE(create_result.data_loss); + ASSERT_FALSE(create_result.data_loss); // Write and persist the first proto ICING_ASSERT_OK_AND_ASSIGN(document1_offset, @@ -430,7 +433,7 @@ TEST_F(FileBackedProtoLogTest, Iterator) { FileBackedProtoLog<DocumentProto>::Options(compress_, max_proto_size_))); auto proto_log = std::move(create_result.proto_log); - EXPECT_FALSE(create_result.data_loss); + ASSERT_FALSE(create_result.data_loss); { // Empty iterator @@ -481,7 +484,7 @@ TEST_F(FileBackedProtoLogTest, ComputeChecksum) { FileBackedProtoLog<DocumentProto>::Options(compress_, max_proto_size_))); auto proto_log = std::move(create_result.proto_log); - EXPECT_FALSE(create_result.data_loss); + ASSERT_FALSE(create_result.data_loss); ICING_EXPECT_OK(proto_log->WriteProto(document)); @@ -499,7 +502,7 @@ TEST_F(FileBackedProtoLogTest, ComputeChecksum) { FileBackedProtoLog<DocumentProto>::Options(compress_, 
max_proto_size_))); auto proto_log = std::move(create_result.proto_log); - EXPECT_FALSE(create_result.data_loss); + ASSERT_FALSE(create_result.data_loss); // Checksum should be consistent across instances EXPECT_THAT(proto_log->ComputeChecksum(), IsOkAndHolds(Eq(checksum))); @@ -514,6 +517,166 @@ TEST_F(FileBackedProtoLogTest, ComputeChecksum) { } } +TEST_F(FileBackedProtoLogTest, EraseProtoShouldSetZero) { + DocumentProto document1 = + DocumentBuilder().SetKey("namespace", "uri1").Build(); + + ICING_ASSERT_OK_AND_ASSIGN( + FileBackedProtoLog<DocumentProto>::CreateResult create_result, + FileBackedProtoLog<DocumentProto>::Create( + &filesystem_, file_path_, + FileBackedProtoLog<DocumentProto>::Options(compress_, + max_proto_size_))); + auto proto_log = std::move(create_result.proto_log); + ASSERT_FALSE(create_result.data_loss); + + // Writes and erases proto + ICING_ASSERT_OK_AND_ASSIGN(int64_t document1_offset, + proto_log->WriteProto(document1)); + ICING_ASSERT_OK(proto_log->EraseProto(document1_offset)); + + // Checks if the erased area is set to 0. + int64_t file_size = filesystem_.GetFileSize(file_path_.c_str()); + MemoryMappedFile mmapped_file(filesystem_, file_path_, + MemoryMappedFile::Strategy::READ_ONLY); + + // document1_offset + sizeof(int) is the start byte of the proto where + // sizeof(int) is the size of the proto metadata. 
+ mmapped_file.Remap(document1_offset + sizeof(int), file_size - 1); + for (size_t i = 0; i < mmapped_file.region_size(); ++i) { + ASSERT_THAT(mmapped_file.region()[i], Eq(0)); + } +} + +TEST_F(FileBackedProtoLogTest, EraseProtoShouldReturnNotFound) { + DocumentProto document1 = + DocumentBuilder().SetKey("namespace", "uri1").Build(); + DocumentProto document2 = + DocumentBuilder().SetKey("namespace", "uri2").Build(); + + ICING_ASSERT_OK_AND_ASSIGN( + FileBackedProtoLog<DocumentProto>::CreateResult create_result, + FileBackedProtoLog<DocumentProto>::Create( + &filesystem_, file_path_, + FileBackedProtoLog<DocumentProto>::Options(compress_, + max_proto_size_))); + auto proto_log = std::move(create_result.proto_log); + ASSERT_FALSE(create_result.data_loss); + + // Writes 2 protos + ICING_ASSERT_OK_AND_ASSIGN(int64_t document1_offset, + proto_log->WriteProto(document1)); + ICING_ASSERT_OK_AND_ASSIGN(int64_t document2_offset, + proto_log->WriteProto(document2)); + + // Erases the first proto + ICING_ASSERT_OK(proto_log->EraseProto(document1_offset)); + + // The first proto has been erased. + ASSERT_THAT(proto_log->ReadProto(document1_offset), + StatusIs(libtextclassifier3::StatusCode::NOT_FOUND)); + // The second proto should be returned. + ASSERT_THAT(proto_log->ReadProto(document2_offset), + IsOkAndHolds(EqualsProto(document2))); +} + +TEST_F(FileBackedProtoLogTest, ChecksumShouldBeCorrectWithErasedProto) { + DocumentProto document1 = + DocumentBuilder().SetKey("namespace", "uri1").Build(); + DocumentProto document2 = + DocumentBuilder().SetKey("namespace", "uri2").Build(); + DocumentProto document3 = + DocumentBuilder().SetKey("namespace", "uri3").Build(); + DocumentProto document4 = + DocumentBuilder().SetKey("namespace", "uri4").Build(); + + int64_t document2_offset; + int64_t document3_offset; + + { + // Erase data after the rewind position. This won't update the checksum + // immediately. 
+ ICING_ASSERT_OK_AND_ASSIGN( + FileBackedProtoLog<DocumentProto>::CreateResult create_result, + FileBackedProtoLog<DocumentProto>::Create( + &filesystem_, file_path_, + FileBackedProtoLog<DocumentProto>::Options(compress_, + max_proto_size_))); + auto proto_log = std::move(create_result.proto_log); + ASSERT_FALSE(create_result.data_loss); + + // Writes 3 protos + ICING_ASSERT_OK_AND_ASSIGN(int64_t document1_offset, + proto_log->WriteProto(document1)); + ICING_ASSERT_OK_AND_ASSIGN(document2_offset, + proto_log->WriteProto(document2)); + ICING_ASSERT_OK_AND_ASSIGN(document3_offset, + proto_log->WriteProto(document3)); + + // Erases the 1st proto, checksum won't be updated immediately because the + // rewind position is 0. + ICING_ASSERT_OK(proto_log->EraseProto(document1_offset)); + + EXPECT_THAT(proto_log->ComputeChecksum(), + IsOkAndHolds(Eq(Crc32(2293202502)))); + } // New checksum is updated in destructor. + + { + // Erase data before the rewind position. This will update the checksum + // immediately. + ICING_ASSERT_OK_AND_ASSIGN( + FileBackedProtoLog<DocumentProto>::CreateResult create_result, + FileBackedProtoLog<DocumentProto>::Create( + &filesystem_, file_path_, + FileBackedProtoLog<DocumentProto>::Options(compress_, + max_proto_size_))); + auto proto_log = std::move(create_result.proto_log); + ASSERT_FALSE(create_result.data_loss); + + // Erases the 2nd proto that is now before the rewind position. Checksum is + // updated. + ICING_ASSERT_OK(proto_log->EraseProto(document2_offset)); + + EXPECT_THAT(proto_log->ComputeChecksum(), + IsOkAndHolds(Eq(Crc32(639634028)))); + } + + { + // Append data and erase data before the rewind position. This will update + // the checksum twice: in EraseProto() and destructor. 
+ ICING_ASSERT_OK_AND_ASSIGN( + FileBackedProtoLog<DocumentProto>::CreateResult create_result, + FileBackedProtoLog<DocumentProto>::Create( + &filesystem_, file_path_, + FileBackedProtoLog<DocumentProto>::Options(compress_, + max_proto_size_))); + auto proto_log = std::move(create_result.proto_log); + ASSERT_FALSE(create_result.data_loss); + + // Append a new document which is after the rewind position. + ICING_ASSERT_OK(proto_log->WriteProto(document4)); + + // Erases the 3rd proto that is now before the rewind position. Checksum is + // updated. + ICING_ASSERT_OK(proto_log->EraseProto(document3_offset)); + + EXPECT_THAT(proto_log->ComputeChecksum(), + IsOkAndHolds(Eq(Crc32(1990198693)))); + } // Checksum is updated with the newly appended document. + + { + // A successful creation means that the checksum matches. + ICING_ASSERT_OK_AND_ASSIGN( + FileBackedProtoLog<DocumentProto>::CreateResult create_result, + FileBackedProtoLog<DocumentProto>::Create( + &filesystem_, file_path_, + FileBackedProtoLog<DocumentProto>::Options(compress_, + max_proto_size_))); + auto proto_log = std::move(create_result.proto_log); + EXPECT_FALSE(create_result.data_loss); + } +} + } // namespace } // namespace lib } // namespace icing diff --git a/icing/file/file-backed-vector.h b/icing/file/file-backed-vector.h index e4ec0cd..eb89db8 100644 --- a/icing/file/file-backed-vector.h +++ b/icing/file/file-backed-vector.h @@ -187,7 +187,7 @@ class FileBackedVector { // // Returns: // OUT_OF_RANGE_ERROR if len < 0 or >= num_elements() - libtextclassifier3::Status TruncateTo(int32_t len); + libtextclassifier3::Status TruncateTo(int32_t new_num_elements); // Flushes content to underlying file. 
// diff --git a/icing/icing-search-engine.cc b/icing/icing-search-engine.cc index c973885..5e0a46e 100644 --- a/icing/icing-search-engine.cc +++ b/icing/icing-search-engine.cc @@ -59,6 +59,7 @@ #include "icing/util/crc32.h" #include "icing/util/logging.h" #include "icing/util/status-macros.h" +#include "unicode/uloc.h" namespace icing { namespace lib { @@ -148,30 +149,31 @@ std::string MakeSchemaDirectoryPath(const std::string& base_dir) { void TransformStatus(const libtextclassifier3::Status& internal_status, StatusProto* status_proto) { + StatusProto::Code code; switch (internal_status.CanonicalCode()) { case libtextclassifier3::StatusCode::OK: - status_proto->set_code(StatusProto::OK); + code = StatusProto::OK; break; case libtextclassifier3::StatusCode::DATA_LOSS: - status_proto->set_code(StatusProto::WARNING_DATA_LOSS); + code = StatusProto::WARNING_DATA_LOSS; break; case libtextclassifier3::StatusCode::INVALID_ARGUMENT: - status_proto->set_code(StatusProto::INVALID_ARGUMENT); + code = StatusProto::INVALID_ARGUMENT; break; case libtextclassifier3::StatusCode::NOT_FOUND: - status_proto->set_code(StatusProto::NOT_FOUND); + code = StatusProto::NOT_FOUND; break; case libtextclassifier3::StatusCode::FAILED_PRECONDITION: - status_proto->set_code(StatusProto::FAILED_PRECONDITION); + code = StatusProto::FAILED_PRECONDITION; break; case libtextclassifier3::StatusCode::ABORTED: - status_proto->set_code(StatusProto::ABORTED); + code = StatusProto::ABORTED; break; case libtextclassifier3::StatusCode::INTERNAL: // TODO(b/147699081): Cleanup our internal use of INTERNAL since it // doesn't match with what it *should* indicate as described in // go/icing-library-apis. 
- status_proto->set_code(StatusProto::INTERNAL); + code = StatusProto::INTERNAL; break; case libtextclassifier3::StatusCode::RESOURCE_EXHAUSTED: // TODO(b/147699081): Note that we don't detect all cases of OUT_OF_SPACE @@ -179,17 +181,35 @@ void TransformStatus(const libtextclassifier3::Status& internal_status, // internally to indicate other resources are exhausted (e.g. // DocHitInfos) - although none of these are exposed through the API. // Consider separating the two cases out more clearly. - status_proto->set_code(StatusProto::OUT_OF_SPACE); + code = StatusProto::OUT_OF_SPACE; break; - default: + case libtextclassifier3::StatusCode::ALREADY_EXISTS: + code = StatusProto::ALREADY_EXISTS; + break; + case libtextclassifier3::StatusCode::CANCELLED: + [[fallthrough]]; + case libtextclassifier3::StatusCode::UNKNOWN: + [[fallthrough]]; + case libtextclassifier3::StatusCode::DEADLINE_EXCEEDED: + [[fallthrough]]; + case libtextclassifier3::StatusCode::PERMISSION_DENIED: + [[fallthrough]]; + case libtextclassifier3::StatusCode::OUT_OF_RANGE: + [[fallthrough]]; + case libtextclassifier3::StatusCode::UNIMPLEMENTED: + [[fallthrough]]; + case libtextclassifier3::StatusCode::UNAVAILABLE: + [[fallthrough]]; + case libtextclassifier3::StatusCode::UNAUTHENTICATED: // Other internal status codes aren't supported externally yet. If it // should be supported, add another switch-case above. - ICING_LOG(FATAL) << IcingStringUtil::StringPrintf( + ICING_LOG(ERROR) << IcingStringUtil::StringPrintf( "Internal status code %d not supported in the external API", internal_status.error_code()); + code = StatusProto::UNKNOWN; break; } - + status_proto->set_code(code); status_proto->set_message(internal_status.error_message()); } @@ -681,12 +701,14 @@ DeleteByNamespaceResultProto IcingSearchEngine::DeleteByNamespace( // that can support error logging. 
libtextclassifier3::Status status = document_store_->DeleteByNamespace(name_space); - TransformStatus(status, result_status); if (!status.ok()) { ICING_LOG(ERROR) << status.error_message() << "Failed to delete Namespace: " << name_space; + TransformStatus(status, result_status); return delete_result; } + + result_status->set_code(StatusProto::OK); return delete_result; } @@ -707,15 +729,82 @@ DeleteBySchemaTypeResultProto IcingSearchEngine::DeleteBySchemaType( // that can support error logging. libtextclassifier3::Status status = document_store_->DeleteBySchemaType(schema_type); - TransformStatus(status, result_status); if (!status.ok()) { ICING_LOG(ERROR) << status.error_message() << "Failed to delete SchemaType: " << schema_type; + TransformStatus(status, result_status); return delete_result; } + + result_status->set_code(StatusProto::OK); return delete_result; } +DeleteResultProto IcingSearchEngine::DeleteByQuery( + const SearchSpecProto& search_spec) { + ICING_VLOG(1) << "Deleting documents for query " << search_spec.query() + << " from doc store"; + + DeleteResultProto result_proto; + StatusProto* result_status = result_proto.mutable_status(); + + absl_ports::unique_lock l(&mutex_); + if (!initialized_) { + result_status->set_code(StatusProto::FAILED_PRECONDITION); + result_status->set_message("IcingSearchEngine has not been initialized!"); + return result_proto; + } + + libtextclassifier3::Status status = + ValidateSearchSpec(search_spec, performance_configuration_); + if (!status.ok()) { + TransformStatus(status, result_status); + return result_proto; + } + + // Gets unordered results from query processor + auto query_processor_or = QueryProcessor::Create( + index_.get(), language_segmenter_.get(), normalizer_.get(), + document_store_.get(), schema_store_.get(), clock_.get()); + if (!query_processor_or.ok()) { + TransformStatus(query_processor_or.status(), result_status); + return result_proto; + } + std::unique_ptr<QueryProcessor> query_processor = + 
std::move(query_processor_or).ValueOrDie(); + + auto query_results_or = query_processor->ParseSearch(search_spec); + if (!query_results_or.ok()) { + TransformStatus(query_results_or.status(), result_status); + return result_proto; + } + QueryProcessor::QueryResults query_results = + std::move(query_results_or).ValueOrDie(); + + ICING_LOG(ERROR) << "Deleting the docs that matched the query."; + bool found_results = false; + while (query_results.root_iterator->Advance().ok()) { + ICING_LOG(ERROR) + << "Deleting doc " + << query_results.root_iterator->doc_hit_info().document_id(); + found_results = true; + status = document_store_->Delete( + query_results.root_iterator->doc_hit_info().document_id()); + if (!status.ok()) { + TransformStatus(status, result_status); + return result_proto; + } + } + if (found_results) { + result_proto.mutable_status()->set_code(StatusProto::OK); + } else { + result_proto.mutable_status()->set_code(StatusProto::NOT_FOUND); + result_proto.mutable_status()->set_message( + "No documents matched the query to delete by!"); + } + return result_proto; +} + PersistToDiskResultProto IcingSearchEngine::PersistToDisk() { ICING_VLOG(1) << "Persisting data to disk"; @@ -1147,6 +1236,9 @@ libtextclassifier3::Status IcingSearchEngine::OptimizeDocumentStore() { // Ensures that current directory is still present. if (!filesystem_->CreateDirectoryRecursively( current_document_dir.c_str())) { + // Can't even create the old directory. Mark as uninitialized and return + // INTERNAL. + initialized_ = false; return absl_ports::InternalError( "Failed to create file directory for document store"); } @@ -1159,6 +1251,9 @@ libtextclassifier3::Status IcingSearchEngine::OptimizeDocumentStore() { // TODO(b/144458732): Implement a more robust version of // TC_ASSIGN_OR_RETURN that can support error logging. if (!document_store_or.ok()) { + // Unable to create DocumentStore from the old file. Mark as uninitialized + // and return INTERNAL. 
+ initialized_ = false; ICING_LOG(ERROR) << "Failed to create document store instance"; return absl_ports::Annotate( absl_ports::InternalError("Failed to create document store instance"), @@ -1173,13 +1268,18 @@ libtextclassifier3::Status IcingSearchEngine::OptimizeDocumentStore() { } // Recreates the doc store instance - ICING_ASSIGN_OR_RETURN( - document_store_, + auto document_store_or = DocumentStore::Create(filesystem_.get(), current_document_dir, - clock_.get(), schema_store_.get()), - absl_ports::InternalError( - "Document store has been optimized, but a valid document store " - "instance can't be created")); + clock_.get(), schema_store_.get()); + if (!document_store_or.ok()) { + // Unable to create DocumentStore from the new file. Mark as uninitialized + // and return INTERNAL. + initialized_ = false; + return absl_ports::InternalError( + "Document store has been optimized, but a valid document store " + "instance can't be created"); + } + document_store_ = std::move(document_store_or).ValueOrDie(); // Deletes tmp directory if (!filesystem_->DeleteDirectoryRecursively( diff --git a/icing/icing-search-engine.h b/icing/icing-search-engine.h index 6ae76d7..55d6b2f 100644 --- a/icing/icing-search-engine.h +++ b/icing/icing-search-engine.h @@ -128,6 +128,9 @@ class IcingSearchEngine { // // Returns: // OK on success + // ALREADY_EXISTS if 'new_schema' contains multiple definitions of the same + // type or contains a type that has multiple properties with the same + // name. // INVALID_ARGUMENT if 'new_schema' is invalid // FAILED_PRECONDITION if 'new_schema' is incompatible, or IcingSearchEngine // has not been initialized yet. @@ -256,6 +259,21 @@ class IcingSearchEngine { DeleteBySchemaTypeResultProto DeleteBySchemaType(std::string_view schema_type) ICING_LOCKS_EXCLUDED(mutex_); + // Deletes all Documents that match the query specified in search_spec. 
Delete + // changes are automatically applied to disk, callers can also call + // PersistToDisk() to flush changes immediately. + // + // NOTE: Space is not reclaimed for deleted documents until Optimize() is + // called. + // + // Returns: + // OK on success + // NOT_FOUND if the query doesn't match any documents + // FAILED_PRECONDITION IcingSearchEngine has not been initialized yet + // INTERNAL_ERROR on IO error + DeleteResultProto DeleteByQuery(const SearchSpecProto& search_spec) + ICING_LOCKS_EXCLUDED(mutex_); + // Retrieves, scores, ranks, and returns the results according to the specs. // Results can be empty. If there're multiple pages of results, // SearchResultProto.next_page_token will be populated and that can be used to diff --git a/icing/icing-search-engine_test.cc b/icing/icing-search-engine_test.cc index b0946c9..5a8bb80 100644 --- a/icing/icing-search-engine_test.cc +++ b/icing/icing-search-engine_test.cc @@ -55,6 +55,7 @@ using ::testing::HasSubstr; using ::testing::IsEmpty; using ::testing::Lt; using ::testing::Matcher; +using ::testing::Ne; using ::testing::Return; using ::testing::SizeIs; using ::testing::StrEq; @@ -470,6 +471,163 @@ TEST_F(IcingSearchEngineTest, FailToWriteSchema) { HasSubstr("Unable to open file for write")); } +TEST_F(IcingSearchEngineTest, SetSchemaDelete2) { + { + IcingSearchEngine icing(GetDefaultIcingOptions()); + ASSERT_THAT(icing.Initialize().status().code(), Eq(StatusProto::OK)); + + // 1. 
Create a schema with an Email type with properties { "title", "body"} + SchemaProto schema; + SchemaTypeConfigProto* type = schema.add_types(); + type->set_schema_type("Email"); + PropertyConfigProto* property = type->add_properties(); + property->set_property_name("title"); + property->set_data_type(PropertyConfigProto::DataType::STRING); + property->set_cardinality(PropertyConfigProto::Cardinality::OPTIONAL); + property = type->add_properties(); + property->set_property_name("body"); + property->set_data_type(PropertyConfigProto::DataType::STRING); + property->set_cardinality(PropertyConfigProto::Cardinality::OPTIONAL); + + EXPECT_THAT(icing.SetSchema(schema).status().code(), Eq(StatusProto::OK)); + + // 2. Add an email document + DocumentProto doc = DocumentBuilder() + .SetKey("emails", "email#1") + .SetSchema("Email") + .AddStringProperty("title", "Hello world.") + .AddStringProperty("body", "Goodnight Moon.") + .Build(); + EXPECT_THAT(icing.Put(std::move(doc)).status().code(), Eq(StatusProto::OK)); + } + + { + IcingSearchEngine icing(GetDefaultIcingOptions()); + ASSERT_THAT(icing.Initialize().status().code(), Eq(StatusProto::OK)); + + // 3. Set a schema that deletes email. This should fail. + SchemaProto schema; + SchemaTypeConfigProto* type = schema.add_types(); + type->set_schema_type("Message"); + PropertyConfigProto* property = type->add_properties(); + property->set_property_name("body"); + property->set_data_type(PropertyConfigProto::DataType::STRING); + property->set_cardinality(PropertyConfigProto::Cardinality::OPTIONAL); + + EXPECT_THAT(icing.SetSchema(schema, false).status().code(), + Eq(StatusProto::FAILED_PRECONDITION)); + + // 4. Try to delete by email type. + EXPECT_THAT(icing.DeleteBySchemaType("Email").status().code(), + Eq(StatusProto::OK)); + } +} + +TEST_F(IcingSearchEngineTest, SetSchemaDelete) { + { + IcingSearchEngine icing(GetDefaultIcingOptions()); + ASSERT_THAT(icing.Initialize().status().code(), Eq(StatusProto::OK)); + + // 1. 
Create a schema with an Email type with properties { "title", "body"} + SchemaProto schema; + SchemaTypeConfigProto* type = schema.add_types(); + type->set_schema_type("Email"); + PropertyConfigProto* property = type->add_properties(); + property->set_property_name("title"); + property->set_data_type(PropertyConfigProto::DataType::STRING); + property->set_cardinality(PropertyConfigProto::Cardinality::OPTIONAL); + property = type->add_properties(); + property->set_property_name("body"); + property->set_data_type(PropertyConfigProto::DataType::STRING); + property->set_cardinality(PropertyConfigProto::Cardinality::OPTIONAL); + + EXPECT_THAT(icing.SetSchema(schema).status().code(), Eq(StatusProto::OK)); + + // 2. Add an email document + DocumentProto doc = DocumentBuilder() + .SetKey("emails", "email#1") + .SetSchema("Email") + .AddStringProperty("title", "Hello world.") + .AddStringProperty("body", "Goodnight Moon.") + .Build(); + EXPECT_THAT(icing.Put(std::move(doc)).status().code(), Eq(StatusProto::OK)); + } + + { + IcingSearchEngine icing(GetDefaultIcingOptions()); + ASSERT_THAT(icing.Initialize().status().code(), Eq(StatusProto::OK)); + + // 3. Set a schema that deletes email. This should fail. + SchemaProto schema; + SchemaTypeConfigProto* type = schema.add_types(); + type->set_schema_type("Message"); + PropertyConfigProto* property = type->add_properties(); + property->set_property_name("body"); + property->set_data_type(PropertyConfigProto::DataType::STRING); + property->set_cardinality(PropertyConfigProto::Cardinality::OPTIONAL); + + EXPECT_THAT(icing.SetSchema(schema, true).status().code(), + Eq(StatusProto::OK)); + + // 4. Try to delete by email type. 
+ EXPECT_THAT(icing.DeleteBySchemaType("Email").status().code(), + Eq(StatusProto::NOT_FOUND)); + } +} + +TEST_F(IcingSearchEngineTest, SetSchemaDuplicateTypesReturnsAlreadyExists) { + IcingSearchEngine icing(GetDefaultIcingOptions()); + ASSERT_THAT(icing.Initialize().status().code(), Eq(StatusProto::OK)); + + // Create a schema with types { "Email", "Message" and "Email" } + SchemaProto schema; + SchemaTypeConfigProto* type = schema.add_types(); + type->set_schema_type("Email"); + PropertyConfigProto* property = type->add_properties(); + property->set_property_name("title"); + property->set_data_type(PropertyConfigProto::DataType::STRING); + property->set_cardinality(PropertyConfigProto::Cardinality::OPTIONAL); + + type = schema.add_types(); + type->set_schema_type("Message"); + property = type->add_properties(); + property->set_property_name("body"); + property->set_data_type(PropertyConfigProto::DataType::STRING); + property->set_cardinality(PropertyConfigProto::Cardinality::OPTIONAL); + + *schema.add_types() = schema.types(0); + + EXPECT_THAT(icing.SetSchema(schema).status().code(), + Eq(StatusProto::ALREADY_EXISTS)); +} + +TEST_F(IcingSearchEngineTest, + SetSchemaDuplicatePropertiesReturnsAlreadyExists) { + IcingSearchEngine icing(GetDefaultIcingOptions()); + ASSERT_THAT(icing.Initialize().status().code(), Eq(StatusProto::OK)); + + // Create a schema with an Email type with properties { "title", "body" and + // "title" } + SchemaProto schema; + SchemaTypeConfigProto* type = schema.add_types(); + type->set_schema_type("Email"); + PropertyConfigProto* property = type->add_properties(); + property->set_property_name("title"); + property->set_data_type(PropertyConfigProto::DataType::STRING); + property->set_cardinality(PropertyConfigProto::Cardinality::OPTIONAL); + property = type->add_properties(); + property->set_property_name("body"); + property->set_data_type(PropertyConfigProto::DataType::STRING); + 
property->set_cardinality(PropertyConfigProto::Cardinality::OPTIONAL); + property = type->add_properties(); + property->set_property_name("title"); + property->set_data_type(PropertyConfigProto::DataType::STRING); + property->set_cardinality(PropertyConfigProto::Cardinality::OPTIONAL); + + EXPECT_THAT(icing.SetSchema(schema).status().code(), + Eq(StatusProto::ALREADY_EXISTS)); +} + TEST_F(IcingSearchEngineTest, SetSchema) { IcingSearchEngine icing(GetDefaultIcingOptions()); ASSERT_THAT(icing.Initialize().status().code(), Eq(StatusProto::OK)); @@ -1519,6 +1677,82 @@ TEST_F(IcingSearchEngineTest, DeleteShouldWorkAfterOptimization) { EqualsProto(expected_get_result_proto)); } +TEST_F(IcingSearchEngineTest, OptimizationFailureUninitializesIcing) { + // Setup filesystem to fail + auto mock_filesystem = std::make_unique<MockFilesystem>(); + bool just_swapped_files = false; + auto create_dir_lambda = [this, &just_swapped_files](const char* dir_name) { + if (just_swapped_files) { + // We should fail the first call immediately after swapping files. + just_swapped_files = false; + return false; + } + return filesystem()->CreateDirectoryRecursively(dir_name); + }; + ON_CALL(*mock_filesystem, CreateDirectoryRecursively) + .WillByDefault(create_dir_lambda); + auto swap_lambda = [&just_swapped_files](const char* first_dir, + const char* second_dir) { + just_swapped_files = true; + return false; + }; + ON_CALL(*mock_filesystem, SwapFiles).WillByDefault(swap_lambda); + TestIcingSearchEngine icing(GetDefaultIcingOptions(), + std::move(mock_filesystem), + std::make_unique<FakeClock>()); + ASSERT_THAT(icing.Initialize().status().code(), Eq(StatusProto::OK)); + + // The mocks should cause an unrecoverable error during Optimize - returning + // INTERNAL. + ASSERT_THAT(icing.Optimize().status().code(), Eq(StatusProto::INTERNAL)); + + // Ordinary operations should fail safely. 
+ SchemaProto simple_schema; + auto type = simple_schema.add_types(); + type->set_schema_type("type0"); + auto property = type->add_properties(); + property->set_property_name("prop0"); + property->set_data_type(PropertyConfigProto::DataType::STRING); + property->set_cardinality(PropertyConfigProto::Cardinality::OPTIONAL); + + DocumentProto simple_doc = DocumentBuilder() + .SetKey("namespace0", "uri0") + .SetSchema("type0") + .AddStringProperty("prop0", "foo") + .Build(); + + SearchSpecProto search_spec; + search_spec.set_query("foo"); + search_spec.set_term_match_type(TermMatchType::EXACT_ONLY); + ResultSpecProto result_spec; + ScoringSpecProto scoring_spec; + scoring_spec.set_rank_by( + ScoringSpecProto::RankingStrategy::CREATION_TIMESTAMP); + + EXPECT_THAT(icing.SetSchema(simple_schema).status().code(), + Eq(StatusProto::FAILED_PRECONDITION)); + EXPECT_THAT(icing.Put(simple_doc).status().code(), + Eq(StatusProto::FAILED_PRECONDITION)); + EXPECT_THAT( + icing.Get(simple_doc.namespace_(), simple_doc.uri()).status().code(), + Eq(StatusProto::FAILED_PRECONDITION)); + EXPECT_THAT( + icing.Search(search_spec, scoring_spec, result_spec).status().code(), + Eq(StatusProto::FAILED_PRECONDITION)); + + // Reset should get icing back to a safe (empty) and working state. 
+ EXPECT_THAT(icing.Reset().status().code(), Eq(StatusProto::OK)); + EXPECT_THAT(icing.SetSchema(simple_schema).status().code(), + Eq(StatusProto::OK)); + EXPECT_THAT(icing.Put(simple_doc).status().code(), Eq(StatusProto::OK)); + EXPECT_THAT( + icing.Get(simple_doc.namespace_(), simple_doc.uri()).status().code(), + Eq(StatusProto::OK)); + EXPECT_THAT( + icing.Search(search_spec, scoring_spec, result_spec).status().code(), + Eq(StatusProto::OK)); +} + TEST_F(IcingSearchEngineTest, DeleteBySchemaType) { SchemaProto schema; // Add an email type @@ -1528,6 +1762,10 @@ TEST_F(IcingSearchEngineTest, DeleteBySchemaType) { property->set_property_name("subject"); property->set_data_type(PropertyConfigProto::DataType::STRING); property->set_cardinality(PropertyConfigProto::Cardinality::OPTIONAL); + property->mutable_indexing_config()->set_term_match_type( + TermMatchType::EXACT_ONLY); + property->mutable_indexing_config()->set_tokenizer_type( + IndexingConfig::TokenizerType::PLAIN); // Add an message type type = schema.add_types(); type->set_schema_type("message"); @@ -1535,6 +1773,10 @@ TEST_F(IcingSearchEngineTest, DeleteBySchemaType) { property->set_property_name("body"); property->set_data_type(PropertyConfigProto::DataType::STRING); property->set_cardinality(PropertyConfigProto::Cardinality::OPTIONAL); + property->mutable_indexing_config()->set_term_match_type( + TermMatchType::EXACT_ONLY); + property->mutable_indexing_config()->set_tokenizer_type( + IndexingConfig::TokenizerType::PLAIN); DocumentProto document1 = DocumentBuilder() .SetKey("namespace1", "uri1") @@ -1550,6 +1792,74 @@ TEST_F(IcingSearchEngineTest, DeleteBySchemaType) { .SetCreationTimestampMs(kDefaultCreationTimestampMs) .Build(); IcingSearchEngine icing(GetDefaultIcingOptions()); + ASSERT_THAT(icing.Initialize().status().code(), Eq(StatusProto::OK)); + ASSERT_THAT(icing.SetSchema(schema).status().code(), Eq(StatusProto::OK)); + ASSERT_THAT(icing.Put(document1).status().code(), Eq(StatusProto::OK)); + 
ASSERT_THAT(icing.Put(document2).status().code(), Eq(StatusProto::OK)); + + GetResultProto expected_get_result_proto; + expected_get_result_proto.mutable_status()->set_code(StatusProto::OK); + *expected_get_result_proto.mutable_document() = document1; + EXPECT_THAT(icing.Get("namespace1", "uri1"), + EqualsProto(expected_get_result_proto)); + + *expected_get_result_proto.mutable_document() = document2; + EXPECT_THAT(icing.Get("namespace2", "uri2"), + EqualsProto(expected_get_result_proto)); + + // Delete the first type. The first doc should be irretrievable. The + // second should still be present. + EXPECT_THAT(icing.DeleteBySchemaType("message").status().code(), + Eq(StatusProto::OK)); + + expected_get_result_proto.mutable_status()->set_code(StatusProto::NOT_FOUND); + expected_get_result_proto.mutable_status()->set_message( + "Document (namespace1, uri1) not found."); + expected_get_result_proto.clear_document(); + EXPECT_THAT(icing.Get("namespace1", "uri1"), + EqualsProto(expected_get_result_proto)); + + expected_get_result_proto.mutable_status()->set_code(StatusProto::OK); + expected_get_result_proto.mutable_status()->clear_message(); + *expected_get_result_proto.mutable_document() = document2; + EXPECT_THAT(icing.Get("namespace2", "uri2"), + EqualsProto(expected_get_result_proto)); + + // Search for "message", only document2 should show up. 
+ SearchResultProto expected_search_result_proto; + expected_search_result_proto.mutable_status()->set_code(StatusProto::OK); + *expected_search_result_proto.mutable_results()->Add()->mutable_document() = + document2; + SearchSpecProto search_spec; + search_spec.set_term_match_type(TermMatchType::EXACT_ONLY); + search_spec.set_query("message"); + EXPECT_THAT(icing.Search(search_spec, GetDefaultScoringSpec(), + ResultSpecProto::default_instance()), + EqualsProto(expected_search_result_proto)); +} + +TEST_F(IcingSearchEngineTest, DeleteSchemaTypeByQuery) { + SchemaProto schema = CreateMessageSchema(); + // Add an email type + SchemaProto tmp = CreateEmailSchema(); + *schema.add_types() = tmp.types(0); + + DocumentProto document1 = + DocumentBuilder() + .SetKey("namespace1", "uri1") + .SetSchema(schema.types(0).schema_type()) + .AddStringProperty("body", "message body1") + .SetCreationTimestampMs(kDefaultCreationTimestampMs) + .Build(); + DocumentProto document2 = + DocumentBuilder() + .SetKey("namespace2", "uri2") + .SetSchema(schema.types(1).schema_type()) + .AddStringProperty("subject", "subject subject2") + .AddStringProperty("body", "message body2") + .SetCreationTimestampMs(kDefaultCreationTimestampMs) + .Build(); + IcingSearchEngine icing(GetDefaultIcingOptions()); EXPECT_THAT(icing.Initialize().status().code(), Eq(StatusProto::OK)); EXPECT_THAT(icing.SetSchema(schema).status().code(), Eq(StatusProto::OK)); EXPECT_THAT(icing.Put(document1).status().code(), Eq(StatusProto::OK)); @@ -1567,7 +1877,9 @@ TEST_F(IcingSearchEngineTest, DeleteBySchemaType) { // Delete the first type. The first doc should be irretrievable. The // second should still be present. 
- EXPECT_THAT(icing.DeleteBySchemaType("message").status().code(), + SearchSpecProto search_spec; + search_spec.add_schema_type_filters(schema.types(0).schema_type()); + EXPECT_THAT(icing.DeleteByQuery(search_spec).status().code(), Eq(StatusProto::OK)); expected_get_result_proto.mutable_status()->set_code(StatusProto::NOT_FOUND); @@ -1582,6 +1894,18 @@ TEST_F(IcingSearchEngineTest, DeleteBySchemaType) { *expected_get_result_proto.mutable_document() = document2; EXPECT_THAT(icing.Get("namespace2", "uri2"), EqualsProto(expected_get_result_proto)); + + search_spec = SearchSpecProto::default_instance(); + search_spec.set_query("message"); + search_spec.set_term_match_type(TermMatchType::EXACT_ONLY); + + SearchResultProto expected_search_result_proto; + expected_search_result_proto.mutable_status()->set_code(StatusProto::OK); + *expected_search_result_proto.mutable_results()->Add()->mutable_document() = + document2; + EXPECT_THAT(icing.Search(search_spec, GetDefaultScoringSpec(), + ResultSpecProto::default_instance()), + EqualsProto(expected_search_result_proto)); } TEST_F(IcingSearchEngineTest, DeleteByNamespace) { @@ -1594,6 +1918,89 @@ TEST_F(IcingSearchEngineTest, DeleteByNamespace) { .Build(); DocumentProto document2 = DocumentBuilder() + .SetKey("namespace1", "uri2") + .SetSchema("Message") + .AddStringProperty("body", "message body2") + .SetCreationTimestampMs(kDefaultCreationTimestampMs) + .Build(); + DocumentProto document3 = + DocumentBuilder() + .SetKey("namespace3", "uri3") + .SetSchema("Message") + .AddStringProperty("body", "message body2") + .SetCreationTimestampMs(kDefaultCreationTimestampMs) + .Build(); + + IcingSearchEngine icing(GetDefaultIcingOptions()); + ASSERT_THAT(icing.Initialize().status().code(), Eq(StatusProto::OK)); + ASSERT_THAT(icing.SetSchema(CreateMessageSchema()).status().code(), + Eq(StatusProto::OK)); + ASSERT_THAT(icing.Put(document1).status().code(), Eq(StatusProto::OK)); + ASSERT_THAT(icing.Put(document2).status().code(), 
Eq(StatusProto::OK)); + ASSERT_THAT(icing.Put(document3).status().code(), Eq(StatusProto::OK)); + + GetResultProto expected_get_result_proto; + expected_get_result_proto.mutable_status()->set_code(StatusProto::OK); + *expected_get_result_proto.mutable_document() = document1; + EXPECT_THAT(icing.Get("namespace1", "uri1"), + EqualsProto(expected_get_result_proto)); + + *expected_get_result_proto.mutable_document() = document2; + EXPECT_THAT(icing.Get("namespace1", "uri2"), + EqualsProto(expected_get_result_proto)); + + *expected_get_result_proto.mutable_document() = document3; + EXPECT_THAT(icing.Get("namespace3", "uri3"), + EqualsProto(expected_get_result_proto)); + + // Delete namespace1. Document1 and document2 should be irretrievable. + // Document3 should still be present. + EXPECT_THAT(icing.DeleteByNamespace("namespace1").status().code(), + Eq(StatusProto::OK)); + + expected_get_result_proto.mutable_status()->set_code(StatusProto::NOT_FOUND); + expected_get_result_proto.mutable_status()->set_message( + "Document (namespace1, uri1) not found."); + expected_get_result_proto.clear_document(); + EXPECT_THAT(icing.Get("namespace1", "uri1"), + EqualsProto(expected_get_result_proto)); + + expected_get_result_proto.mutable_status()->set_code(StatusProto::NOT_FOUND); + expected_get_result_proto.mutable_status()->set_message( + "Document (namespace1, uri2) not found."); + expected_get_result_proto.clear_document(); + EXPECT_THAT(icing.Get("namespace1", "uri2"), + EqualsProto(expected_get_result_proto)); + + expected_get_result_proto.mutable_status()->set_code(StatusProto::OK); + expected_get_result_proto.mutable_status()->clear_message(); + *expected_get_result_proto.mutable_document() = document3; + EXPECT_THAT(icing.Get("namespace3", "uri3"), + EqualsProto(expected_get_result_proto)); + + // Search for "message", only document3 should show up. 
+ SearchResultProto expected_search_result_proto; + expected_search_result_proto.mutable_status()->set_code(StatusProto::OK); + *expected_search_result_proto.mutable_results()->Add()->mutable_document() = + document3; + SearchSpecProto search_spec; + search_spec.set_term_match_type(TermMatchType::EXACT_ONLY); + search_spec.set_query("message"); + EXPECT_THAT(icing.Search(search_spec, GetDefaultScoringSpec(), + ResultSpecProto::default_instance()), + EqualsProto(expected_search_result_proto)); +} + +TEST_F(IcingSearchEngineTest, DeleteNamespaceByQuery) { + DocumentProto document1 = + DocumentBuilder() + .SetKey("namespace1", "uri1") + .SetSchema("Message") + .AddStringProperty("body", "message body1") + .SetCreationTimestampMs(kDefaultCreationTimestampMs) + .Build(); + DocumentProto document2 = + DocumentBuilder() .SetKey("namespace2", "uri2") .SetSchema("Message") .AddStringProperty("body", "message body2") @@ -1619,7 +2026,76 @@ TEST_F(IcingSearchEngineTest, DeleteByNamespace) { // Delete the first namespace. The first doc should be irretrievable. The // second should still be present. 
- EXPECT_THAT(icing.DeleteByNamespace("namespace1").status().code(), + SearchSpecProto search_spec; + search_spec.add_namespace_filters("namespace1"); + EXPECT_THAT(icing.DeleteByQuery(search_spec).status().code(), + Eq(StatusProto::OK)); + + expected_get_result_proto.mutable_status()->set_code(StatusProto::NOT_FOUND); + expected_get_result_proto.mutable_status()->set_message( + "Document (namespace1, uri1) not found."); + expected_get_result_proto.clear_document(); + EXPECT_THAT(icing.Get("namespace1", "uri1"), + EqualsProto(expected_get_result_proto)); + + expected_get_result_proto.mutable_status()->set_code(StatusProto::OK); + expected_get_result_proto.mutable_status()->clear_message(); + *expected_get_result_proto.mutable_document() = document2; + EXPECT_THAT(icing.Get("namespace2", "uri2"), + EqualsProto(expected_get_result_proto)); + + search_spec = SearchSpecProto::default_instance(); + search_spec.set_query("message"); + search_spec.set_term_match_type(TermMatchType::EXACT_ONLY); + + SearchResultProto expected_search_result_proto; + expected_search_result_proto.mutable_status()->set_code(StatusProto::OK); + *expected_search_result_proto.mutable_results()->Add()->mutable_document() = + document2; + EXPECT_THAT(icing.Search(search_spec, GetDefaultScoringSpec(), + ResultSpecProto::default_instance()), + EqualsProto(expected_search_result_proto)); +} + +TEST_F(IcingSearchEngineTest, DeleteByQuery) { + DocumentProto document1 = + DocumentBuilder() + .SetKey("namespace1", "uri1") + .SetSchema("Message") + .AddStringProperty("body", "message body1") + .SetCreationTimestampMs(kDefaultCreationTimestampMs) + .Build(); + DocumentProto document2 = + DocumentBuilder() + .SetKey("namespace2", "uri2") + .SetSchema("Message") + .AddStringProperty("body", "message body2") + .SetCreationTimestampMs(kDefaultCreationTimestampMs) + .Build(); + + IcingSearchEngine icing(GetDefaultIcingOptions()); + EXPECT_THAT(icing.Initialize().status().code(), Eq(StatusProto::OK)); + 
EXPECT_THAT(icing.SetSchema(CreateMessageSchema()).status().code(), + Eq(StatusProto::OK)); + EXPECT_THAT(icing.Put(document1).status().code(), Eq(StatusProto::OK)); + EXPECT_THAT(icing.Put(document2).status().code(), Eq(StatusProto::OK)); + + GetResultProto expected_get_result_proto; + expected_get_result_proto.mutable_status()->set_code(StatusProto::OK); + *expected_get_result_proto.mutable_document() = document1; + EXPECT_THAT(icing.Get("namespace1", "uri1"), + EqualsProto(expected_get_result_proto)); + + *expected_get_result_proto.mutable_document() = document2; + EXPECT_THAT(icing.Get("namespace2", "uri2"), + EqualsProto(expected_get_result_proto)); + + // Delete all docs containing 'body1'. The first doc should be irretrievable. + // The second should still be present. + SearchSpecProto search_spec; + search_spec.set_query("body1"); + search_spec.set_term_match_type(TermMatchType::EXACT_ONLY); + EXPECT_THAT(icing.DeleteByQuery(search_spec).status().code(), Eq(StatusProto::OK)); expected_get_result_proto.mutable_status()->set_code(StatusProto::NOT_FOUND); @@ -1634,6 +2110,86 @@ TEST_F(IcingSearchEngineTest, DeleteByNamespace) { *expected_get_result_proto.mutable_document() = document2; EXPECT_THAT(icing.Get("namespace2", "uri2"), EqualsProto(expected_get_result_proto)); + + search_spec = SearchSpecProto::default_instance(); + search_spec.set_query("message"); + search_spec.set_term_match_type(TermMatchType::EXACT_ONLY); + + SearchResultProto expected_search_result_proto; + expected_search_result_proto.mutable_status()->set_code(StatusProto::OK); + *expected_search_result_proto.mutable_results()->Add()->mutable_document() = + document2; + EXPECT_THAT(icing.Search(search_spec, GetDefaultScoringSpec(), + ResultSpecProto::default_instance()), + EqualsProto(expected_search_result_proto)); +} + +TEST_F(IcingSearchEngineTest, DeleteByQueryNotFound) { + DocumentProto document1 = + DocumentBuilder() + .SetKey("namespace1", "uri1") + .SetSchema("Message") + 
.AddStringProperty("body", "message body1") + .SetCreationTimestampMs(kDefaultCreationTimestampMs) + .Build(); + DocumentProto document2 = + DocumentBuilder() + .SetKey("namespace2", "uri2") + .SetSchema("Message") + .AddStringProperty("body", "message body2") + .SetCreationTimestampMs(kDefaultCreationTimestampMs) + .Build(); + + IcingSearchEngine icing(GetDefaultIcingOptions()); + EXPECT_THAT(icing.Initialize().status().code(), Eq(StatusProto::OK)); + EXPECT_THAT(icing.SetSchema(CreateMessageSchema()).status().code(), + Eq(StatusProto::OK)); + EXPECT_THAT(icing.Put(document1).status().code(), Eq(StatusProto::OK)); + EXPECT_THAT(icing.Put(document2).status().code(), Eq(StatusProto::OK)); + + GetResultProto expected_get_result_proto; + expected_get_result_proto.mutable_status()->set_code(StatusProto::OK); + *expected_get_result_proto.mutable_document() = document1; + EXPECT_THAT(icing.Get("namespace1", "uri1"), + EqualsProto(expected_get_result_proto)); + + *expected_get_result_proto.mutable_document() = document2; + EXPECT_THAT(icing.Get("namespace2", "uri2"), + EqualsProto(expected_get_result_proto)); + + // Delete all docs containing 'foo', which should be none of them. Both docs + // should still be present. 
+ SearchSpecProto search_spec; + search_spec.set_query("foo"); + search_spec.set_term_match_type(TermMatchType::EXACT_ONLY); + EXPECT_THAT(icing.DeleteByQuery(search_spec).status().code(), + Eq(StatusProto::NOT_FOUND)); + + expected_get_result_proto.mutable_status()->set_code(StatusProto::OK); + expected_get_result_proto.mutable_status()->clear_message(); + *expected_get_result_proto.mutable_document() = document1; + EXPECT_THAT(icing.Get("namespace1", "uri1"), + EqualsProto(expected_get_result_proto)); + + expected_get_result_proto.mutable_status()->set_code(StatusProto::OK); + expected_get_result_proto.mutable_status()->clear_message(); + *expected_get_result_proto.mutable_document() = document2; + EXPECT_THAT(icing.Get("namespace2", "uri2"), + EqualsProto(expected_get_result_proto)); + + search_spec = SearchSpecProto::default_instance(); + search_spec.set_query("message"); + search_spec.set_term_match_type(TermMatchType::EXACT_ONLY); + + SearchResultProto expected_search_result_proto; + expected_search_result_proto.mutable_status()->set_code(StatusProto::OK); + *expected_search_result_proto.mutable_results()->Add()->mutable_document() = + document2; + *expected_search_result_proto.mutable_results()->Add()->mutable_document() = + document1; + EXPECT_THAT(icing.Search(search_spec, GetDefaultScoringSpec(), + ResultSpecProto::default_instance()), + EqualsProto(expected_search_result_proto)); } TEST_F(IcingSearchEngineTest, SetSchemaShouldWorkAfterOptimization) { diff --git a/icing/index/index-processor_benchmark.cc b/icing/index/index-processor_benchmark.cc index 00d116f..eb01731 100644 --- a/icing/index/index-processor_benchmark.cc +++ b/icing/index/index-processor_benchmark.cc @@ -31,6 +31,7 @@ #include "icing/transform/normalizer-factory.h" #include "icing/transform/normalizer.h" #include "icing/util/logging.h" +#include "unicode/uloc.h" // Run on a Linux workstation: // $ blaze build -c opt --dynamic_mode=off --copt=-gmlt @@ -192,8 +193,9 @@ void 
BM_IndexDocumentWithOneProperty(benchmark::State& state) { CleanUp(filesystem, index_dir); std::unique_ptr<Index> index = CreateIndex(filesystem, index_dir); + language_segmenter_factory::SegmenterOptions options(ULOC_US); std::unique_ptr<LanguageSegmenter> language_segmenter = - language_segmenter_factory::Create().ValueOrDie(); + language_segmenter_factory::Create(std::move(options)).ValueOrDie(); std::unique_ptr<Normalizer> normalizer = CreateNormalizer(); std::unique_ptr<SchemaStore> schema_store = CreateSchemaStore(); std::unique_ptr<IndexProcessor> index_processor = @@ -239,8 +241,9 @@ void BM_IndexDocumentWithTenProperties(benchmark::State& state) { CleanUp(filesystem, index_dir); std::unique_ptr<Index> index = CreateIndex(filesystem, index_dir); + language_segmenter_factory::SegmenterOptions options(ULOC_US); std::unique_ptr<LanguageSegmenter> language_segmenter = - language_segmenter_factory::Create().ValueOrDie(); + language_segmenter_factory::Create(std::move(options)).ValueOrDie(); std::unique_ptr<Normalizer> normalizer = CreateNormalizer(); std::unique_ptr<SchemaStore> schema_store = CreateSchemaStore(); std::unique_ptr<IndexProcessor> index_processor = @@ -287,8 +290,9 @@ void BM_IndexDocumentWithDiacriticLetters(benchmark::State& state) { CleanUp(filesystem, index_dir); std::unique_ptr<Index> index = CreateIndex(filesystem, index_dir); + language_segmenter_factory::SegmenterOptions options(ULOC_US); std::unique_ptr<LanguageSegmenter> language_segmenter = - language_segmenter_factory::Create().ValueOrDie(); + language_segmenter_factory::Create(std::move(options)).ValueOrDie(); std::unique_ptr<Normalizer> normalizer = CreateNormalizer(); std::unique_ptr<SchemaStore> schema_store = CreateSchemaStore(); std::unique_ptr<IndexProcessor> index_processor = @@ -335,8 +339,9 @@ void BM_IndexDocumentWithHiragana(benchmark::State& state) { CleanUp(filesystem, index_dir); std::unique_ptr<Index> index = CreateIndex(filesystem, index_dir); + 
language_segmenter_factory::SegmenterOptions options(ULOC_US); std::unique_ptr<LanguageSegmenter> language_segmenter = - language_segmenter_factory::Create().ValueOrDie(); + language_segmenter_factory::Create(std::move(options)).ValueOrDie(); std::unique_ptr<Normalizer> normalizer = CreateNormalizer(); std::unique_ptr<SchemaStore> schema_store = CreateSchemaStore(); std::unique_ptr<IndexProcessor> index_processor = diff --git a/icing/index/index-processor_test.cc b/icing/index/index-processor_test.cc index 8dfb9c2..824c440 100644 --- a/icing/index/index-processor_test.cc +++ b/icing/index/index-processor_test.cc @@ -47,6 +47,7 @@ #include "icing/tokenization/language-segmenter.h" #include "icing/transform/normalizer-factory.h" #include "icing/transform/normalizer.h" +#include "unicode/uloc.h" namespace icing { namespace lib { @@ -91,8 +92,10 @@ class IndexProcessorTest : public Test { ICING_ASSERT_OK_AND_ASSIGN(index_, Index::Create(options, &icing_filesystem_)); - ICING_ASSERT_OK_AND_ASSIGN(lang_segmenter_, - language_segmenter_factory::Create()); + language_segmenter_factory::SegmenterOptions segmenter_options(ULOC_US); + ICING_ASSERT_OK_AND_ASSIGN( + lang_segmenter_, + language_segmenter_factory::Create(std::move(segmenter_options))); ICING_ASSERT_OK_AND_ASSIGN( normalizer_, diff --git a/icing/index/index.cc b/icing/index/index.cc index d4a2508..0b014d9 100644 --- a/icing/index/index.cc +++ b/icing/index/index.cc @@ -24,8 +24,8 @@ #include "icing/absl_ports/canonical_errors.h" #include "icing/absl_ports/str_cat.h" #include "icing/index/hit/hit.h" -#include "icing/index/iterator/doc-hit-info-iterator-term.h" #include "icing/index/iterator/doc-hit-info-iterator.h" +#include "icing/index/lite/doc-hit-info-iterator-term-lite.h" #include "icing/index/lite/lite-index.h" #include "icing/index/term-id-codec.h" #include "icing/index/term-property-id.h" @@ -102,10 +102,10 @@ Index::GetIterator(const std::string& term, SectionIdMask section_id_mask, TermMatchType::Code 
term_match_type) { switch (term_match_type) { case TermMatchType::EXACT_ONLY: - return std::make_unique<DocHitInfoIteratorTermExact>( + return std::make_unique<DocHitInfoIteratorTermLiteExact>( term_id_codec_.get(), lite_index_.get(), term, section_id_mask); case TermMatchType::PREFIX: - return std::make_unique<DocHitInfoIteratorTermPrefix>( + return std::make_unique<DocHitInfoIteratorTermLitePrefix>( term_id_codec_.get(), lite_index_.get(), term, section_id_mask); default: return absl_ports::InvalidArgumentError( @@ -159,13 +159,18 @@ libtextclassifier3::Status Index::Editor::AddHit(const char* term, Hit::Score score) { // Step 1: See if this term is already in the lexicon uint32_t tvi; - auto tvi_or = lite_index_->FindTerm(term); + auto tvi_or = lite_index_->GetTermId(term); // Step 2: Update the lexicon, either add the term or update its properties if (tvi_or.ok()) { + tvi = tvi_or.ValueOrDie(); + if (seen_tokens_.find(tvi) != seen_tokens_.end()) { + ICING_VLOG(1) << "A hit for term " << term + << " has already been added. Skipping."; + return libtextclassifier3::Status::OK; + } ICING_VLOG(1) << "Term " << term << " is already present in lexicon. Updating."; - tvi = tvi_or.ValueOrDie(); // Already in the lexicon. Just update the properties. 
ICING_RETURN_IF_ERROR(lite_index_->UpdateTermProperties( tvi, term_match_type_ == TermMatchType::PREFIX, namespace_id_)); @@ -175,6 +180,7 @@ libtextclassifier3::Status Index::Editor::AddHit(const char* term, ICING_ASSIGN_OR_RETURN( tvi, lite_index_->InsertTerm(term, term_match_type_, namespace_id_)); } + seen_tokens_.insert(tvi); // Step 3: Add the hit itself Hit hit(section_id_, document_id_, score, diff --git a/icing/index/index_test.cc b/icing/index/index_test.cc index 070e82a..f7ca285 100644 --- a/icing/index/index_test.cc +++ b/icing/index/index_test.cc @@ -37,6 +37,7 @@ #include "icing/testing/common-matchers.h" #include "icing/testing/random-string.h" #include "icing/testing/tmp-directory.h" +#include "icing/util/crc32.h" namespace icing { namespace lib { @@ -48,6 +49,7 @@ using ::testing::Eq; using ::testing::Gt; using ::testing::IsEmpty; using ::testing::IsTrue; +using ::testing::Ne; using ::testing::NiceMock; using ::testing::Not; using ::testing::SizeIs; @@ -255,11 +257,16 @@ TEST_F(IndexTest, MultiHitSectionRestrict) { } TEST_F(IndexTest, SingleHitDedupeIndex) { + Crc32 empty_crc = index_->ComputeChecksum(); // Act Index::Editor edit = index_->Edit( kDocumentId0, kSectionId2, TermMatchType::EXACT_ONLY, /*namespace_id=*/0); EXPECT_THAT(edit.AddHit("foo"), IsOk()); + Crc32 first_hit_crc = index_->ComputeChecksum(); + EXPECT_THAT(first_hit_crc.Get(), Ne(empty_crc.Get())); EXPECT_THAT(edit.AddHit("foo"), IsOk()); + Crc32 second_hit_crc = index_->ComputeChecksum(); + EXPECT_THAT(second_hit_crc.Get(), Eq(first_hit_crc.Get())); // Assert ICING_ASSERT_OK_AND_ASSIGN( diff --git a/icing/index/iterator/doc-hit-info-iterator-filter.cc b/icing/index/iterator/doc-hit-info-iterator-filter.cc index 482a5ab..c6cb86d 100644 --- a/icing/index/iterator/doc-hit-info-iterator-filter.cc +++ b/icing/index/iterator/doc-hit-info-iterator-filter.cc @@ -82,12 +82,10 @@ libtextclassifier3::Status DocHitInfoIteratorFilter::Advance() { "Couldn't get current time. 
Try again in a bit"); } - if (options_.filter_deleted) { - if (!document_store_.DoesDocumentExist( - delegate_->doc_hit_info().document_id())) { - // Document doesn't exist, keep searching - return Advance(); - } + if (!document_store_.DoesDocumentExist( + delegate_->doc_hit_info().document_id())) { + // Document doesn't exist, keep searching + return Advance(); } // Try to get the DocumentFilterData diff --git a/icing/index/iterator/doc-hit-info-iterator-filter.h b/icing/index/iterator/doc-hit-info-iterator-filter.h index bf027e4..9119610 100644 --- a/icing/index/iterator/doc-hit-info-iterator-filter.h +++ b/icing/index/iterator/doc-hit-info-iterator-filter.h @@ -37,10 +37,6 @@ namespace lib { class DocHitInfoIteratorFilter : public DocHitInfoIterator { public: struct Options { - // Filter out/don't return DocHitInfos that are associated with nonexistent - // Documents. - bool filter_deleted = true; - // List of namespaces that documents must have. An empty vector means that // all namespaces are valid, and no documents will be filtered out. 
// diff --git a/icing/index/iterator/doc-hit-info-iterator-filter_test.cc b/icing/index/iterator/doc-hit-info-iterator-filter_test.cc index e769013..9eb147a 100644 --- a/icing/index/iterator/doc-hit-info-iterator-filter_test.cc +++ b/icing/index/iterator/doc-hit-info-iterator-filter_test.cc @@ -105,33 +105,6 @@ TEST_F(DocHitInfoIteratorDeletedFilterTest, EmptyOriginalIterator) { EXPECT_THAT(GetDocumentIds(&filtered_iterator), IsEmpty()); } -TEST_F(DocHitInfoIteratorDeletedFilterTest, TurnOffDeletedFilterOk) { - ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id1, - document_store_->Put(test_document1_)); - ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id2, - document_store_->Put(test_document2_)); - ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id3, - document_store_->Put(test_document3_)); - - // Deletes test document 2 - ICING_ASSERT_OK(document_store_->Delete(test_document2_.namespace_(), - test_document2_.uri())); - - std::vector<DocHitInfo> doc_hit_infos = {DocHitInfo(document_id1), - DocHitInfo(document_id2), - DocHitInfo(document_id3)}; - std::unique_ptr<DocHitInfoIterator> original_iterator = - std::make_unique<DocHitInfoIteratorDummy>(doc_hit_infos); - - options_.filter_deleted = false; - DocHitInfoIteratorFilter filtered_iterator( - std::move(original_iterator), document_store_.get(), schema_store_.get(), - &fake_clock_, options_); - - EXPECT_THAT(GetDocumentIds(&filtered_iterator), - ElementsAre(document_id1, document_id2, document_id3)); -} - TEST_F(DocHitInfoIteratorDeletedFilterTest, DeletedDocumentsAreFiltered) { ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id1, document_store_->Put(test_document1_)); diff --git a/icing/index/iterator/doc-hit-info-iterator-term.cc b/icing/index/lite/doc-hit-info-iterator-term-lite.cc index 97ca3c4..1f1c296 100644 --- a/icing/index/iterator/doc-hit-info-iterator-term.cc +++ b/icing/index/lite/doc-hit-info-iterator-term-lite.cc @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and 
// limitations under the License. -#include "icing/index/iterator/doc-hit-info-iterator-term.h" +#include "icing/index/lite/doc-hit-info-iterator-term-lite.h" #include <cstdint> @@ -40,7 +40,7 @@ std::string SectionIdMaskToString(SectionIdMask section_id_mask) { } // namespace -libtextclassifier3::Status DocHitInfoIteratorTerm::Advance() { +libtextclassifier3::Status DocHitInfoIteratorTermLite::Advance() { if (cached_hits_idx_ == -1) { ICING_RETURN_IF_ERROR(RetrieveMoreHits()); } else { @@ -59,9 +59,9 @@ libtextclassifier3::Status DocHitInfoIteratorTerm::Advance() { return libtextclassifier3::Status::OK; } -libtextclassifier3::Status DocHitInfoIteratorTermExact::RetrieveMoreHits() { +libtextclassifier3::Status DocHitInfoIteratorTermLiteExact::RetrieveMoreHits() { // Exact match only. All hits in lite lexicon are exact. - ICING_ASSIGN_OR_RETURN(uint32_t tvi, lite_index_->FindTerm(term_)); + ICING_ASSIGN_OR_RETURN(uint32_t tvi, lite_index_->GetTermId(term_)); ICING_ASSIGN_OR_RETURN(uint32_t term_id, term_id_codec_->EncodeTvi(tvi, TviType::LITE)); lite_index_->AppendHits(term_id, section_restrict_mask_, @@ -70,12 +70,13 @@ libtextclassifier3::Status DocHitInfoIteratorTermExact::RetrieveMoreHits() { return libtextclassifier3::Status::OK; } -std::string DocHitInfoIteratorTermExact::ToString() const { +std::string DocHitInfoIteratorTermLiteExact::ToString() const { return absl_ports::StrCat(SectionIdMaskToString(section_restrict_mask_), ":", term_); } -libtextclassifier3::Status DocHitInfoIteratorTermPrefix::RetrieveMoreHits() { +libtextclassifier3::Status +DocHitInfoIteratorTermLitePrefix::RetrieveMoreHits() { // Take union of lite terms. 
int term_len = term_.length(); int terms_matched = 0; @@ -97,7 +98,7 @@ libtextclassifier3::Status DocHitInfoIteratorTermPrefix::RetrieveMoreHits() { return libtextclassifier3::Status::OK; } -void DocHitInfoIteratorTermPrefix::SortAndDedupeDocumentIds() { +void DocHitInfoIteratorTermLitePrefix::SortAndDedupeDocumentIds() { // Re-sort cached document_ids and merge sections. sort(cached_hits_.begin(), cached_hits_.end()); @@ -116,7 +117,7 @@ void DocHitInfoIteratorTermPrefix::SortAndDedupeDocumentIds() { cached_hits_.resize(idx + 1); } -std::string DocHitInfoIteratorTermPrefix::ToString() const { +std::string DocHitInfoIteratorTermLitePrefix::ToString() const { return absl_ports::StrCat(SectionIdMaskToString(section_restrict_mask_), ":", term_, "*"); } diff --git a/icing/index/iterator/doc-hit-info-iterator-term.h b/icing/index/lite/doc-hit-info-iterator-term-lite.h index 21d1dd6..bd2de6d 100644 --- a/icing/index/iterator/doc-hit-info-iterator-term.h +++ b/icing/index/lite/doc-hit-info-iterator-term-lite.h @@ -12,8 +12,8 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#ifndef ICING_INDEX_ITERATOR_DOC_HIT_INFO_ITERATOR_TERM_H_ -#define ICING_INDEX_ITERATOR_DOC_HIT_INFO_ITERATOR_TERM_H_ +#ifndef ICING_INDEX_ITERATOR_DOC_HIT_INFO_ITERATOR_TERM_LITE_H_ +#define ICING_INDEX_ITERATOR_DOC_HIT_INFO_ITERATOR_TERM_LITE_H_ #include <cstdint> #include <vector> @@ -28,11 +28,12 @@ namespace icing { namespace lib { -class DocHitInfoIteratorTerm : public DocHitInfoIterator { +class DocHitInfoIteratorTermLite : public DocHitInfoIterator { public: - explicit DocHitInfoIteratorTerm(const TermIdCodec* term_id_codec, - LiteIndex* lite_index, const std::string term, - SectionIdMask section_restrict_mask) + explicit DocHitInfoIteratorTermLite(const TermIdCodec* term_id_codec, + LiteIndex* lite_index, + const std::string& term, + SectionIdMask section_restrict_mask) : term_(term), lite_index_(lite_index), cached_hits_idx_(-1), @@ -66,14 +67,14 @@ class DocHitInfoIteratorTerm : public DocHitInfoIterator { const SectionIdMask section_restrict_mask_; }; -class DocHitInfoIteratorTermExact : public DocHitInfoIteratorTerm { +class DocHitInfoIteratorTermLiteExact : public DocHitInfoIteratorTermLite { public: - explicit DocHitInfoIteratorTermExact(const TermIdCodec* term_id_codec, - LiteIndex* lite_index, - const std::string& term, - SectionIdMask section_id_mask) - : DocHitInfoIteratorTerm(term_id_codec, lite_index, term, - section_id_mask) {} + explicit DocHitInfoIteratorTermLiteExact(const TermIdCodec* term_id_codec, + LiteIndex* lite_index, + const std::string& term, + SectionIdMask section_id_mask) + : DocHitInfoIteratorTermLite(term_id_codec, lite_index, term, + section_id_mask) {} std::string ToString() const override; @@ -81,14 +82,14 @@ class DocHitInfoIteratorTermExact : public DocHitInfoIteratorTerm { libtextclassifier3::Status RetrieveMoreHits() override; }; -class DocHitInfoIteratorTermPrefix : public DocHitInfoIteratorTerm { +class DocHitInfoIteratorTermLitePrefix : public DocHitInfoIteratorTermLite { public: - explicit 
DocHitInfoIteratorTermPrefix(const TermIdCodec* term_id_codec, - LiteIndex* lite_index, - const std::string& term, - SectionIdMask section_id_mask) - : DocHitInfoIteratorTerm(term_id_codec, lite_index, term, - section_id_mask) {} + explicit DocHitInfoIteratorTermLitePrefix(const TermIdCodec* term_id_codec, + LiteIndex* lite_index, + const std::string& term, + SectionIdMask section_id_mask) + : DocHitInfoIteratorTermLite(term_id_codec, lite_index, term, + section_id_mask) {} std::string ToString() const override; @@ -105,4 +106,4 @@ class DocHitInfoIteratorTermPrefix : public DocHitInfoIteratorTerm { } // namespace lib } // namespace icing -#endif // ICING_INDEX_ITERATOR_DOC_HIT_INFO_ITERATOR_TERM_H_ +#endif // ICING_INDEX_ITERATOR_DOC_HIT_INFO_ITERATOR_TERM_LITE_H_ diff --git a/icing/index/lite/lite-index.cc b/icing/index/lite/lite-index.cc index a72402e..89240ee 100644 --- a/icing/index/lite/lite-index.cc +++ b/icing/index/lite/lite-index.cc @@ -65,8 +65,8 @@ size_t header_size() { return sizeof(IcingLiteIndex_HeaderImpl::HeaderData); } } // namespace -const LiteIndex::Element::Value LiteIndex::Element::kInvalidValue = - LiteIndex::Element(0, Hit()).value(); +const TermIdHitPair::Value TermIdHitPair::kInvalidValue = + TermIdHitPair(0, Hit()).value(); libtextclassifier3::StatusOr<std::unique_ptr<LiteIndex>> LiteIndex::Create( const LiteIndex::Options& options, const IcingFilesystem* filesystem) { @@ -163,7 +163,7 @@ libtextclassifier3::Status LiteIndex::Initialize() { header_->Reset(); if (!hit_buffer_.Init(hit_buffer_fd_.get(), header_padded_size, true, - sizeof(Element::Value), header_->cur_size(), + sizeof(TermIdHitPair::Value), header_->cur_size(), options_.hit_buffer_size, &hit_buffer_crc_, true)) { status = absl_ports::InternalError("Failed to initialize new hit buffer"); goto error; @@ -177,7 +177,7 @@ libtextclassifier3::Status LiteIndex::Initialize() { header_mmap_.address())); if (!hit_buffer_.Init(hit_buffer_fd_.get(), header_padded_size, true, - 
sizeof(Element::Value), header_->cur_size(), + sizeof(TermIdHitPair::Value), header_->cur_size(), options_.hit_buffer_size, &hit_buffer_crc_, true)) { status = absl_ports::InternalError( "Failed to re-initialize existing hit buffer"); @@ -312,20 +312,21 @@ libtextclassifier3::Status LiteIndex::AddHit(uint32_t term_id, const Hit& hit) { header_->set_last_added_docid(hit.document_id()); - Element elt(term_id, hit); + TermIdHitPair term_id_hit_pair(term_id, hit); uint32_t cur_size = header_->cur_size(); - Element::Value* valp = hit_buffer_.GetMutableMem<Element::Value>(cur_size, 1); + TermIdHitPair::Value* valp = + hit_buffer_.GetMutableMem<TermIdHitPair::Value>(cur_size, 1); if (valp == nullptr) { return absl_ports::ResourceExhaustedError( "Allocating more space in hit buffer failed!"); } - *valp = elt.value(); + *valp = term_id_hit_pair.value(); header_->set_cur_size(cur_size + 1); return libtextclassifier3::Status::OK; } -libtextclassifier3::StatusOr<uint32_t> LiteIndex::FindTerm( +libtextclassifier3::StatusOr<uint32_t> LiteIndex::GetTermId( const std::string& term) const { char dummy; uint32_t tvi; @@ -336,16 +337,17 @@ libtextclassifier3::StatusOr<uint32_t> LiteIndex::FindTerm( return tvi; } -uint32_t LiteIndex::AppendHits(uint32_t term_id, SectionIdMask section_id_mask, - bool only_from_prefix_sections, - std::vector<DocHitInfo>* hits_out) { - uint32_t count = 0; +int LiteIndex::AppendHits(uint32_t term_id, SectionIdMask section_id_mask, + bool only_from_prefix_sections, + std::vector<DocHitInfo>* hits_out) { + int count = 0; DocumentId last_document_id = kInvalidDocumentId; for (uint32_t idx = Seek(term_id); idx < header_->cur_size(); idx++) { - Element elt(hit_buffer_.array_cast<Element>()[idx]); - if (elt.term_id() != term_id) break; + TermIdHitPair term_id_hit_pair( + hit_buffer_.array_cast<TermIdHitPair>()[idx]); + if (term_id_hit_pair.term_id() != term_id) break; - const Hit& hit = elt.hit(); + const Hit& hit = term_id_hit_pair.hit(); // Check sections. 
if (((1u << hit.section_id()) & section_id_mask) == 0) { continue; @@ -356,7 +358,7 @@ uint32_t LiteIndex::AppendHits(uint32_t term_id, SectionIdMask section_id_mask, } DocumentId document_id = hit.document_id(); if (document_id != last_document_id) { - count++; + ++count; if (hits_out != nullptr) { hits_out->push_back(DocHitInfo(document_id)); } @@ -369,7 +371,7 @@ uint32_t LiteIndex::AppendHits(uint32_t term_id, SectionIdMask section_id_mask, return count; } -uint32_t LiteIndex::CountHits(uint32_t term_id) { +int LiteIndex::CountHits(uint32_t term_id) { return AppendHits(term_id, kSectionIdMaskAll, /*only_from_prefix_sections=*/false, /*hits_out=*/nullptr); @@ -421,8 +423,8 @@ uint32_t LiteIndex::Seek(uint32_t term_id) { IcingTimer timer; auto* array_start = - hit_buffer_.GetMutableMem<Element::Value>(0, header_->cur_size()); - Element::Value* sort_start = array_start + header_->searchable_end(); + hit_buffer_.GetMutableMem<TermIdHitPair::Value>(0, header_->cur_size()); + TermIdHitPair::Value* sort_start = array_start + header_->searchable_end(); std::sort(sort_start, array_start + header_->cur_size()); // Now merge with previous region. Since the previous region is already @@ -445,11 +447,13 @@ uint32_t LiteIndex::Seek(uint32_t term_id) { // Binary search for our term_id. Make sure we get the first // element. Using kBeginSortValue ensures this for the hit value. 
- Element elt(term_id, Hit(Hit::kMaxDocumentIdSortValue, Hit::kMaxHitScore)); + TermIdHitPair term_id_hit_pair( + term_id, Hit(Hit::kMaxDocumentIdSortValue, Hit::kMaxHitScore)); - const Element::Value* array = hit_buffer_.array_cast<Element::Value>(); - const Element::Value* ptr = - std::lower_bound(array, array + header_->cur_size(), elt.value()); + const TermIdHitPair::Value* array = + hit_buffer_.array_cast<TermIdHitPair::Value>(); + const TermIdHitPair::Value* ptr = std::lower_bound( + array, array + header_->cur_size(), term_id_hit_pair.value()); return ptr - array; } diff --git a/icing/index/lite/lite-index.h b/icing/index/lite/lite-index.h index b60a947..27ccf33 100644 --- a/icing/index/lite/lite-index.h +++ b/icing/index/lite/lite-index.h @@ -30,6 +30,7 @@ #include "icing/file/filesystem.h" #include "icing/index/hit/doc-hit-info.h" #include "icing/index/hit/hit.h" +#include "icing/index/lite/term-id-hit-pair.h" #include "icing/legacy/index/icing-array-storage.h" #include "icing/legacy/index/icing-dynamic-trie.h" #include "icing/legacy/index/icing-filesystem.h" @@ -49,49 +50,6 @@ namespace lib { class LiteIndex { public: // An entry in the hit buffer. - class Element { - public: - // Layout bits: 24 termid + 32 hit value + 8 hit score. - using Value = uint64_t; - - static constexpr int kTermIdBits = 24; - static constexpr int kHitValueBits = sizeof(Hit::Value) * 8; - static constexpr int kHitScoreBits = sizeof(Hit::Score) * 8; - - static const Value kInvalidValue; - - explicit Element(Value v = kInvalidValue) : value_(v) {} - - Element(uint32_t term_id, const Hit& hit) { - static_assert( - kTermIdBits + kHitValueBits + kHitScoreBits <= sizeof(Value) * 8, - "LiteIndexElementTooBig"); - - value_ = 0; - // Term id goes into the most significant bits because it takes - // precedent in sorts. 
- bit_util::BitfieldSet(term_id, kHitValueBits + kHitScoreBits, kTermIdBits, - &value_); - bit_util::BitfieldSet(hit.value(), kHitScoreBits, kHitValueBits, &value_); - bit_util::BitfieldSet(hit.score(), 0, kHitScoreBits, &value_); - } - - uint32_t term_id() const { - return bit_util::BitfieldGet(value_, kHitValueBits + kHitScoreBits, - kTermIdBits); - } - - Hit hit() const { - return Hit(bit_util::BitfieldGet(value_, kHitScoreBits, kHitValueBits), - bit_util::BitfieldGet(value_, 0, kHitScoreBits)); - } - - Value value() const { return value_; } - - private: - Value value_; - }; - using Options = IcingLiteIndexOptions; // Updates checksum of subcomponents. @@ -126,7 +84,7 @@ class LiteIndex { Crc32 ComputeChecksum(); // Returns term_id if term found, NOT_FOUND otherwise. - libtextclassifier3::StatusOr<uint32_t> FindTerm( + libtextclassifier3::StatusOr<uint32_t> GetTermId( const std::string& term) const; // Returns an iterator for all terms for which 'prefix' is a prefix. @@ -170,25 +128,89 @@ class LiteIndex { NamespaceId namespace_id); // Append hit to buffer. term_id must be encoded using the same term_id_codec - // supplied to the index constructor. Returns non-OK if hit cannot be added - // (either due to hit buffer or file system capacity reached). + // supplied to the index constructor. + // RETURNS: + // - OK if hit was successfully added + // - RESOURCE_EXHAUSTED if hit could not be added (either due to hit buffer + // or file system capacity reached). libtextclassifier3::Status AddHit(uint32_t term_id, const Hit& hit); // Add all hits with term_id from the sections specified in section_id_mask, // skipping hits in non-prefix sections if only_from_prefix_sections is true, - // to hits_out. - uint32_t AppendHits(uint32_t term_id, SectionIdMask section_id_mask, - bool only_from_prefix_sections, - std::vector<DocHitInfo>* hits_out); + // to hits_out. If hits_out is nullptr, no hits will be added. 
+ // + // Returns the number of hits that would be added to hits_out. + int AppendHits(uint32_t term_id, SectionIdMask section_id_mask, + bool only_from_prefix_sections, + std::vector<DocHitInfo>* hits_out); // Returns the hit count of the term. - uint32_t CountHits(uint32_t term_id); + int CountHits(uint32_t term_id); // Check if buffer has reached its capacity. bool is_full() const; + bool empty() const { return size() == 0; } + + uint32_t size() const { return header_->cur_size(); } + + class const_iterator { + friend class LiteIndex; + + public: + using iterator_category = std::forward_iterator_tag; + using value_type = TermIdHitPair; + using reference = const value_type&; + using pointer = const value_type*; + + const_iterator() : const_iterator(nullptr, -1, -1) {} + + reference operator*() const { return start_[position_]; } + + pointer operator->() const { return start_ + position_; } + + const_iterator& operator++() { + if (++position_ >= end_position_) { + start_ = nullptr; + position_ = -1; + end_position_ = -1; + } + return *this; + } + + const_iterator operator++(int) { + auto tmp = *this; + ++*this; + return tmp; + } + + bool operator!=(const const_iterator& rhs) { return !(*this == rhs); } + + bool operator==(const const_iterator& rhs) { + return start_ == rhs.start_ && position_ == rhs.position_; + } + + private: + explicit const_iterator(const TermIdHitPair* start, int position, + int end_position) + : start_(start), position_(position), end_position_(end_position) {} + + const TermIdHitPair* start_; + int position_; + int end_position_; + }; + + const_iterator begin() const { + // If the LiteIndex is empty, just return end(). + return empty() ? 
end() + : const_iterator(hit_buffer_.array_cast<TermIdHitPair>(), 0, + header_->cur_size()); + } + + const_iterator end() const { return const_iterator(); } + constexpr static uint32_t max_hit_buffer_size() { - return std::numeric_limits<uint32_t>::max() / sizeof(LiteIndex::Element); + return std::numeric_limits<uint32_t>::max() / sizeof(TermIdHitPair); } // We keep track of the last added document_id. This is always the largest diff --git a/icing/index/lite/term-id-hit-pair.h b/icing/index/lite/term-id-hit-pair.h new file mode 100644 index 0000000..191f766 --- /dev/null +++ b/icing/index/lite/term-id-hit-pair.h @@ -0,0 +1,80 @@ +// Copyright (C) 2019 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef ICING_INDEX_TERM_ID_HIT_PAIR_H_ +#define ICING_INDEX_TERM_ID_HIT_PAIR_H_ + +#include <cstdint> +#include <limits> +#include <memory> +#include <string> +#include <vector> + +#include "icing/index/hit/hit.h" +#include "icing/util/bit-util.h" + +namespace icing { +namespace lib { + +class TermIdHitPair { + public: + // Layout bits: 24 termid + 32 hit value + 8 hit score. 
+ using Value = uint64_t; + + static constexpr int kTermIdBits = 24; + static constexpr int kHitValueBits = sizeof(Hit::Value) * 8; + static constexpr int kHitScoreBits = sizeof(Hit::Score) * 8; + + static const Value kInvalidValue; + + explicit TermIdHitPair(Value v = kInvalidValue) : value_(v) {} + + TermIdHitPair(uint32_t term_id, const Hit& hit) { + static_assert( + kTermIdBits + kHitValueBits + kHitScoreBits <= sizeof(Value) * 8, + "TermIdHitPairTooBig"); + + value_ = 0; + // Term id goes into the most significant bits because it takes + // precedence in sorts. + bit_util::BitfieldSet(term_id, kHitValueBits + kHitScoreBits, kTermIdBits, + &value_); + bit_util::BitfieldSet(hit.value(), kHitScoreBits, kHitValueBits, &value_); + bit_util::BitfieldSet(hit.score(), 0, kHitScoreBits, &value_); + } + + uint32_t term_id() const { + return bit_util::BitfieldGet(value_, kHitValueBits + kHitScoreBits, + kTermIdBits); + } + + Hit hit() const { + return Hit(bit_util::BitfieldGet(value_, kHitScoreBits, kHitValueBits), + bit_util::BitfieldGet(value_, 0, kHitScoreBits)); + } + + Value value() const { return value_; } + + bool operator==(const TermIdHitPair& rhs) const { + return value_ == rhs.value_; + } + + private: + Value value_; +}; + +} // namespace lib +} // namespace icing + +#endif // ICING_INDEX_TERM_ID_HIT_PAIR_H_ diff --git a/icing/index/main/doc-hit-info-iterator-term-main.cc b/icing/index/main/doc-hit-info-iterator-term-main.cc new file mode 100644 index 0000000..0640135 --- /dev/null +++ b/icing/index/main/doc-hit-info-iterator-term-main.cc @@ -0,0 +1,166 @@ +// Copyright (C) 2019 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "icing/index/main/doc-hit-info-iterator-term-main.h" + +#include <cstdint> +#include <memory> + +#include "icing/text_classifier/lib3/utils/base/status.h" +#include "icing/absl_ports/canonical_errors.h" +#include "icing/absl_ports/str_cat.h" +#include "icing/index/hit/doc-hit-info.h" +#include "icing/index/main/posting-list-accessor.h" +#include "icing/index/main/posting-list-identifier.h" +#include "icing/legacy/core/icing-string-util.h" +#include "icing/schema/section.h" +#include "icing/store/document-id.h" +#include "icing/util/status-macros.h" + +namespace icing { +namespace lib { + +namespace { + +std::string SectionIdMaskToString(SectionIdMask section_id_mask) { + std::string mask(kMaxSectionId + 1, '0'); + for (SectionId i = kMaxSectionId; i >= 0; --i) { + if (section_id_mask & (1U << i)) { + mask[kMaxSectionId - i] = '1'; + } + } + return mask; +} + +} // namespace + +libtextclassifier3::Status DocHitInfoIteratorTermMain::Advance() { + if (posting_list_accessor_ == nullptr || + cached_doc_hit_infos_idx_ == (cached_doc_hit_infos_.size() - 2)) { + // If we haven't retrieved any hits before or we've already returned all but + // the last cached hit, then go get some more! + // We hold back the last cached hit because it could have more hits on the + // next posting list in the chain. + ICING_RETURN_IF_ERROR(RetrieveMoreHits()); + } else { + ++cached_doc_hit_infos_idx_; + } + if (cached_doc_hit_infos_idx_ == -1 || + cached_doc_hit_infos_idx_ >= cached_doc_hit_infos_.size()) { + // Nothing more for the iterator to return. 
Set these members to invalid + // values. + doc_hit_info_ = DocHitInfo(); + hit_intersect_section_ids_mask_ = kSectionIdMaskNone; + return absl_ports::ResourceExhaustedError( + "No more DocHitInfos in iterator"); + } + doc_hit_info_ = cached_doc_hit_infos_.at(cached_doc_hit_infos_idx_); + hit_intersect_section_ids_mask_ = doc_hit_info_.hit_section_ids_mask(); + return libtextclassifier3::Status::OK; +} + +libtextclassifier3::Status DocHitInfoIteratorTermMainExact::RetrieveMoreHits() { + DocHitInfo last_doc_hit_info; + if (!cached_doc_hit_infos_.empty()) { + last_doc_hit_info = cached_doc_hit_infos_.back(); + } + cached_doc_hit_infos_idx_ = 0; + cached_doc_hit_infos_.clear(); + if (last_doc_hit_info.document_id() != kInvalidDocumentId) { + // Carry over the last hit. It might need to be merged with the first hit + // of the next posting list in the chain. + cached_doc_hit_infos_.push_back(last_doc_hit_info); + } + if (posting_list_accessor_ == nullptr) { + ICING_ASSIGN_OR_RETURN(posting_list_accessor_, + main_index_->GetAccessorForExactTerm(term_)); + } + + ICING_ASSIGN_OR_RETURN(std::vector<Hit> hits, + posting_list_accessor_->GetNextHitsBatch()); + ++num_blocks_inspected_; + cached_doc_hit_infos_.reserve(hits.size() + 1); + for (const Hit& hit : hits) { + // Check sections. + if (((1u << hit.section_id()) & section_restrict_mask_) == 0) { + continue; + } + // We want exact hits, skip prefix-only hits. 
+ if (hit.is_prefix_hit()) { + continue; + } + if (cached_doc_hit_infos_.empty() || + hit.document_id() != cached_doc_hit_infos_.back().document_id()) { + cached_doc_hit_infos_.push_back(DocHitInfo(hit.document_id())); + } + cached_doc_hit_infos_.back().UpdateSection(hit.section_id(), hit.score()); + } + return libtextclassifier3::Status::OK; +} + +std::string DocHitInfoIteratorTermMainExact::ToString() const { + return absl_ports::StrCat(SectionIdMaskToString(section_restrict_mask_), ":", + term_); +} + +libtextclassifier3::Status +DocHitInfoIteratorTermMainPrefix::RetrieveMoreHits() { + DocHitInfo last_doc_hit_info; + if (!cached_doc_hit_infos_.empty()) { + last_doc_hit_info = cached_doc_hit_infos_.back(); + } + cached_doc_hit_infos_idx_ = 0; + cached_doc_hit_infos_.clear(); + if (last_doc_hit_info.document_id() != kInvalidDocumentId) { + // Carry over the last hit. It might need to be merged with the first hit + // of the next posting list in the chain. + cached_doc_hit_infos_.push_back(last_doc_hit_info); + } + + ++num_blocks_inspected_; + if (posting_list_accessor_ == nullptr) { + ICING_ASSIGN_OR_RETURN( + MainIndex::GetPrefixAccessorResult result, + main_index_->GetAccessorForPrefixTerm(term_)); + posting_list_accessor_ = std::move(result.accessor); + exact_ = result.exact; + } + ICING_ASSIGN_OR_RETURN(std::vector<Hit> hits, + posting_list_accessor_->GetNextHitsBatch()); + cached_doc_hit_infos_.reserve(hits.size()); + for (const Hit& hit : hits) { + // Check sections. + if (((1u << hit.section_id()) & section_restrict_mask_) == 0) { + continue; + } + // If we only want hits from prefix sections. 
+ if (!exact_ && !hit.is_in_prefix_section()) { + continue; + } + if (cached_doc_hit_infos_.empty() || + hit.document_id() != cached_doc_hit_infos_.back().document_id()) { + cached_doc_hit_infos_.push_back(DocHitInfo(hit.document_id())); + } + cached_doc_hit_infos_.back().UpdateSection(hit.section_id(), hit.score()); + } + return libtextclassifier3::Status::OK; +} + +std::string DocHitInfoIteratorTermMainPrefix::ToString() const { + return absl_ports::StrCat(SectionIdMaskToString(section_restrict_mask_), ":", + term_, "*"); +} + +} // namespace lib +} // namespace icing diff --git a/icing/index/main/doc-hit-info-iterator-term-main.h b/icing/index/main/doc-hit-info-iterator-term-main.h new file mode 100644 index 0000000..1f77226 --- /dev/null +++ b/icing/index/main/doc-hit-info-iterator-term-main.h @@ -0,0 +1,114 @@ +// Copyright (C) 2019 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#ifndef ICING_INDEX_ITERATOR_DOC_HIT_INFO_ITERATOR_TERM_MAIN_H_ +#define ICING_INDEX_ITERATOR_DOC_HIT_INFO_ITERATOR_TERM_MAIN_H_ + +#include <cstdint> +#include <memory> +#include <vector> + +#include "icing/text_classifier/lib3/utils/base/status.h" +#include "icing/index/hit/doc-hit-info.h" +#include "icing/index/iterator/doc-hit-info-iterator.h" +#include "icing/index/main/main-index.h" +#include "icing/index/main/posting-list-accessor.h" +#include "icing/schema/section.h" + +namespace icing { +namespace lib { + +class DocHitInfoIteratorTermMain : public DocHitInfoIterator { + public: + explicit DocHitInfoIteratorTermMain(MainIndex* main_index, + const std::string& term, + SectionIdMask section_restrict_mask) + : term_(term), + main_index_(main_index), + cached_doc_hit_infos_idx_(-1), + num_advance_calls_(0), + num_blocks_inspected_(0), + next_posting_list_id_(PostingListIdentifier::kInvalid), + section_restrict_mask_(section_restrict_mask) {} + + libtextclassifier3::Status Advance() override; + + int32_t GetNumBlocksInspected() const override { + return num_blocks_inspected_; + } + int32_t GetNumLeafAdvanceCalls() const override { return num_advance_calls_; } + + protected: + // Add DocHitInfos corresponding to term_ to cached_doc_hit_infos_. + virtual libtextclassifier3::Status RetrieveMoreHits() = 0; + + const std::string term_; + // The accessor of the posting list chain for the requested term. + std::unique_ptr<PostingListAccessor> posting_list_accessor_; + + MainIndex* main_index_; + // Stores hits retrieved from the index. This may only be a subset of the hits + // that are present in the index. Current value pointed to by the Iterator is + // tracked by cached_doc_hit_infos_idx_. + std::vector<DocHitInfo> cached_doc_hit_infos_; + int cached_doc_hit_infos_idx_; + int num_advance_calls_; + int num_blocks_inspected_; + PostingListIdentifier next_posting_list_id_; + // Mask indicating which sections hits should be considered for. + // Ex. 
0000 0000 0000 0010 means that only hits from section 1 are desired. + const SectionIdMask section_restrict_mask_; +}; + +class DocHitInfoIteratorTermMainExact : public DocHitInfoIteratorTermMain { + public: + explicit DocHitInfoIteratorTermMainExact(MainIndex* main_index, + const std::string& term, + SectionIdMask section_restrict_mask) + : DocHitInfoIteratorTermMain(main_index, term, section_restrict_mask) {} + + std::string ToString() const override; + + protected: + libtextclassifier3::Status RetrieveMoreHits() override; +}; + +class DocHitInfoIteratorTermMainPrefix : public DocHitInfoIteratorTermMain { + public: + explicit DocHitInfoIteratorTermMainPrefix(MainIndex* main_index, + const std::string& term, + SectionIdMask section_restrict_mask) + : DocHitInfoIteratorTermMain(main_index, term, section_restrict_mask) {} + + std::string ToString() const override; + + protected: + libtextclassifier3::Status RetrieveMoreHits() override; + + private: + // After retrieving DocHitInfos from the index, a DocHitInfo for docid 1 and + // "foo" and a DocHitInfo for docid 1 and "fool" may both be present. These + // DocHitInfos should be merged. + void SortAndDedupeDocumentIds(); + // Whether or not posting_list_accessor_ holds a posting list chain for + // 'term' or for a term for which 'term' is a prefix. This is necessary to + // determine whether to return hits that are not from a prefix section (hits + // not from a prefix section should only be returned if exact_ is true). 
+ bool exact_; +}; + +} // namespace lib +} // namespace icing + +#endif // ICING_INDEX_ITERATOR_DOC_HIT_INFO_ITERATOR_TERM_MAIN_H_ diff --git a/icing/index/main/flash-index-storage-header.h b/icing/index/main/flash-index-storage-header.h new file mode 100644 index 0000000..f81e99e --- /dev/null +++ b/icing/index/main/flash-index-storage-header.h @@ -0,0 +1,122 @@ +// Copyright (C) 2019 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef ICING_INDEX_MAIN_FLASH_INDEX_STORAGE_HEADER_H_ +#define ICING_INDEX_MAIN_FLASH_INDEX_STORAGE_HEADER_H_ + +#include <cstdint> +#include <memory> + +#include "icing/text_classifier/lib3/utils/base/statusor.h" +#include "icing/absl_ports/canonical_errors.h" +#include "icing/file/filesystem.h" + +namespace icing { +namespace lib { + +// The class used to manage the flash block that contains the header for +// FlashIndexStorage. This contains information about the index blocks that +// store the posting lists. +class HeaderBlock { + public: + // The class used to access the actual header. + struct Header { + // A magic used to mark the beginning of a valid header. + static constexpr int kMagic = 0x6dfba6ae; + int magic; + int block_size; + int last_indexed_docid; + // The size of the index_block_infos array. + int num_index_block_infos; + + struct IndexBlockInfo { + // The size of the posting lists that fit on all the index blocks in this + // chain. 
Each block on this posting list will have posting lists of size + // posting_list_bytes. + int posting_list_bytes; + // The block index of the first block in the free list chain. + int free_list_block_index; + }; + // Variable-size array, num_index_block_infos long. Can have a max length + // of log(block_size). This array is used to maintain a free list for the + // available blocks. + IndexBlockInfo index_block_infos[0]; + }; + + // Read HeaderBlock from the specified fd. + // + // RETURNS: + // - HeaderBlock, on success + // - INTERNAL if unable to read block_size bytes from fd. + static libtextclassifier3::StatusOr<HeaderBlock> Read( + const Filesystem* filesystem, int fd, int block_size) { + std::unique_ptr<uint8_t[]> buffer = std::make_unique<uint8_t[]>(block_size); + if (!filesystem->PRead(fd, buffer.get(), block_size, 0)) { + return absl_ports::InternalError("Unable to reader header block!"); + } + return HeaderBlock(filesystem, std::move(buffer), block_size); + } + + // Make a new HeaderBlock with the specified size. + explicit HeaderBlock(const Filesystem* filesystem, int block_size) + : HeaderBlock(filesystem, std::make_unique<uint8_t[]>(block_size), + block_size) { + std::memset(header_buffer_.get(), 0, block_size); + } + + Header* header() const { + return reinterpret_cast<Header*>(header_buffer_.get()); + } + + // Add another entry to the index_block_infos array and return a pointer to + // that entry. Returns a nullptr if the index_block_infos array is already + // at a max size. + Header::IndexBlockInfo* AddIndexBlockInfo() { + if (size() + sizeof(Header::IndexBlockInfo) > block_size_) { + return nullptr; + } + ++header()->num_index_block_infos; + return header()->index_block_infos + (header()->num_index_block_infos - 1); + } + + // Returns the size of the header block currently in use. + int size() const { + return sizeof(Header) + + header()->num_index_block_infos * sizeof(Header::IndexBlockInfo); + } + + // Writes the header to fd. 
Returns true on success. + bool Write(int fd) { + return filesystem_->PWrite(fd, 0, header_buffer_.get(), block_size_); + } + + private: + explicit HeaderBlock(const Filesystem* filesystem, + std::unique_ptr<uint8_t[]> buffer, int block_size) + : filesystem_(filesystem), + header_buffer_(std::move(buffer)), + block_size_(block_size) {} + + const Filesystem* filesystem_; // does NOT own! + std::unique_ptr<uint8_t[]> header_buffer_; + int block_size_; +}; +static_assert(16 == sizeof(HeaderBlock::Header), + "Header has changed size. Consider how this change might affect " + "pre-existing indices."); + +} // namespace lib +} // namespace icing + +#endif // ICING_INDEX_MAIN_FLASH_INDEX_STORAGE_HEADER_H_ diff --git a/icing/index/main/flash-index-storage.cc b/icing/index/main/flash-index-storage.cc new file mode 100644 index 0000000..b88d7fe --- /dev/null +++ b/icing/index/main/flash-index-storage.cc @@ -0,0 +1,511 @@ +// Copyright (C) 2019 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "icing/index/main/flash-index-storage.h" + +#include <errno.h> +#include <inttypes.h> +#include <sys/types.h> + +#include <algorithm> +#include <cstdint> +#include <memory> +#include <unordered_set> + +#include "icing/absl_ports/canonical_errors.h" +#include "icing/file/memory-mapped-file.h" +#include "icing/index/main/index-block.h" +#include "icing/index/main/posting-list-free.h" +#include "icing/index/main/posting-list-utils.h" +#include "icing/legacy/core/icing-string-util.h" +#include "icing/util/logging.h" +#include "icing/util/math-util.h" +#include "icing/util/status-macros.h" + +namespace icing { +namespace lib { + +namespace { + +uint32_t SelectBlockSize() { + // This should be close to the flash page size. + static constexpr uint32_t kMinBlockSize = 4096; + + // Determine a good block size. + uint32_t page_size = getpagesize(); + uint32_t block_size = std::max(kMinBlockSize, page_size); + + // Align up to the nearest page size. + return math_util::RoundUpTo(block_size, page_size); +} + +} // namespace + +libtextclassifier3::StatusOr<FlashIndexStorage> FlashIndexStorage::Create( + const std::string& index_filename, const Filesystem* filesystem, + bool in_memory) { + ICING_RETURN_ERROR_IF_NULL(filesystem); + FlashIndexStorage storage(index_filename, filesystem, in_memory); + if (!storage.Init()) { + return absl_ports::InternalError( + "Unable to successfully read header block!"); + } + return storage; +} + +FlashIndexStorage::FlashIndexStorage(const std::string& index_filename, + const Filesystem* filesystem, + bool has_in_memory_freelists) + : index_filename_(index_filename), + num_blocks_(0), + filesystem_(filesystem), + has_in_memory_freelists_(has_in_memory_freelists) {} + +FlashIndexStorage::~FlashIndexStorage() { + if (header_block_ != nullptr) { + FlushInMemoryFreeList(); + PersistToDisk(); + } +} + +bool FlashIndexStorage::Init() { + block_fd_ = ScopedFd(filesystem_->OpenForWrite(index_filename_.c_str())); + if (!block_fd_.is_valid()) { 
+ return false; + } + + // Read in or create the header. + return InitHeader(); +} + +bool FlashIndexStorage::InitHeader() { + // Look for an existing file size. + int64_t file_size = filesystem_->GetFileSize(block_fd_.get()); + if (file_size == Filesystem::kBadFileSize) { + ICING_LOG(ERROR) << "Could not initialize main index. Bad file size."; + return false; + } + + if (file_size == 0) { + if (!CreateHeader()) { + ICING_LOG(ERROR) + << "Could not initialize main index. Unable to create header."; + return false; + } + } else { + if (!OpenHeader(file_size)) { + ICING_LOG(ERROR) + << "Could not initialize main index. Unable to open header."; + return false; + } + } + in_memory_freelists_.resize(header_block_->header()->num_index_block_infos); + + return true; +} + +bool FlashIndexStorage::CreateHeader() { + uint32_t block_size = SelectBlockSize(); + header_block_ = std::make_unique<HeaderBlock>(filesystem_, block_size); + // Initialize. + header_block_->header()->magic = HeaderBlock::Header::kMagic; + header_block_->header()->block_size = block_size; + header_block_->header()->last_indexed_docid = kInvalidDocumentId; + + // Work down from the largest posting list that fits in + // block_size. We don't care about locality of blocks because this + // is a flash index. + for (uint32_t posting_list_bytes = + IndexBlock::CalculateMaxPostingListBytes(block_size); + posting_list_bytes >= posting_list_utils::min_posting_list_size(); + posting_list_bytes /= 2) { + uint32_t aligned_posting_list_bytes = + (posting_list_bytes / sizeof(Hit) * sizeof(Hit)); + ICING_VLOG(1) << IcingStringUtil::StringPrintf( + "Block size %u: %u", header_block_->header()->num_index_block_infos, + aligned_posting_list_bytes); + + // Initialize free list to empty. + HeaderBlock::Header::IndexBlockInfo* block_info = + header_block_->AddIndexBlockInfo(); + if (block_info == nullptr) { + // This should never happen anyways. 
Min block size is 4k, so adding these + // IndexBlockInfos should never exceed the block size. + return false; + } + block_info->posting_list_bytes = aligned_posting_list_bytes; + block_info->free_list_block_index = kInvalidBlockIndex; + } + + // Write the header. + if (!header_block_->Write(block_fd_.get())) { + filesystem_->Truncate(block_fd_.get(), 0); + return false; + } + num_blocks_ = 1; + return true; +} + +bool FlashIndexStorage::OpenHeader(int64_t file_size) { + uint32_t block_size = SelectBlockSize(); + // Read and validate header. + ICING_ASSIGN_OR_RETURN( + HeaderBlock read_header, + HeaderBlock::Read(filesystem_, block_fd_.get(), block_size), false); + if (read_header.header()->magic != HeaderBlock::Header::kMagic) { + ICING_LOG(ERROR) << "Index header block wrong magic"; + return false; + } + if (file_size % read_header.header()->block_size != 0) { + ICING_LOG(ERROR) << IcingStringUtil::StringPrintf( + "Index size %" PRIu64 " not a multiple of block size %u", file_size, + read_header.header()->block_size); + return false; + } + + if (file_size < static_cast<int64_t>(read_header.header()->block_size)) { + ICING_LOG(ERROR) << IcingStringUtil::StringPrintf( + "Index size %" PRIu64 " shorter than block size %u", file_size, + read_header.header()->block_size); + return false; + } + + if (read_header.header()->block_size % getpagesize() != 0) { + ICING_LOG(ERROR) << IcingStringUtil::StringPrintf( + "Block size %u is not a multiple of page size %d", + read_header.header()->block_size, getpagesize()); + return false; + } + num_blocks_ = file_size / read_header.header()->block_size; + if (block_size != read_header.header()->block_size) { + // The block_size changed? That's weird. But the old block_size is still + // valid (it must be some multiple of the new block_size). So reinitialize + // with that old block size. 
Using the old block size means that we can + // still use the main index, but reads/writes won't be as efficient in terms + // of flash IO because the 'blocks' that we're reading are actually multiple + // pages long. + ICING_LOG(ERROR) << "Block size of existing header (" + << read_header.header()->block_size + << ") does not match the requested block size (" + << block_size << "). Defaulting to existing block size " + << read_header.header()->block_size; + ICING_ASSIGN_OR_RETURN(HeaderBlock read_header, + HeaderBlock::Read(filesystem_, block_fd_.get(), + read_header.header()->block_size), + false); + } + header_block_ = std::make_unique<HeaderBlock>(std::move(read_header)); + + // Check for memory alignment on posting_list_bytes. See b/29983315. + // The issue of potential corruption to the header could also be handled by + // checksumming the header block. + for (int i = 0; i < header_block_->header()->num_index_block_infos; ++i) { + int posting_list_bytes = + header_block_->header()->index_block_infos[i].posting_list_bytes; + if (posting_list_bytes % sizeof(Hit) != 0) { + ICING_LOG(ERROR) << IcingStringUtil::StringPrintf( + "Posting list size misaligned, index %u, size %u, hit %zu, " + "file_size %" PRIu64, + i, header_block_->header()->index_block_infos[i].posting_list_bytes, + sizeof(Hit), file_size); + return false; + } + } + return true; +} + +bool FlashIndexStorage::PersistToDisk() { + // First, write header. + if (!header_block_->Write(block_fd_.get())) { + ICING_LOG(ERROR) << IcingStringUtil::StringPrintf( + "Write index header failed: %s", strerror(errno)); + return false; + } + + // Then sync. 
+ return filesystem_->DataSync(block_fd_.get()); +} + +libtextclassifier3::StatusOr<PostingListHolder> +FlashIndexStorage::GetPostingList(PostingListIdentifier id) const { + ICING_ASSIGN_OR_RETURN(IndexBlock block, GetIndexBlock(id.block_index())); + ICING_ASSIGN_OR_RETURN( + PostingListUsed posting_list, + block.GetAllocatedPostingList(id.posting_list_index())); + PostingListHolder holder = {std::move(posting_list), std::move(block), id}; + return holder; +} + +libtextclassifier3::StatusOr<IndexBlock> FlashIndexStorage::GetIndexBlock( + int block_index) const { + if (block_index >= num_blocks_) { + return absl_ports::OutOfRangeError(IcingStringUtil::StringPrintf( + "Unable to create an index block at index %d when only %d blocks have " + "been allocated.", + block_index, num_blocks_)); + } + off_t offset = static_cast<off_t>(block_index) * block_size(); + return IndexBlock::CreateFromPreexistingIndexBlockRegion( + *filesystem_, index_filename_, offset, block_size()); +} + +libtextclassifier3::StatusOr<IndexBlock> FlashIndexStorage::CreateIndexBlock( + int block_index, uint32_t posting_list_size) const { + if (block_index >= num_blocks_) { + return absl_ports::OutOfRangeError(IcingStringUtil::StringPrintf( + "Unable to create an index block at index %d when only %d blocks have " + "been allocated.", + block_index, num_blocks_)); + } + off_t offset = static_cast<off_t>(block_index) * block_size(); + return IndexBlock::CreateFromUninitializedRegion( + *filesystem_, index_filename_, offset, block_size(), posting_list_size); +} + +int FlashIndexStorage::FindBestIndexBlockInfo( + uint32_t posting_list_bytes) const { + int i = header_block_->header()->num_index_block_infos - 1; + for (; i >= 0; i--) { + if (header_block_->header()->index_block_infos[i].posting_list_bytes >= + posting_list_bytes) { + return i; + } + } + return i; +} + +libtextclassifier3::StatusOr<PostingListHolder> +FlashIndexStorage::GetPostingListFromInMemoryFreeList(int block_info_index) { + // Get 
something from in memory free list. + ICING_ASSIGN_OR_RETURN(PostingListIdentifier posting_list_id, + in_memory_freelists_[block_info_index].TryPop()); + // Remember, posting lists stored on the in-memory free list were never + // actually freed. So it will still contain a valid PostingListUsed. First, we + // need to free this posting list. + ICING_ASSIGN_OR_RETURN(IndexBlock block, + GetIndexBlock(posting_list_id.block_index())); + block.FreePostingList(posting_list_id.posting_list_index()); + + // Now, we can allocate a posting list from the same index block. It may not + // be the same posting list that was just freed, but that's okay. + ICING_ASSIGN_OR_RETURN(PostingListIndex posting_list_index, + block.AllocatePostingList()); + posting_list_id = + PostingListIdentifier(posting_list_id.block_index(), posting_list_index, + posting_list_id.posting_list_index_bits()); + ICING_ASSIGN_OR_RETURN( + PostingListUsed posting_list, + block.GetAllocatedPostingList(posting_list_id.posting_list_index())); + PostingListHolder holder = {std::move(posting_list), std::move(block), + posting_list_id}; + return holder; +} + +libtextclassifier3::StatusOr<PostingListHolder> +FlashIndexStorage::GetPostingListFromOnDiskFreeList(int block_info_index) { + // Get something from the free list. 
+ uint32_t block_index = header_block_->header() + ->index_block_infos[block_info_index] + .free_list_block_index; + if (block_index == kInvalidBlockIndex) { + return absl_ports::NotFoundError("No available entry in free list."); + } + + // Get the index block + ICING_ASSIGN_OR_RETURN(IndexBlock block, GetIndexBlock(block_index)); + ICING_ASSIGN_OR_RETURN(PostingListIndex posting_list_index, + block.AllocatePostingList()); + PostingListIdentifier posting_list_id = PostingListIdentifier( + block_index, posting_list_index, block.posting_list_index_bits()); + ICING_ASSIGN_OR_RETURN( + PostingListUsed posting_list, + block.GetAllocatedPostingList(posting_list_id.posting_list_index())); + if (!block.has_free_posting_lists()) { + RemoveFromOnDiskFreeList(block_index, block_info_index, &block); + } + PostingListHolder holder = {std::move(posting_list), std::move(block), + posting_list_id}; + return holder; +} + +libtextclassifier3::StatusOr<PostingListHolder> +FlashIndexStorage::AllocateNewPostingList(int block_info_index) { + uint32_t block_index = GrowIndex(); + if (block_index == kInvalidBlockIndex) { + return absl_ports::ResourceExhaustedError( + "Unable to grow the index further!"); + } + ICING_ASSIGN_OR_RETURN( + IndexBlock block, + CreateIndexBlock(block_index, header_block_->header() + ->index_block_infos[block_info_index] + .posting_list_bytes)); + ICING_ASSIGN_OR_RETURN(PostingListIndex posting_list_index, + block.AllocatePostingList()); + PostingListIdentifier posting_list_id = PostingListIdentifier( + block_index, posting_list_index, block.posting_list_index_bits()); + ICING_ASSIGN_OR_RETURN( + PostingListUsed posting_list, + block.GetAllocatedPostingList(posting_list_id.posting_list_index())); + if (block.has_free_posting_lists()) { + AddToOnDiskFreeList(block_index, block_info_index, &block); + } + PostingListHolder holder = {std::move(posting_list), std::move(block), + posting_list_id}; + return holder; +} + +libtextclassifier3::StatusOr<PostingListHolder> 
+FlashIndexStorage::AllocatePostingList(uint32_t min_posting_list_bytes) { + int max_block_size = IndexBlock::CalculateMaxPostingListBytes(block_size()); + if (min_posting_list_bytes > max_block_size) { + return absl_ports::InvalidArgumentError(IcingStringUtil::StringPrintf( + "Requested posting list size %d exceeds max posting list size %d", + min_posting_list_bytes, max_block_size)); + } + int best_block_info_index = FindBestIndexBlockInfo(min_posting_list_bytes); + + auto holder_or = GetPostingListFromInMemoryFreeList(best_block_info_index); + if (holder_or.ok()) { + return std::move(holder_or).ValueOrDie(); + } + + // Nothing in memory. Look for something in the block file. + holder_or = GetPostingListFromOnDiskFreeList(best_block_info_index); + if (holder_or.ok()) { + return std::move(holder_or).ValueOrDie(); + } + + return AllocateNewPostingList(best_block_info_index); +} + +void FlashIndexStorage::AddToOnDiskFreeList(uint32_t block_index, + int block_info_index, + IndexBlock* index_block) { + index_block->set_next_block_index(header_block_->header() + ->index_block_infos[block_info_index] + .free_list_block_index); + header_block_->header() + ->index_block_infos[block_info_index] + .free_list_block_index = block_index; +} + +void FlashIndexStorage::RemoveFromOnDiskFreeList(uint32_t block_index, + int block_info_index, + IndexBlock* index_block) { + // Cannot be used anymore. Move free ptr to the next block. + header_block_->header() + ->index_block_infos[block_info_index] + .free_list_block_index = index_block->next_block_index(); + index_block->set_next_block_index(kInvalidBlockIndex); +} + +void FlashIndexStorage::FreePostingList(PostingListHolder holder) { + uint32_t posting_list_bytes = holder.block.get_posting_list_bytes(); + int best_block_info_index = FindBestIndexBlockInfo(posting_list_bytes); + + // It *should* be guaranteed elsewhere that FindBestIndexBlockInfo will not + // return a value in >= in_memory_freelists_, but check regardless. 
If it + // doesn't fit for some reason, then put it in the Header free list instead. + if (has_in_memory_freelists_ && + best_block_info_index < in_memory_freelists_.size()) { + in_memory_freelists_[best_block_info_index].Push(holder.id); + } else { + bool was_full = !holder.block.has_free_posting_lists(); + holder.block.FreePostingList(holder.id.posting_list_index()); + // If this block was not already full, then it is already in the free list. + if (was_full) { + AddToOnDiskFreeList(holder.id.block_index(), best_block_info_index, + &holder.block); + } + } +} + +int FlashIndexStorage::GrowIndex() { + if (num_blocks_ >= kMaxBlockIndex) { + ICING_VLOG(1) << IcingStringUtil::StringPrintf("Reached max block index %u", + kMaxBlockIndex); + return kInvalidBlockIndex; + } + + // Grow the index file. + if (!filesystem_->Grow( + block_fd_.get(), + static_cast<uint64_t>(num_blocks_ + 1) * block_size())) { + ICING_VLOG(1) << IcingStringUtil::StringPrintf( + "Error growing index file: %s", strerror(errno)); + return kInvalidBlockIndex; + } + + return num_blocks_++; +} + +void FlashIndexStorage::FlushInMemoryFreeList() { + for (int i = 0; i < in_memory_freelists_.size(); ++i) { + FreeList& freelist = in_memory_freelists_.at(i); + auto freelist_elt_or = freelist.TryPop(); + while (freelist_elt_or.ok()) { + PostingListIdentifier freelist_elt = freelist_elt_or.ValueOrDie(); + // Remember, posting lists stored on the in-memory free list were never + // actually freed. So it will still contain a valid PostingListUsed. + // First, we need to free this posting list. + auto block_or = GetIndexBlock(freelist_elt.block_index()); + if (!block_or.ok()) { + // Can't read the block. Nothing to do here. This posting list will have + // to leak. Just proceed to the next freelist element. 
+ freelist_elt_or = freelist.TryPop(); + continue; + } + IndexBlock block = std::move(block_or).ValueOrDie(); + bool was_full = !block.has_free_posting_lists(); + block.FreePostingList(freelist_elt.posting_list_index()); + // If this block was not already full, then it is already in the free + // list. + if (was_full) { + AddToOnDiskFreeList(freelist_elt.block_index(), /*block_info_index=*/i, + &block); + } + freelist_elt_or = freelist.TryPop(); + } + } +} + +// FreeList. +void FlashIndexStorage::FreeList::Push(PostingListIdentifier id) { + if (free_list_.size() >= kMaxSize) { + ICING_LOG(WARNING) + << "Freelist for posting lists of size (block_size / " + << (1u << id.posting_list_index_bits()) + << ") has reached max size. Dropping freed posting list [block_index:" + << id.block_index() + << ", posting_list_index:" << id.posting_list_index() << "]"; + return; + } + + free_list_.push_back(id); +} + +libtextclassifier3::StatusOr<PostingListIdentifier> +FlashIndexStorage::FreeList::TryPop() { + if (free_list_.empty()) { + return absl_ports::NotFoundError("No available entry in free list."); + } + + PostingListIdentifier id = free_list_.back(); + free_list_.pop_back(); + return id; +} + +} // namespace lib +} // namespace icing diff --git a/icing/index/main/flash-index-storage.h b/icing/index/main/flash-index-storage.h new file mode 100644 index 0000000..958f131 --- /dev/null +++ b/icing/index/main/flash-index-storage.h @@ -0,0 +1,275 @@ +// Copyright (C) 2019 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef ICING_INDEX_FLASH_INDEX_STORAGE_H_ +#define ICING_INDEX_FLASH_INDEX_STORAGE_H_ + +#include <cstdint> +#include <memory> +#include <string> + +#include "icing/text_classifier/lib3/utils/base/statusor.h" +#include "icing/absl_ports/canonical_errors.h" +#include "icing/file/filesystem.h" +#include "icing/index/main/flash-index-storage-header.h" +#include "icing/index/main/index-block.h" +#include "icing/index/main/posting-list-free.h" +#include "icing/index/main/posting-list-identifier.h" +#include "icing/index/main/posting-list-used.h" +#include "icing/legacy/core/icing-packed-pod.h" +#include "icing/store/document-id.h" + +namespace icing { +namespace lib { + +// The PostingListHolder struct exists to group together related PostingListUsed +// IndexBlock pairs and their ids. +struct PostingListHolder { + // PostingListUseds interpret data that they themselves do NOT own. The data + // being interpreted is stored on a flash block and its memory mapping is + // owned by the IndexBlock. As such, the lifecycle of the PostingListUsed must + // NOT exceed the lifecycle of the IndexBlock. + PostingListUsed posting_list; + IndexBlock block; + // The PostingListIdentifier, which identifies both the IndexBlock and the + // PostingListUsed, is also returned for convenience. + PostingListIdentifier id; +}; + +// The FlashIndexStorage class manages the actual file that makes up the index. +// It allocates IndexBlocks as needed and maintains freelists to prevent +// excessive block fragmentation. +// +// It maintains two types of free lists: +// 1. On-disk, Header free list - This free list is stored in the Header +// block. There is a free list for every possible posting list size. Each +// entry for a posting list size contains the block_index of the +// IndexBlock that starts the free list chain. 
Each IndexBlock in the free +// list chain stores the index of the next IndexBlock in the chain. +// 2. In-memory free list - Like the Header free list, there is a free list of +// every possible posting list size. This free list contains not just the +// block_index of the available IndexBlock, but also the posting_list_index +// of the available PostingListUsed within the IndexBlock. This is because, +// unlike the Header free list, PostingListUseds are not actually freed +// when added to this free list. +// +// Whether or not the in-memory free list is used can be chosen via the +// in_memory param to the Create factory function. +// +// The advantage of using the in-memory free list is that it reduces the amount +// of flash writes made while editing the index (because actually freeing the +// PostingLists would require writing to that flash block). The disadvantage is +// that it introduces code complexity and potentially leaks blocks if power is +// lost or if FlashIndexStorage is destroyed before emptying the free list. +class FlashIndexStorage { + public: + // Creates a FlashIndexStorage at index_filename. in_memory determines whether + // or not the FlashIndexStorage maintains an in-memory freelist in order to + // avoid writes to the on-disk freelist. + // + // RETURNS: + // - On success, a valid instance of FlashIndexStorage + // - INTERNAL error if unable to create a new header or read the existing + // one from disk. + static libtextclassifier3::StatusOr<FlashIndexStorage> Create( + const std::string& index_filename, const Filesystem* filesystem, + bool in_memory = true); + + // Retrieve the PostingList referred to by PostingListIdentifier. This posting + // list must have been previously allocated by a prior call to + // AllocatePostingList. + // + // RETURNS: + // - On success, a valid instance of PostingListHolder containing the + // requested PostingListUsed. 
+ // - INVALID_ARGUMENT if id.posting_list_index() is out of bounds in the + // IndexBlock referred to by id.block_index() + // - INTERNAL_ERROR if unable to access the region in file. + libtextclassifier3::StatusOr<PostingListHolder> GetPostingList( + PostingListIdentifier id) const; + + // Allocates and returns a PostingListHolder containing a PostingListUsed that + // can fit min_posting_list_bytes. + // + // RETURNS: + // - On success, a valid instance of PostingListHolder containing the + // requested PostingListUsed. + // - RESOURCE_EXHAUSTED error if unable to grow the index to create a + // PostingListUsed of the requested size. + libtextclassifier3::StatusOr<PostingListHolder> AllocatePostingList( + uint32_t min_posting_list_bytes); + + ~FlashIndexStorage(); + FlashIndexStorage(FlashIndexStorage&&) = default; + FlashIndexStorage(const FlashIndexStorage&) = delete; + FlashIndexStorage& operator=(FlashIndexStorage&&) = default; + FlashIndexStorage& operator=(const FlashIndexStorage&) = delete; + + // Free the PostingListUsed that this holder holds. + void FreePostingList(PostingListHolder holder); + + // Used to track the largest docid indexed in the index. + DocumentId get_last_indexed_docid() const { + return header_block_->header()->last_indexed_docid; + } + void set_last_indexed_docid(DocumentId docid) { + header_block_->header()->last_indexed_docid = docid; + } + + // Updates the header and persists all changes to the index to disk. Returns + // true on success. + bool PersistToDisk(); + + // Returns the size of the index file in bytes. + int64_t GetDiskUsage() const { + return filesystem_->GetDiskUsage(block_fd_.get()); + } + + int num_blocks() const { return num_blocks_; } + + // Info about the index based on the block size. + int block_size() const { return header_block_->header()->block_size; } + + // Num blocks starts at 1 since the first block is the header. 
+ bool empty() const { return num_blocks_ <= 1; } + + // The percentage of the maximum index size that is free. Allocated blocks are + // treated as fully used, even if they are only partially used. In this way, + // min_free_fraction is a lower bound of available space. + double min_free_fraction() const { + return 1.0 - static_cast<double>(num_blocks_) / kMaxBlockIndex; + } + + private: + FlashIndexStorage(const std::string& index_filename, + const Filesystem* filesystem, bool has_in_memory_freelists); + + // Init the index from persistence. Create if file does not exist. We do not + // erase corrupt files. + // + // Returns false if unable to create a new header or if the existing one is + // corrupt. + bool Init(); + + // Create or open the header block. Returns true on success. + bool InitHeader(); + + // Create a new header block for an empty index file. + bool CreateHeader(); + + // Loads the header stored at the beginning of the index file and validates + // the values stored in it. + bool OpenHeader(int64_t file_size); + + // Add the IndexBlock referred to by block_index in the on-disk free list with + // index block_info_index. + void AddToOnDiskFreeList(uint32_t block_index, int block_info_index, + IndexBlock* index_block); + + // Remove the IndexBlock referred to by block_index from the Header free list + // with index block_info_index. + void RemoveFromOnDiskFreeList(uint32_t block_index, int block_info_index, + IndexBlock* index_block); + + // Returns: + // - On success, a valid PostingListHolder created from the first entry of + // the in-memory freelist at block_info_index + // - NOT_FOUND if there was no entry in the freelist + // - RESOURCE_EXHAUSTED if the PostingList in the freelist couldn't be + // allocated for some reason. 
+ libtextclassifier3::StatusOr<PostingListHolder> + GetPostingListFromInMemoryFreeList(int block_info_index); + + // Returns: + // - On success, a valid PostingListHolder created from the first entry of + // the on-disk freelist at block_info_index + // - NOT_FOUND if there was no entry in the freelist + // - RESOURCE_EXHAUSTED if the PostingList in the freelist couldn't be + // allocated for some reason. + libtextclassifier3::StatusOr<PostingListHolder> + GetPostingListFromOnDiskFreeList(int block_info_index); + + // Returns: + // - On success, a valid PostingListHolder created from a newly allocated + // IndexBlock. + // - RESOURCE_EXHAUSTED if the index couldn't be grown to fit a new + // IndexBlock. + libtextclassifier3::StatusOr<PostingListHolder> AllocateNewPostingList( + int block_info_index); + + // Returns: + // - On success, a newly created IndexBlock at block_index with posting + // lists of size posting_list_size + // - INTERNAL_ERROR if unable to access the region in file representing the + // IndexBlock + libtextclassifier3::StatusOr<IndexBlock> CreateIndexBlock( + int block_index, uint32_t posting_list_size) const; + + // Returns: + // - On success, the IndexBlock that exists at block_index + // - INTERNAL_ERROR if unable to access the region in file representing the + // IndexBlock + libtextclassifier3::StatusOr<IndexBlock> GetIndexBlock(int block_index) const; + + // Add a new block to the end of the file and return its block + // index. Returns kInvalidBlockIndex if unable to grow the index file. + int GrowIndex(); + + // Return the index into index_block_infos of the smallest posting_list free + // list that can fit posting_list_bytes or -1 if posting_list_bytes exceeds + // the max-sized posting list. + int FindBestIndexBlockInfo(uint32_t posting_list_bytes) const; + + // Flushes the in-memory free list to disk. + void FlushInMemoryFreeList(); + + // Underlying filename. + std::string index_filename_; + + // We open the index file into this fd. 
+ ScopedFd block_fd_; + int num_blocks_; // can be inferred from index file size + + std::unique_ptr<HeaderBlock> header_block_; + + // In-memory cache of free posting lists. + struct FreeList { + // Experimentally determined that high watermark for largest + // freelist was ~3500. + static constexpr size_t kMaxSize = 4096; + + // Push a new PostingListIdentifier if there is space. + void Push(PostingListIdentifier id); + + // Attempt to pop a PostingListIdentifier. + // + // RETURNS: + // - identifier of a free posting list, on success + // - NOT_FOUND if there are no free posting lists on this free list. + libtextclassifier3::StatusOr<PostingListIdentifier> TryPop(); + + private: + std::vector<PostingListIdentifier> free_list_; + }; + std::vector<FreeList> in_memory_freelists_; + + const Filesystem* filesystem_; // not owned; can't be null + + bool has_in_memory_freelists_; +}; + +} // namespace lib +} // namespace icing + +#endif // ICING_INDEX_FLASH_INDEX_STORAGE_H_ diff --git a/icing/index/main/flash-index-storage_test.cc b/icing/index/main/flash-index-storage_test.cc new file mode 100644 index 0000000..cf899b3 --- /dev/null +++ b/icing/index/main/flash-index-storage_test.cc @@ -0,0 +1,540 @@ +// Copyright (C) 2019 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "icing/index/main/flash-index-storage.h" + +#include <stdlib.h> +#include <unistd.h> + +#include <algorithm> +#include <limits> +#include <utility> +#include <vector> + +#include "icing/text_classifier/lib3/utils/base/status.h" +#include "gmock/gmock.h" +#include "gtest/gtest.h" +#include "icing/file/filesystem.h" +#include "icing/index/hit/hit.h" +#include "icing/store/document-id.h" +#include "icing/testing/common-matchers.h" +#include "icing/testing/tmp-directory.h" + +namespace icing { +namespace lib { + +namespace { + +using ::testing::ElementsAreArray; +using ::testing::Eq; +using ::testing::IsEmpty; +using ::testing::IsFalse; +using ::testing::IsTrue; +using ::testing::Not; + +class FlashIndexStorageTest : public testing::Test { + protected: + void SetUp() override { + test_dir_ = GetTestTempDir() + "/test_dir"; + file_name_ = test_dir_ + "/test_file.idx.index"; + ASSERT_TRUE(filesystem_.CreateDirectoryRecursively(test_dir_.c_str())); + } + + void TearDown() override { + ASSERT_TRUE(filesystem_.DeleteDirectoryRecursively(test_dir_.c_str())); + } + + protected: + std::string test_dir_; + std::string file_name_; + Filesystem filesystem_; +}; + +TEST_F(FlashIndexStorageTest, CorruptHeader) { + { + // Create the header file + ICING_ASSERT_OK_AND_ASSIGN( + FlashIndexStorage flash_index_storage, + FlashIndexStorage::Create(file_name_, &filesystem_)); + } + { + // Read the valid header - should pass + ICING_ASSERT_OK_AND_ASSIGN( + FlashIndexStorage flash_index_storage, + FlashIndexStorage::Create(file_name_, &filesystem_)); + } + { + // Corrupt the header file by changing pl_bytes + ScopedFd sfd(filesystem_.OpenForWrite(file_name_.c_str())); + off_t offset = 16; + uint32_t pl_bytes = sizeof(Hit) - 1; // This is intentionally invalid + filesystem_.PWrite(sfd.get(), offset, &pl_bytes, sizeof(uint32_t)); + } + { + // Read the header file - should fail because pl_bytes is not divisible + // by sizeof(Hit), which is 5 as of writing + 
ASSERT_THAT(FlashIndexStorage::Create(file_name_, &filesystem_), + StatusIs(libtextclassifier3::StatusCode::INTERNAL)); + } + { + // Correct the pl_bytes header alignment + ScopedFd sfd(filesystem_.OpenForWrite(file_name_.c_str())); + off_t offset = 16; + uint32_t pl_bytes = 2 * sizeof(Hit); // Should be valid + filesystem_.PWrite(sfd.get(), offset, &pl_bytes, sizeof(uint32_t)); + } + { + // Read the valid header - should pass + ICING_ASSERT_OK_AND_ASSIGN( + FlashIndexStorage flash_index_storage, + FlashIndexStorage::Create(file_name_, &filesystem_)); + } + + // Delete the file + filesystem_.DeleteFile(file_name_.c_str()); +} + +TEST_F(FlashIndexStorageTest, EmptyStorage) { + { + // Create the header file + ICING_ASSERT_OK_AND_ASSIGN( + FlashIndexStorage flash_index_storage, + FlashIndexStorage::Create(file_name_, &filesystem_)); + // An 'empty' FlashIndexStorage should have: + // 1. One block allocated for the header + EXPECT_THAT(flash_index_storage.num_blocks(), Eq(1)); + EXPECT_THAT(flash_index_storage.empty(), IsTrue()); + // 2. The invalid DocumentId stored in its header + EXPECT_THAT(flash_index_storage.get_last_indexed_docid(), + Eq(kInvalidDocumentId)); + // 3. It's disk usage should be the equivalent of one block. + EXPECT_THAT(flash_index_storage.GetDiskUsage(), + Eq(flash_index_storage.block_size())); + } + { + // Read the valid header. All functions should return the same values. 
+ ICING_ASSERT_OK_AND_ASSIGN( + FlashIndexStorage flash_index_storage, + FlashIndexStorage::Create(file_name_, &filesystem_)); + EXPECT_THAT(flash_index_storage.num_blocks(), Eq(1)); + EXPECT_THAT(flash_index_storage.empty(), IsTrue()); + EXPECT_THAT(flash_index_storage.get_last_indexed_docid(), + Eq(kInvalidDocumentId)); + EXPECT_THAT(flash_index_storage.GetDiskUsage(), + Eq(flash_index_storage.block_size())); + } +} + +TEST_F(FlashIndexStorageTest, FreeListInMemory) { + // Create the header file + ICING_ASSERT_OK_AND_ASSIGN( + FlashIndexStorage flash_index_storage, + FlashIndexStorage::Create(file_name_, &filesystem_)); + { + // 1. Request a PL that is 1/2 block size. Remember that block size also + // includes the BlockHeader. The BlockHeader isn't publicly visible, so we + // subtract 100 bytes to be sure. AllocatePostingList will round up from + // kHalfBlockPostingListSize to whatever the correct size is. + const int kHalfBlockPostingListSize = + (flash_index_storage.block_size() - 100) / 2; + ICING_ASSERT_OK_AND_ASSIGN( + PostingListHolder posting_list_holder1, + flash_index_storage.AllocatePostingList(kHalfBlockPostingListSize)); + // We expect: + // 1. FlashIndexStorage will return a valid id. + PostingListIdentifier id1 = posting_list_holder1.id; + EXPECT_THAT(id1.is_valid(), IsTrue()); + // 2. The index file should have grown by exactly one flash block. 
+ EXPECT_THAT(flash_index_storage.num_blocks(), Eq(2)); + EXPECT_THAT(flash_index_storage.empty(), IsFalse()); + + std::vector<Hit> hits1 = { + Hit(/*section_id=*/1, /*document_id=*/0, /*score=*/12), + Hit(/*section_id=*/6, /*document_id=*/2, /*score=*/19), + Hit(/*section_id=*/5, /*document_id=*/2, /*score=*/100), + Hit(/*section_id=*/8, /*document_id=*/5, /*score=*/197)}; + for (const Hit& hit : hits1) { + ICING_ASSERT_OK(posting_list_holder1.posting_list.PrependHit(hit)); + } + EXPECT_THAT(posting_list_holder1.posting_list.GetHits(), + IsOkAndHolds(ElementsAreArray(hits1.rbegin(), hits1.rend()))); + + // 2. Get another PL. This should be on the same flash block. There should + // be no allocation. + ICING_ASSERT_OK_AND_ASSIGN( + PostingListHolder posting_list_holder2, + flash_index_storage.AllocatePostingList(kHalfBlockPostingListSize)); + // We expect: + // 1. FlashIndexStorage will return a valid id. + EXPECT_THAT(posting_list_holder2.id.is_valid(), IsTrue()); + // 2. The index file should not have grown. + EXPECT_THAT(flash_index_storage.num_blocks(), Eq(2)); + EXPECT_THAT(flash_index_storage.empty(), IsFalse()); + + std::vector<Hit> hits2 = { + Hit(/*section_id=*/4, /*document_id=*/0, /*score=*/12), + Hit(/*section_id=*/8, /*document_id=*/4, /*score=*/19), + Hit(/*section_id=*/9, /*document_id=*/7, /*score=*/100), + Hit(/*section_id=*/6, /*document_id=*/7, /*score=*/197)}; + for (const Hit& hit : hits2) { + ICING_ASSERT_OK(posting_list_holder2.posting_list.PrependHit(hit)); + } + EXPECT_THAT(posting_list_holder2.posting_list.GetHits(), + IsOkAndHolds(ElementsAreArray(hits2.rbegin(), hits2.rend()))); + + // 3. Now, free the first posting list. This should add it to the free list + flash_index_storage.FreePostingList(std::move(posting_list_holder1)); + + // 4. Request another posting list. This should NOT grow the index because + // the first posting list is free. 
+ ICING_ASSERT_OK_AND_ASSIGN( + PostingListHolder posting_list_holder3, + flash_index_storage.AllocatePostingList(kHalfBlockPostingListSize)); + // We expect: + // 1. FlashIndexStorage will return a valid id. + EXPECT_THAT(posting_list_holder3.id.is_valid(), IsTrue()); + // 2. The index file should not have grown. + EXPECT_THAT(flash_index_storage.num_blocks(), Eq(2)); + EXPECT_THAT(flash_index_storage.empty(), IsFalse()); + // 3. The returned posting list holder should have the same id as the + // first posting list holder. + EXPECT_THAT(posting_list_holder3.id.posting_list_index(), + Eq(id1.posting_list_index())); + EXPECT_THAT(posting_list_holder3.id.block_index(), Eq(id1.block_index())); + // Make sure this pl is empty. The hits that used to be there should be + // gone. + EXPECT_THAT(posting_list_holder3.posting_list.GetHits(), + IsOkAndHolds(IsEmpty())); + std::vector<Hit> hits3 = { + Hit(/*section_id=*/7, /*document_id=*/1, /*score=*/62), + Hit(/*section_id=*/12, /*document_id=*/3, /*score=*/45), + Hit(/*section_id=*/11, /*document_id=*/18, /*score=*/12), + Hit(/*section_id=*/7, /*document_id=*/100, /*score=*/74)}; + for (const Hit& hit : hits3) { + ICING_ASSERT_OK(posting_list_holder3.posting_list.PrependHit(hit)); + } + EXPECT_THAT(posting_list_holder3.posting_list.GetHits(), + IsOkAndHolds(ElementsAreArray(hits3.rbegin(), hits3.rend()))); + } + EXPECT_THAT(flash_index_storage.GetDiskUsage(), + Eq(2 * flash_index_storage.block_size())); +} + +TEST_F(FlashIndexStorageTest, FreeListNotInMemory) { + // Create the header file + ICING_ASSERT_OK_AND_ASSIGN( + FlashIndexStorage flash_index_storage, + FlashIndexStorage::Create(file_name_, &filesystem_, /*in_memory=*/false)); + + { + // 1. Request a PL that is 1/2 block size. Remember that block size also + // includes the BlockHeader. The BlockHeader isn't publicly visible, so we + // subtract 100 bytes to be sure. 
AllocatePostingList will round up from + // kHalfBlockPostingListSize to whatever the correct size is. + const int kHalfBlockPostingListSize = + (flash_index_storage.block_size() - 100) / 2; + ICING_ASSERT_OK_AND_ASSIGN( + PostingListHolder posting_list_holder1, + flash_index_storage.AllocatePostingList(kHalfBlockPostingListSize)); + // We expect: + // 1. FlashIndexStorage will return a valid id. + PostingListIdentifier id1 = posting_list_holder1.id; + EXPECT_THAT(id1.is_valid(), IsTrue()); + // 2. The index file should have grown by exactly one flash block. + EXPECT_THAT(flash_index_storage.num_blocks(), Eq(2)); + EXPECT_THAT(flash_index_storage.empty(), IsFalse()); + + std::vector<Hit> hits1 = { + Hit(/*section_id=*/1, /*document_id=*/0, /*score=*/12), + Hit(/*section_id=*/6, /*document_id=*/2, /*score=*/19), + Hit(/*section_id=*/5, /*document_id=*/2, /*score=*/100), + Hit(/*section_id=*/8, /*document_id=*/5, /*score=*/197)}; + for (const Hit& hit : hits1) { + ICING_ASSERT_OK(posting_list_holder1.posting_list.PrependHit(hit)); + } + EXPECT_THAT(posting_list_holder1.posting_list.GetHits(), + IsOkAndHolds(ElementsAreArray(hits1.rbegin(), hits1.rend()))); + + // 2. Get another PL. This should be on the same flash block. There should + // be no allocation. + ICING_ASSERT_OK_AND_ASSIGN( + PostingListHolder posting_list_holder2, + flash_index_storage.AllocatePostingList(kHalfBlockPostingListSize)); + // We expect: + // 1. FlashIndexStorage will return a valid id. + EXPECT_THAT(posting_list_holder2.id.is_valid(), IsTrue()); + // 2. The index file should not have grown. 
+ EXPECT_THAT(flash_index_storage.num_blocks(), Eq(2)); + EXPECT_THAT(flash_index_storage.empty(), IsFalse()); + + std::vector<Hit> hits2 = { + Hit(/*section_id=*/4, /*document_id=*/0, /*score=*/12), + Hit(/*section_id=*/8, /*document_id=*/4, /*score=*/19), + Hit(/*section_id=*/9, /*document_id=*/7, /*score=*/100), + Hit(/*section_id=*/6, /*document_id=*/7, /*score=*/197)}; + for (const Hit& hit : hits2) { + ICING_ASSERT_OK(posting_list_holder2.posting_list.PrependHit(hit)); + } + EXPECT_THAT(posting_list_holder2.posting_list.GetHits(), + IsOkAndHolds(ElementsAreArray(hits2.rbegin(), hits2.rend()))); + + // 3. Now, free the first posting list. This should add it to the free list + flash_index_storage.FreePostingList(std::move(posting_list_holder1)); + + // 4. Request another posting list. This should NOT grow the index because + // the first posting list is free. + ICING_ASSERT_OK_AND_ASSIGN( + PostingListHolder posting_list_holder3, + flash_index_storage.AllocatePostingList(kHalfBlockPostingListSize)); + // We expect: + // 1. FlashIndexStorage will return a valid id. + EXPECT_THAT(posting_list_holder3.id.is_valid(), IsTrue()); + // 2. The index file should not have grown. + EXPECT_THAT(flash_index_storage.num_blocks(), Eq(2)); + EXPECT_THAT(flash_index_storage.empty(), IsFalse()); + // 3. The returned posting list holder should have the same id as the + // first posting list holder. + EXPECT_THAT(posting_list_holder3.id.posting_list_index(), + Eq(id1.posting_list_index())); + EXPECT_THAT(posting_list_holder3.id.block_index(), Eq(id1.block_index())); + // Make sure this pl is empty. The hits that used to be there should be + // gone. 
+ EXPECT_THAT(posting_list_holder3.posting_list.GetHits(), + IsOkAndHolds(IsEmpty())); + std::vector<Hit> hits3 = { + Hit(/*section_id=*/7, /*document_id=*/1, /*score=*/62), + Hit(/*section_id=*/12, /*document_id=*/3, /*score=*/45), + Hit(/*section_id=*/11, /*document_id=*/18, /*score=*/12), + Hit(/*section_id=*/7, /*document_id=*/100, /*score=*/74)}; + for (const Hit& hit : hits3) { + ICING_ASSERT_OK(posting_list_holder3.posting_list.PrependHit(hit)); + } + EXPECT_THAT(posting_list_holder3.posting_list.GetHits(), + IsOkAndHolds(ElementsAreArray(hits3.rbegin(), hits3.rend()))); + } + EXPECT_THAT(flash_index_storage.GetDiskUsage(), + Eq(2 * flash_index_storage.block_size())); +} + +TEST_F(FlashIndexStorageTest, FreeListInMemoryPersistence) { + PostingListIdentifier id1 = PostingListIdentifier::kInvalid; + int half_block_posting_list_size = 0; + { + // Create the header file + ICING_ASSERT_OK_AND_ASSIGN( + FlashIndexStorage flash_index_storage, + FlashIndexStorage::Create(file_name_, &filesystem_)); + + { + // 1. Request a PL that is 1/2 block size. Remember that block size also + // includes the BlockHeader. The BlockHeader isn't publicly visible, so we + // subtract 100 bytes to be sure. AllocatePostingList will round up from + // kHalfBlockPostingListSize to whatever the correct size is. + half_block_posting_list_size = (flash_index_storage.block_size() - 100) / 2; + ICING_ASSERT_OK_AND_ASSIGN( + PostingListHolder posting_list_holder1, + flash_index_storage.AllocatePostingList(half_block_posting_list_size)); + // We expect: + // 1. FlashIndexStorage will return a valid id. + id1 = posting_list_holder1.id; + EXPECT_THAT(id1.is_valid(), IsTrue()); + // 2. The index file should have grown by exactly one flash block. 
+ EXPECT_THAT(flash_index_storage.num_blocks(), Eq(2)); + EXPECT_THAT(flash_index_storage.empty(), IsFalse()); + + std::vector<Hit> hits1 = { + Hit(/*section_id=*/1, /*document_id=*/0, /*score=*/12), + Hit(/*section_id=*/6, /*document_id=*/2, /*score=*/19), + Hit(/*section_id=*/5, /*document_id=*/2, /*score=*/100), + Hit(/*section_id=*/8, /*document_id=*/5, /*score=*/197)}; + for (const Hit& hit : hits1) { + ICING_ASSERT_OK(posting_list_holder1.posting_list.PrependHit(hit)); + } + EXPECT_THAT(posting_list_holder1.posting_list.GetHits(), + IsOkAndHolds(ElementsAreArray(hits1.rbegin(), hits1.rend()))); + + // 2. Get another PL. This should be on the same flash block. There should + // be no allocation. + ICING_ASSERT_OK_AND_ASSIGN( + PostingListHolder posting_list_holder2, + flash_index_storage.AllocatePostingList(half_block_posting_list_size)); + // We expect: + // 1. FlashIndexStorage will return a valid id. + EXPECT_THAT(posting_list_holder2.id.is_valid(), IsTrue()); + // 2. The index file should not have grown. + EXPECT_THAT(flash_index_storage.num_blocks(), Eq(2)); + EXPECT_THAT(flash_index_storage.empty(), IsFalse()); + + std::vector<Hit> hits2 = { + Hit(/*section_id=*/4, /*document_id=*/0, /*score=*/12), + Hit(/*section_id=*/8, /*document_id=*/4, /*score=*/19), + Hit(/*section_id=*/9, /*document_id=*/7, /*score=*/100), + Hit(/*section_id=*/6, /*document_id=*/7, /*score=*/197)}; + for (const Hit& hit : hits2) { + ICING_ASSERT_OK(posting_list_holder2.posting_list.PrependHit(hit)); + } + EXPECT_THAT(posting_list_holder2.posting_list.GetHits(), + IsOkAndHolds(ElementsAreArray(hits2.rbegin(), hits2.rend()))); + + // 3. Now, free the first posting list. This should add it to the free list + flash_index_storage.FreePostingList(std::move(posting_list_holder1)); + } + + EXPECT_THAT(flash_index_storage.GetDiskUsage(), + Eq(2 * flash_index_storage.block_size())); + // 4. 
The FlashIndexStorage should go out of scope and flush the in-memory + // posting list to disk + } + + { + // Recreate the flash index. + ICING_ASSERT_OK_AND_ASSIGN( + FlashIndexStorage flash_index_storage, + FlashIndexStorage::Create(file_name_, &filesystem_)); + + { + // 5. Request another posting list. This should NOT grow the index because + // the first posting list is free. + ICING_ASSERT_OK_AND_ASSIGN( + PostingListHolder posting_list_holder3, + flash_index_storage.AllocatePostingList(half_block_posting_list_size)); + // We expect: + // 1. FlashIndexStorage will return a valid id. + EXPECT_THAT(posting_list_holder3.id.is_valid(), IsTrue()); + // 2. The index file should not have grown. + EXPECT_THAT(flash_index_storage.num_blocks(), Eq(2)); + EXPECT_THAT(flash_index_storage.empty(), IsFalse()); + // 3. The returned posting list holder should have the same id as the + // first posting list holder. + EXPECT_THAT(posting_list_holder3.id.posting_list_index(), + Eq(id1.posting_list_index())); + EXPECT_THAT(posting_list_holder3.id.block_index(), Eq(id1.block_index())); + // Make sure this pl is empty. The hits that used to be there should be + // gone. 
+ EXPECT_THAT(posting_list_holder3.posting_list.GetHits(), + IsOkAndHolds(IsEmpty())); + std::vector<Hit> hits3 = { + Hit(/*section_id=*/7, /*document_id=*/1, /*score=*/62), + Hit(/*section_id=*/12, /*document_id=*/3, /*score=*/45), + Hit(/*section_id=*/11, /*document_id=*/18, /*score=*/12), + Hit(/*section_id=*/7, /*document_id=*/100, /*score=*/74)}; + for (const Hit& hit : hits3) { + ICING_ASSERT_OK(posting_list_holder3.posting_list.PrependHit(hit)); + } + EXPECT_THAT(posting_list_holder3.posting_list.GetHits(), + IsOkAndHolds(ElementsAreArray(hits3.rbegin(), hits3.rend()))); + } + EXPECT_THAT(flash_index_storage.GetDiskUsage(), + Eq(2 * flash_index_storage.block_size())); + } +} + +TEST_F(FlashIndexStorageTest, DifferentSizedPostingLists) { + // Create the header file + ICING_ASSERT_OK_AND_ASSIGN( + FlashIndexStorage flash_index_storage, + FlashIndexStorage::Create(file_name_, &filesystem_)); + { + // 1. Request a PL that is 1/2 block size. Remember that block size also + // includes the BlockHeader. The BlockHeader isn't publicly visible, so we + // subtract 100 bytes to be sure. AllocatePostingList will round up from + // kHalfBlockPostingListSize to whatever the correct size is. + const int kHalfBlockPostingListSize = + (flash_index_storage.block_size() - 100) / 2; + const int kQuarterBlockPostingListSize = + (flash_index_storage.block_size() - 100) / 4; + ICING_ASSERT_OK_AND_ASSIGN( + PostingListHolder posting_list_holder1, + flash_index_storage.AllocatePostingList(kHalfBlockPostingListSize)); + // We expect: + // 1. FlashIndexStorage will return a valid id. + PostingListIdentifier id1 = posting_list_holder1.id; + EXPECT_THAT(id1.is_valid(), IsTrue()); + // 2. The index file should have grown by exactly one flash block. 
+ EXPECT_THAT(flash_index_storage.num_blocks(), Eq(2)); + EXPECT_THAT(flash_index_storage.empty(), IsFalse()); + + std::vector<Hit> hits1 = { + Hit(/*section_id=*/1, /*document_id=*/0, /*score=*/12), + Hit(/*section_id=*/6, /*document_id=*/2, /*score=*/19), + Hit(/*section_id=*/5, /*document_id=*/2, /*score=*/100), + Hit(/*section_id=*/8, /*document_id=*/5, /*score=*/197)}; + for (const Hit& hit : hits1) { + ICING_ASSERT_OK(posting_list_holder1.posting_list.PrependHit(hit)); + } + EXPECT_THAT(posting_list_holder1.posting_list.GetHits(), + IsOkAndHolds(ElementsAreArray(hits1.rbegin(), hits1.rend()))); + + // 2. Get a PL that is 1/4 block size. Even though a 1/4 block PL could + // theoretically fit in the same block, we'll allocate a new one because PLs + // on a block are required to be the same size. + ICING_ASSERT_OK_AND_ASSIGN( + PostingListHolder posting_list_holder2, + flash_index_storage.AllocatePostingList(kQuarterBlockPostingListSize)); + // We expect: + // 1. FlashIndexStorage will return a valid id. + EXPECT_THAT(posting_list_holder2.id.is_valid(), IsTrue()); + // 2. The index file should have grown by one block. + EXPECT_THAT(posting_list_holder2.id.block_index(), + Not(Eq(id1.block_index()))); + EXPECT_THAT(flash_index_storage.num_blocks(), Eq(3)); + EXPECT_THAT(flash_index_storage.empty(), IsFalse()); + + std::vector<Hit> hits2 = { + Hit(/*section_id=*/4, /*document_id=*/0, /*score=*/12), + Hit(/*section_id=*/8, /*document_id=*/4, /*score=*/19), + Hit(/*section_id=*/9, /*document_id=*/7, /*score=*/100), + Hit(/*section_id=*/6, /*document_id=*/7, /*score=*/197)}; + for (const Hit& hit : hits2) { + ICING_ASSERT_OK(posting_list_holder2.posting_list.PrependHit(hit)); + } + EXPECT_THAT(posting_list_holder2.posting_list.GetHits(), + IsOkAndHolds(ElementsAreArray(hits2.rbegin(), hits2.rend()))); + + // 3. Request another 1/4 block-size posting list. This should NOT grow the + // index because there should be three free posting lists on block2. 
+ ICING_ASSERT_OK_AND_ASSIGN( + PostingListHolder posting_list_holder3, + flash_index_storage.AllocatePostingList(kQuarterBlockPostingListSize)); + // We expect: + // 1. FlashIndexStorage will return a valid id. + EXPECT_THAT(posting_list_holder3.id.is_valid(), IsTrue()); + // 2. The index file should have remained the same size as before and the + // third posting list holder should use the same block as the second + // posting list holder. + EXPECT_THAT(posting_list_holder3.id.block_index(), + Eq(posting_list_holder2.id.block_index())); + EXPECT_THAT(flash_index_storage.num_blocks(), Eq(3)); + EXPECT_THAT(flash_index_storage.empty(), IsFalse()); + } + EXPECT_THAT(flash_index_storage.GetDiskUsage(), + Eq(3 * flash_index_storage.block_size())); +} + +TEST_F(FlashIndexStorageTest, AllocateTooLargePostingList) { + // Create the header file + ICING_ASSERT_OK_AND_ASSIGN( + FlashIndexStorage flash_index_storage, + FlashIndexStorage::Create(file_name_, &filesystem_)); + + // Request a PL that is 2x block size. 
+ const int kDoubleBlockSize = flash_index_storage.block_size() * 2; + EXPECT_THAT(flash_index_storage.AllocatePostingList(kDoubleBlockSize), + StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT)); +} + +} // namespace + +} // namespace lib +} // namespace icing diff --git a/icing/index/main/index-block.cc b/icing/index/main/index-block.cc index 9d7df3c..652dbc6 100644 --- a/icing/index/main/index-block.cc +++ b/icing/index/main/index-block.cc @@ -105,11 +105,12 @@ IndexBlock::IndexBlock(MemoryMappedFile mmapped_block) posting_lists_start_ptr_(mmapped_block.mutable_region() + sizeof(BlockHeader)), block_size_in_bytes_(mmapped_block.region_size()), - mmapped_block_(std::move(mmapped_block)) {} + mmapped_block_( + std::make_unique<MemoryMappedFile>(std::move(mmapped_block))) {} libtextclassifier3::Status IndexBlock::Reset(int posting_list_bytes) { - ICING_RETURN_IF_ERROR(ValidatePostingListBytes(posting_list_bytes, - mmapped_block_.region_size())); + ICING_RETURN_IF_ERROR(ValidatePostingListBytes( + posting_list_bytes, mmapped_block_->region_size())); header_->free_list_posting_list_index = kInvalidPostingListIndex; header_->next_block_index = kInvalidBlockIndex; header_->posting_list_bytes = posting_list_bytes; diff --git a/icing/index/main/index-block.h b/icing/index/main/index-block.h index 1d17e34..edf9a79 100644 --- a/icing/index/main/index-block.h +++ b/icing/index/main/index-block.h @@ -20,6 +20,7 @@ #include <algorithm> #include <limits> +#include <memory> #include <string> #include <unordered_set> #include <vector> @@ -95,6 +96,12 @@ class IndexBlock { IndexBlock(IndexBlock&&) = default; IndexBlock& operator=(IndexBlock&&) = default; + ~IndexBlock() { + if (mmapped_block_ != nullptr) { + mmapped_block_->PersistToDisk(); + } + } + // Instantiate a PostingListUsed at posting_list_index with the existing // content in the IndexBlock. 
// @@ -206,7 +213,7 @@ class IndexBlock { uint32_t block_size_in_bytes_; // MemoryMappedFile used to interact with the underlying flash block. - MemoryMappedFile mmapped_block_; + std::unique_ptr<MemoryMappedFile> mmapped_block_; }; } // namespace lib diff --git a/icing/index/main/main-index-merger.cc b/icing/index/main/main-index-merger.cc new file mode 100644 index 0000000..724cf43 --- /dev/null +++ b/icing/index/main/main-index-merger.cc @@ -0,0 +1,225 @@ +// Copyright (C) 2019 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "icing/index/main/main-index-merger.h" + +#include <cstring> +#include <memory> + +#include "icing/absl_ports/canonical_errors.h" +#include "icing/index/lite/term-id-hit-pair.h" +#include "icing/index/term-id-codec.h" +#include "icing/legacy/core/icing-string-util.h" +#include "icing/util/status-macros.h" + +namespace icing { +namespace lib { + +namespace { + +class HitSelector { + public: + // Returns whether or not term_id_hit_pair has the same term_id, document_id and section_id + // as the previously selected hits. + bool IsEquivalentHit(const TermIdHitPair& term_id_hit_pair) { + return prev_.term_id() == term_id_hit_pair.term_id() && + prev_.hit().document_id() == term_id_hit_pair.hit().document_id() && + prev_.hit().section_id() == term_id_hit_pair.hit().section_id(); + } + + // Merges term_id_hit_pair with previously added hits. 
+ void SelectIfBetter(const TermIdHitPair& term_id_hit_pair) { + if (term_id_hit_pair.hit().is_prefix_hit()) { + SelectPrefixHitIfBetter(term_id_hit_pair); + } else { + SelectExactHitIfBetter(term_id_hit_pair); + } + prev_ = term_id_hit_pair; + } + + // Adds all valid, selected hits to hits starting at position pos in hits. + // Returns the offset in hits after the position of the last added hit. + // This function may add between 0-2 hits depending on whether the HitSelector + // holds both a valid exact hit and a valid prefix hit, one of those or none. + size_t InsertSelectedHits(size_t pos, std::vector<TermIdHitPair>* hits) { + // Given highest scoring prefix/exact hits for a given + // term+docid+sectionid, push needed hits into hits array at offset + // pos. Return new pos. + if (best_prefix_hit_.hit().is_valid() && best_exact_hit_.hit().is_valid()) { + // Output both if scores are unequal. Otherwise only exact hit is + // sufficient because 1) they have the same scores and 2) any prefix query + // will also accept an exact hit. + (*hits)[pos++] = best_exact_hit_; + if (best_prefix_hit_.hit().score() != best_exact_hit_.hit().score()) { + (*hits)[pos++] = best_prefix_hit_; + // Ensure sorted. 
+ if (best_prefix_hit_.hit() < best_exact_hit_.hit()) { + std::swap((*hits)[pos - 1], (*hits)[pos - 2]); + } + } + } else if (best_prefix_hit_.hit().is_valid()) { + (*hits)[pos++] = best_prefix_hit_; + } else if (best_exact_hit_.hit().is_valid()) { + (*hits)[pos++] = best_exact_hit_; + } + + return pos; + } + + void Reset() { + best_prefix_hit_ = TermIdHitPair(); + best_exact_hit_ = TermIdHitPair(); + prev_ = TermIdHitPair(); + } + + private: + void SelectPrefixHitIfBetter(const TermIdHitPair& term_id_hit_pair) { + if (!best_prefix_hit_.hit().is_valid() || + best_prefix_hit_.hit().score() < term_id_hit_pair.hit().score()) { + best_prefix_hit_ = term_id_hit_pair; + } + } + + void SelectExactHitIfBetter(const TermIdHitPair& term_id_hit_pair) { + if (!best_exact_hit_.hit().is_valid() || + best_exact_hit_.hit().score() < term_id_hit_pair.hit().score()) { + best_exact_hit_ = term_id_hit_pair; + } + } + + TermIdHitPair best_prefix_hit_; + TermIdHitPair best_exact_hit_; + TermIdHitPair prev_; +}; + +// A helper function to dedupe hits stored in hits. Suppose that the lite index +// contained a single document with two hits in a single prefix section: "foot" +// and "fool". When expanded, there would be four hits: +// {"fo", docid0, sectionid0} +// {"fo", docid0, sectionid0} +// {"foot", docid0, sectionid0} +// {"fool", docid0, sectionid0} +// +// The first two are duplicates of each other. So, this function will dedupe +// and shrink hits to be: +// {"fo", docid0, sectionid0} +// {"foot", docid0, sectionid0} +// {"fool", docid0, sectionid0} +// +// When duplicates are encountered, we prefer the hit with the highest hit +// score. If there is both an exact and prefix hit for the same term, we prefer +// the exact hit, unless they have different scores, in which case we keep both +// them. +void DedupeHits(std::vector<TermIdHitPair>* hits) { + // Now all terms are grouped together and all hits for a term are sorted. + // Merge equivalent hits into one. 
+ std::sort(hits->begin(), hits->end(), + [](const TermIdHitPair& lhs, const TermIdHitPair& rhs) { + return lhs.value() < rhs.value(); + }); + size_t current_offset = 0; + HitSelector hit_selector; + for (const TermIdHitPair& term_id_hit_pair : *hits) { + if (!hit_selector.IsEquivalentHit(term_id_hit_pair)) { + // We've reached a new hit. Insert the previously selected hits that we + // had accumulated and reset to add this new hit. + current_offset = hit_selector.InsertSelectedHits(current_offset, hits); + hit_selector.Reset(); + } + // Update best exact and prefix hit. + hit_selector.SelectIfBetter(term_id_hit_pair); + } + + // Push last. + current_offset = hit_selector.InsertSelectedHits(current_offset, hits); + + hits->resize(current_offset); +} + +// Based on experiments with full prefix expansion, the multiplier +// is ~4x. +constexpr int kAvgPrefixesPerTerm = 4; + +} // namespace + +libtextclassifier3::StatusOr<std::vector<TermIdHitPair>> +MainIndexMerger::TranslateAndExpandLiteHits( + const LiteIndex& lite_index, const TermIdCodec& term_id_codec, + const MainIndex::LexiconMergeOutputs& lexicon_merge_outputs) { + std::vector<TermIdHitPair> hits; + if (lite_index.empty()) { + return hits; + } + // Reserve enough space for the average number of prefixes per term and the + // terms themselves. + hits.reserve(lite_index.size() * (kAvgPrefixesPerTerm + 1)); + + // Translate lite tvis to main tvis. + for (const TermIdHitPair& term_id_hit_pair : lite_index) { + uint32_t cur_term_id = term_id_hit_pair.term_id(); + ICING_ASSIGN_OR_RETURN(TermIdCodec::DecodedTermInfo cur_decoded_term, + term_id_codec.DecodeTermInfo(cur_term_id)); + Hit hit(term_id_hit_pair.hit()); + + // 1. Translate and push original. 
+ auto itr = + lexicon_merge_outputs.other_tvi_to_main_tvi.find(cur_decoded_term.tvi); + if (itr == lexicon_merge_outputs.other_tvi_to_main_tvi.cend()) { + // b/37273773 + return absl_ports::InternalError(IcingStringUtil::StringPrintf( + "Trying to translate lite tvi %u that was never added to the lexicon", + cur_decoded_term.tvi)); + } + ICING_ASSIGN_OR_RETURN(uint32_t term_id, + term_id_codec.EncodeTvi(itr->second, TviType::MAIN)); + hits.emplace_back(term_id, hit); + + // 2. Expand hits in prefix sections. + if (hit.is_in_prefix_section()) { + // Hit was in a prefix section. Push prefixes. Turn on prefix bit. + auto itr_prefixes = + lexicon_merge_outputs.other_tvi_to_prefix_main_tvis.find( + cur_decoded_term.tvi); + if (itr_prefixes == + lexicon_merge_outputs.other_tvi_to_prefix_main_tvis.end()) { + ICING_VLOG(1) << "No necessary prefix expansion for " << cur_decoded_term.tvi; + continue; + } + // The tvis of all prefixes of this hit's term that appear in the main + // lexicon are between [prefix_tvis_buf[offset], + // prefix_tvis_buf[offset+len]). + size_t offset = itr_prefixes->second.first; + size_t len = itr_prefixes->second.second; + Hit prefix_hit(hit.section_id(), hit.document_id(), hit.score(), + /*is_in_prefix_section=*/true, /*is_prefix_hit=*/true); + for (; offset < len; ++offset) { + // Take the tvi (in the main lexicon) of each prefix term. + uint32_t prefix_main_tvi = + lexicon_merge_outputs.prefix_tvis_buf[offset]; + // Convert it to a term_id. + ICING_ASSIGN_OR_RETURN( + uint32_t prefix_term_id, + term_id_codec.EncodeTvi(prefix_main_tvi, TviType::MAIN)); + // Create add an element for this prefix TermId and prefix Hit to hits. + hits.emplace_back(prefix_term_id, prefix_hit); + } + } + } + // 3. Remove any duplicate hits. 
+ DedupeHits(&hits); + return hits; +} + +} // namespace lib +} // namespace icing diff --git a/icing/index/main/main-index-merger.h b/icing/index/main/main-index-merger.h new file mode 100644 index 0000000..1413a8f --- /dev/null +++ b/icing/index/main/main-index-merger.h @@ -0,0 +1,49 @@ +// Copyright (C) 2019 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef ICING_INDEX_MAIN_MAIN_INDEX_MERGER_H_ +#define ICING_INDEX_MAIN_MAIN_INDEX_MERGER_H_ + +#include <memory> + +#include "icing/text_classifier/lib3/utils/base/statusor.h" +#include "icing/index/lite/lite-index.h" +#include "icing/index/main/main-index.h" +#include "icing/index/term-id-codec.h" + +namespace icing { +namespace lib { + +// Class used to merge hits from the lite_index and lite_lexicon into main_index +// and main_lexicon. +class MainIndexMerger { + public: + // Retrieves all hits in the lite index, translates the term ids of each + // LiteIndex::Element and expands prefix hits based on the mapping from + // lexicon_merge_outputs.other_tvi_to_prefix_main_tvis. 
+ // + // RETURNS: + // - OK on success + // - INVALID_ARGUMENT if one of the elements in the lite index has a term_id + // that exceeds the max TermId + static libtextclassifier3::StatusOr<std::vector<TermIdHitPair>> + TranslateAndExpandLiteHits( + const LiteIndex& lite_index, const TermIdCodec& term_id_codec, + const MainIndex::LexiconMergeOutputs& lexicon_merge_outputs); +}; + +} // namespace lib +} // namespace icing + +#endif // ICING_INDEX_MAIN_MAIN_INDEX_MERGER_H_ diff --git a/icing/index/main/main-index-merger_test.cc b/icing/index/main/main-index-merger_test.cc new file mode 100644 index 0000000..42b3266 --- /dev/null +++ b/icing/index/main/main-index-merger_test.cc @@ -0,0 +1,367 @@ +// Copyright (C) 2019 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+#include "icing/index/main/main-index-merger.h" + +#include "gmock/gmock.h" +#include "gtest/gtest.h" +#include "icing/absl_ports/canonical_errors.h" +#include "icing/file/filesystem.h" +#include "icing/index/iterator/doc-hit-info-iterator.h" +#include "icing/index/main/doc-hit-info-iterator-term-main.h" +#include "icing/index/main/main-index-merger.h" +#include "icing/index/main/main-index.h" +#include "icing/index/term-id-codec.h" +#include "icing/index/term-property-id.h" +#include "icing/legacy/index/icing-dynamic-trie.h" +#include "icing/legacy/index/icing-filesystem.h" +#include "icing/schema/section.h" +#include "icing/store/namespace-id.h" +#include "icing/testing/common-matchers.h" +#include "icing/testing/tmp-directory.h" + +namespace icing { +namespace lib { + +namespace { + +using ::testing::UnorderedElementsAre; + +class MainIndexMergerTest : public testing::Test { + protected: + void SetUp() override { + index_dir_ = GetTestTempDir() + "/test_dir"; + ASSERT_TRUE(filesystem_.CreateDirectoryRecursively(index_dir_.c_str())); + + std::string lite_index_file_name = index_dir_ + "/test_file.lite-idx.index"; + LiteIndex::Options options(lite_index_file_name, + /*hit_buffer_want_merge_bytes=*/1024 * 1024); + ICING_ASSERT_OK_AND_ASSIGN(lite_index_, + LiteIndex::Create(options, &icing_filesystem_)); + + ICING_ASSERT_OK_AND_ASSIGN( + term_id_codec_, + TermIdCodec::Create( + IcingDynamicTrie::max_value_index(IcingDynamicTrie::Options()), + IcingDynamicTrie::max_value_index(options.lexicon_options))); + } + + void TearDown() override { + ASSERT_TRUE(filesystem_.DeleteDirectoryRecursively(index_dir_.c_str())); + } + + std::string index_dir_; + Filesystem filesystem_; + IcingFilesystem icing_filesystem_; + std::unique_ptr<LiteIndex> lite_index_; + std::unique_ptr<TermIdCodec> term_id_codec_; +}; + +constexpr NamespaceId kNamespace0 = 0; + +TEST_F(MainIndexMergerTest, TranslateTermNotAdded) { + // 1. 
Index two docs in the Lite Index: + // - Doc0 {"foot" is_in_prefix_section=FALSE} + // - Doc1 {"fool", is_in_prefix_section=FALSE} + ICING_ASSERT_OK_AND_ASSIGN( + uint32_t foot_tvi, + lite_index_->InsertTerm("foot", TermMatchType::PREFIX, kNamespace0)); + ICING_ASSERT_OK_AND_ASSIGN( + uint32_t foot_term_id, + term_id_codec_->EncodeTvi(foot_tvi, TviType::LITE)); + ICING_ASSERT_OK_AND_ASSIGN( + uint32_t fool_tvi, + lite_index_->InsertTerm("fool", TermMatchType::PREFIX, kNamespace0)); + ICING_ASSERT_OK_AND_ASSIGN( + uint32_t fool_term_id, + term_id_codec_->EncodeTvi(fool_tvi, TviType::LITE)); + + Hit doc0_hit(/*section_id=*/0, /*document_id=*/0, /*score=*/57, + /*is_in_prefix_section=*/false); + ICING_ASSERT_OK(lite_index_->AddHit(foot_term_id, doc0_hit)); + Hit doc1_hit(/*section_id=*/0, /*document_id=*/1, Hit::kMaxHitScore, + /*is_in_prefix_section=*/false); + ICING_ASSERT_OK(lite_index_->AddHit(fool_term_id, doc1_hit)); + + // 2. Build up a fake LexiconMergeOutputs + // This is some made up number that doesn't matter for this test. + uint32_t foot_main_tvi = 5; + + // Only create a mapping for 'foot'. Leave out the mapping for 'fool' + MainIndex::LexiconMergeOutputs lexicon_outputs; + lexicon_outputs.other_tvi_to_main_tvi.emplace(foot_tvi, foot_main_tvi); + + // 3. TranslateAndExpand should fail because 'fool' doesn't have a main tvi + // mapping. + ASSERT_THAT(MainIndexMerger::TranslateAndExpandLiteHits( + *lite_index_, *term_id_codec_, lexicon_outputs), + StatusIs(libtextclassifier3::StatusCode::INTERNAL)); +} + +TEST_F(MainIndexMergerTest, PrefixExpansion) { + // 1. 
Index two docs in the Lite Index: + // - Doc0 {"foot" is_in_prefix_section=FALSE} + // - Doc1 {"fool", is_in_prefix_section=TRUE} + ICING_ASSERT_OK_AND_ASSIGN( + uint32_t foot_tvi, + lite_index_->InsertTerm("foot", TermMatchType::PREFIX, kNamespace0)); + ICING_ASSERT_OK_AND_ASSIGN( + uint32_t foot_term_id, + term_id_codec_->EncodeTvi(foot_tvi, TviType::LITE)); + ICING_ASSERT_OK_AND_ASSIGN( + uint32_t fool_tvi, + lite_index_->InsertTerm("fool", TermMatchType::PREFIX, kNamespace0)); + ICING_ASSERT_OK_AND_ASSIGN( + uint32_t fool_term_id, + term_id_codec_->EncodeTvi(fool_tvi, TviType::LITE)); + + Hit doc0_hit(/*section_id=*/0, /*document_id=*/0, /*score=*/57, + /*is_in_prefix_section=*/false); + ICING_ASSERT_OK(lite_index_->AddHit(foot_term_id, doc0_hit)); + Hit doc1_hit(/*section_id=*/0, /*document_id=*/1, Hit::kMaxHitScore, + /*is_in_prefix_section=*/true); + ICING_ASSERT_OK(lite_index_->AddHit(fool_term_id, doc1_hit)); + + // 2. Build up a fake LexiconMergeOutputs + // This is some made up number that doesn't matter for this test. + uint32_t foo_main_tvi = 12; + ICING_ASSERT_OK_AND_ASSIGN( + uint32_t foo_term_id, + term_id_codec_->EncodeTvi(foo_main_tvi, TviType::MAIN)); + Hit doc1_prefix_hit(/*section_id=*/0, /*document_id=*/1, Hit::kMaxHitScore, + /*is_in_prefix_section=*/true, /*is_prefix_hit=*/true); + + uint32_t foot_main_tvi = 5; + ICING_ASSERT_OK_AND_ASSIGN( + uint32_t foot_main_term_id, + term_id_codec_->EncodeTvi(foot_main_tvi, TviType::MAIN)); + uint32_t fool_main_tvi = 10; + ICING_ASSERT_OK_AND_ASSIGN( + uint32_t fool_main_term_id, + term_id_codec_->EncodeTvi(fool_main_tvi, TviType::MAIN)); + + MainIndex::LexiconMergeOutputs lexicon_outputs; + // Map "fool" to it's prefix hit for "foo". 
+ lexicon_outputs.other_tvi_to_prefix_main_tvis.emplace(fool_tvi, + std::make_pair(0, 1)); + lexicon_outputs.prefix_tvis_buf.push_back(foo_main_tvi); + lexicon_outputs.other_tvi_to_main_tvi.emplace(foot_tvi, foot_main_tvi); + lexicon_outputs.other_tvi_to_main_tvi.emplace(fool_tvi, fool_main_tvi); + + // 3. TranslateAndExpand should; + // a. Translate lite term ids to main term ids based on the map + // b. Expand 'fool' to have a hit for 'foo' + ICING_ASSERT_OK_AND_ASSIGN( + std::vector<TermIdHitPair> expanded_elts, + MainIndexMerger::TranslateAndExpandLiteHits(*lite_index_, *term_id_codec_, + lexicon_outputs)); + EXPECT_THAT(expanded_elts, UnorderedElementsAre( + TermIdHitPair(foot_main_term_id, doc0_hit), + TermIdHitPair(fool_main_term_id, doc1_hit), + TermIdHitPair(foo_term_id, doc1_prefix_hit))); +} + +TEST_F(MainIndexMergerTest, DedupePrefixAndExactWithDifferentScores) { + // 1. Index one doc in the Lite Index: + // - Doc0 {"foot" "foo" is_in_prefix_section=TRUE} + ICING_ASSERT_OK_AND_ASSIGN( + uint32_t foot_tvi, + lite_index_->InsertTerm("foot", TermMatchType::PREFIX, kNamespace0)); + ICING_ASSERT_OK_AND_ASSIGN( + uint32_t foot_term_id, + term_id_codec_->EncodeTvi(foot_tvi, TviType::LITE)); + ICING_ASSERT_OK_AND_ASSIGN( + uint32_t foo_tvi, + lite_index_->InsertTerm("foo", TermMatchType::PREFIX, kNamespace0)); + ICING_ASSERT_OK_AND_ASSIGN(uint32_t foo_term_id, + term_id_codec_->EncodeTvi(foo_tvi, TviType::LITE)); + + Hit foot_doc0_hit(/*section_id=*/0, /*document_id=*/0, /*score=*/57, + /*is_in_prefix_section=*/true); + ICING_ASSERT_OK(lite_index_->AddHit(foot_term_id, foot_doc0_hit)); + Hit foo_doc0_hit(/*section_id=*/0, /*document_id=*/0, Hit::kMaxHitScore, + /*is_in_prefix_section=*/true); + ICING_ASSERT_OK(lite_index_->AddHit(foo_term_id, foo_doc0_hit)); + + // 2. Build up a fake LexiconMergeOutputs + // This is some made up number that doesn't matter for this test. 
+ uint32_t foo_main_tvi = 12; + ICING_ASSERT_OK_AND_ASSIGN( + uint32_t foo_main_term_id, + term_id_codec_->EncodeTvi(foo_main_tvi, TviType::MAIN)); + // The prefix hit for 'foot' should have the same score as the exact hit for + // 'foot'. + Hit doc0_prefix_hit(/*section_id=*/0, /*document_id=*/0, /*score=*/57, + /*is_in_prefix_section=*/true, /*is_prefix_hit=*/true); + + uint32_t foot_main_tvi = 5; + ICING_ASSERT_OK_AND_ASSIGN( + uint32_t foot_main_term_id, + term_id_codec_->EncodeTvi(foot_main_tvi, TviType::MAIN)); + + MainIndex::LexiconMergeOutputs lexicon_outputs; + // Map "foot" to it's prefix hit for "foo". + lexicon_outputs.other_tvi_to_prefix_main_tvis.emplace(foot_tvi, + std::make_pair(0, 1)); + lexicon_outputs.prefix_tvis_buf.push_back(foo_main_tvi); + lexicon_outputs.other_tvi_to_main_tvi.emplace(foot_tvi, foot_main_tvi); + lexicon_outputs.other_tvi_to_main_tvi.emplace(foo_tvi, foo_main_tvi); + + // 3. TranslateAndExpand should; + // a. Translate lite term ids to main term ids based on the map + // b. Expand 'foot' to have a hit for 'foo' + // c. Keep both the exact hit for 'foo' and the prefix hit for 'foot' + // because they have different scores. + ICING_ASSERT_OK_AND_ASSIGN( + std::vector<TermIdHitPair> expanded_elts, + MainIndexMerger::TranslateAndExpandLiteHits(*lite_index_, *term_id_codec_, + lexicon_outputs)); + EXPECT_THAT( + expanded_elts, + UnorderedElementsAre(TermIdHitPair(foot_main_term_id, foot_doc0_hit), + TermIdHitPair(foo_main_term_id, foo_doc0_hit), + TermIdHitPair(foo_main_term_id, doc0_prefix_hit))); +} + +TEST_F(MainIndexMergerTest, DedupeWithExactSameScores) { + // 1. 
Index one doc in the Lite Index: + // - Doc0 {"foot" "foo" is_in_prefix_section=TRUE} + ICING_ASSERT_OK_AND_ASSIGN( + uint32_t foot_tvi, + lite_index_->InsertTerm("foot", TermMatchType::PREFIX, kNamespace0)); + ICING_ASSERT_OK_AND_ASSIGN( + uint32_t foot_term_id, + term_id_codec_->EncodeTvi(foot_tvi, TviType::LITE)); + ICING_ASSERT_OK_AND_ASSIGN( + uint32_t foo_tvi, + lite_index_->InsertTerm("foo", TermMatchType::PREFIX, kNamespace0)); + ICING_ASSERT_OK_AND_ASSIGN(uint32_t foo_term_id, + term_id_codec_->EncodeTvi(foo_tvi, TviType::LITE)); + + Hit foot_doc0_hit(/*section_id=*/0, /*document_id=*/0, /*score=*/57, + /*is_in_prefix_section=*/true); + ICING_ASSERT_OK(lite_index_->AddHit(foot_term_id, foot_doc0_hit)); + Hit foo_doc0_hit(/*section_id=*/0, /*document_id=*/0, /*score=*/57, + /*is_in_prefix_section=*/true); + ICING_ASSERT_OK(lite_index_->AddHit(foo_term_id, foo_doc0_hit)); + + // 2. Build up a fake LexiconMergeOutputs + // This is some made up number that doesn't matter for this test. + uint32_t foo_main_tvi = 12; + ICING_ASSERT_OK_AND_ASSIGN( + uint32_t foo_main_term_id, + term_id_codec_->EncodeTvi(foo_main_tvi, TviType::MAIN)); + + uint32_t foot_main_tvi = 5; + ICING_ASSERT_OK_AND_ASSIGN( + uint32_t foot_main_term_id, + term_id_codec_->EncodeTvi(foot_main_tvi, TviType::MAIN)); + + MainIndex::LexiconMergeOutputs lexicon_outputs; + // Map "foot" to it's prefix hit for "foo". + lexicon_outputs.other_tvi_to_prefix_main_tvis.emplace(foot_tvi, + std::make_pair(0, 1)); + lexicon_outputs.prefix_tvis_buf.push_back(foo_main_tvi); + lexicon_outputs.other_tvi_to_main_tvi.emplace(foot_tvi, foot_main_tvi); + lexicon_outputs.other_tvi_to_main_tvi.emplace(foo_tvi, foo_main_tvi); + + // 3. TranslateAndExpand should; + // a. Translate lite term ids to main term ids based on the map + // b. Expand 'foot' to have a hit for 'foo' + // c. Keep only the exact hit for 'foo' since they both have the same hit + // score. 
+ ICING_ASSERT_OK_AND_ASSIGN( + std::vector<TermIdHitPair> expanded_elts, + MainIndexMerger::TranslateAndExpandLiteHits(*lite_index_, *term_id_codec_, + lexicon_outputs)); + EXPECT_THAT( + expanded_elts, + UnorderedElementsAre(TermIdHitPair(foot_main_term_id, foot_doc0_hit), + TermIdHitPair(foo_main_term_id, foo_doc0_hit))); +} + +TEST_F(MainIndexMergerTest, DedupePrefixExpansion) { + // 1. Index one doc in the Lite Index: + // - Doc0 {"foot" "fool" is_in_prefix_section=TRUE} + ICING_ASSERT_OK_AND_ASSIGN( + uint32_t foot_tvi, + lite_index_->InsertTerm("foot", TermMatchType::PREFIX, kNamespace0)); + ICING_ASSERT_OK_AND_ASSIGN( + uint32_t foot_term_id, + term_id_codec_->EncodeTvi(foot_tvi, TviType::LITE)); + ICING_ASSERT_OK_AND_ASSIGN( + uint32_t fool_tvi, + lite_index_->InsertTerm("fool", TermMatchType::PREFIX, kNamespace0)); + ICING_ASSERT_OK_AND_ASSIGN( + uint32_t fool_term_id, + term_id_codec_->EncodeTvi(fool_tvi, TviType::LITE)); + + Hit foot_doc0_hit(/*section_id=*/0, /*document_id=*/0, /*score=*/57, + /*is_in_prefix_section=*/true); + ICING_ASSERT_OK(lite_index_->AddHit(foot_term_id, foot_doc0_hit)); + Hit fool_doc0_hit(/*section_id=*/0, /*document_id=*/0, Hit::kMaxHitScore, + /*is_in_prefix_section=*/true); + ICING_ASSERT_OK(lite_index_->AddHit(fool_term_id, fool_doc0_hit)); + + // 2. Build up a fake LexiconMergeOutputs + // This is some made up number that doesn't matter for this test. + uint32_t foo_main_tvi = 12; + ICING_ASSERT_OK_AND_ASSIGN( + uint32_t foo_term_id, + term_id_codec_->EncodeTvi(foo_main_tvi, TviType::MAIN)); + // The prefix hit should take the best score - MaxHitScore when merging these + // two. 
+ Hit doc0_prefix_hit(/*section_id=*/0, /*document_id=*/0, Hit::kMaxHitScore, + /*is_in_prefix_section=*/true, /*is_prefix_hit=*/true); + + uint32_t foot_main_tvi = 5; + ICING_ASSERT_OK_AND_ASSIGN( + uint32_t foot_main_term_id, + term_id_codec_->EncodeTvi(foot_main_tvi, TviType::MAIN)); + uint32_t fool_main_tvi = 10; + ICING_ASSERT_OK_AND_ASSIGN( + uint32_t fool_main_term_id, + term_id_codec_->EncodeTvi(fool_main_tvi, TviType::MAIN)); + + MainIndex::LexiconMergeOutputs lexicon_outputs; + // Map "fool" to it's prefix hit for "foo" and "foot" to it's prefix hit for + // "foo". + lexicon_outputs.other_tvi_to_prefix_main_tvis.emplace(fool_tvi, + std::make_pair(0, 1)); + lexicon_outputs.prefix_tvis_buf.push_back(foo_main_tvi); + lexicon_outputs.other_tvi_to_prefix_main_tvis.emplace(foot_tvi, + std::make_pair(1, 1)); + lexicon_outputs.prefix_tvis_buf.push_back(foo_main_tvi); + lexicon_outputs.other_tvi_to_main_tvi.emplace(foot_tvi, foot_main_tvi); + lexicon_outputs.other_tvi_to_main_tvi.emplace(fool_tvi, fool_main_tvi); + + // 3. TranslateAndExpand should; + // a. Translate lite term ids to main term ids based on the map + // b. Expand 'foot' and 'fool' to have hits for 'foo' + // c. Merge the prefix hits from 'foot' and 'fool', taking the best hit + // score. 
+ ICING_ASSERT_OK_AND_ASSIGN( + std::vector<TermIdHitPair> expanded_elts, + MainIndexMerger::TranslateAndExpandLiteHits(*lite_index_, *term_id_codec_, + lexicon_outputs)); + EXPECT_THAT( + expanded_elts, + UnorderedElementsAre(TermIdHitPair(foot_main_term_id, foot_doc0_hit), + TermIdHitPair(fool_main_term_id, fool_doc0_hit), + TermIdHitPair(foo_term_id, doc0_prefix_hit))); +} + +} // namespace + +} // namespace lib +} // namespace icing diff --git a/icing/index/main/main-index.cc b/icing/index/main/main-index.cc new file mode 100644 index 0000000..a0297c2 --- /dev/null +++ b/icing/index/main/main-index.cc @@ -0,0 +1,497 @@ +// Copyright (C) 2019 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +#include "icing/index/main/main-index.h" + +#include <cstring> +#include <memory> + +#include "icing/absl_ports/canonical_errors.h" +#include "icing/absl_ports/str_cat.h" +#include "icing/index/term-id-codec.h" +#include "icing/index/term-property-id.h" +#include "icing/legacy/index/icing-dynamic-trie.h" +#include "icing/util/status-macros.h" + +namespace icing { +namespace lib { + +namespace { + +// Finds the shortest,valid prefix term with prefix hits in lexicon for which +// "prefix" is a prefix. +// Returns a valid FindTermResult with found=true if either: +// 1. prefix exists as a term in lexicon. +// 2. the shortest, valid prefix in the lexicon exists and contains prefix +// hits. 
+// Returns a FindTermResult with found=false and undefined values of tvi and +// exact if no term was found. +struct FindTermResult { + // TVI of the term that was found. Undefined if found=false. + uint32_t tvi; + // Whether or not a valid term with prefix hits was found. + bool found; + // Whether or not that term is equal to 'prefix' + bool exact; +}; +FindTermResult FindShortestValidTermWithPrefixHits( + const IcingDynamicTrie* lexicon, const std::string& prefix) { + // For prefix indexing: when we are doing a prefix match for "prefix", find + // the tvi to the equivalent posting list. prefix's own posting list might not + // exist but one of its children acts as a proxy. + IcingDynamicTrie::PropertyReader hits_in_prefix_section( + *lexicon, GetHasHitsInPrefixSectionPropertyId()); + uint32_t tvi = 0; + bool found = false; + bool exact = false; + for (IcingDynamicTrie::Iterator it(*lexicon, prefix.c_str()); it.IsValid(); + it.Advance()) { + PostingListIdentifier posting_list_id = PostingListIdentifier::kInvalid; + memcpy(&posting_list_id, it.GetValue(), sizeof(posting_list_id)); + + // Posting list id might be invalid if this is also a backfill term. + // Suppose that the main index has two pre-existing prefix hits "foot" and + // "fool" - it will have a branch point posting list for "foo". Then, let's + // suppose that the other index adds hits for "foul", "four" and "far". This + // will result in branch points for "fo" and "f". + // If "fo" was added before "f", then the iterator would first give us "fo". + // "fo" will have an invalid posting_list_id because it hasn't been + // backfilled yet, so we need to continue iterating to "foo". + if (posting_list_id.is_valid()) { + exact = (prefix.size() == strlen(it.GetKey())); + tvi = it.GetValueIndex(); + // Found it. Does it have prefix hits? 
+ found = exact || hits_in_prefix_section.HasProperty(tvi); + break; + } + } + FindTermResult result = {tvi, found, exact}; + return result; +} + +} // namespace + +libtextclassifier3::StatusOr<MainIndex> MainIndex::Create( + const std::string& index_filename, const Filesystem* filesystem, + const IcingFilesystem* icing_filesystem) { + ICING_RETURN_ERROR_IF_NULL(filesystem); + ICING_RETURN_ERROR_IF_NULL(icing_filesystem); + MainIndex main_index; + ICING_RETURN_IF_ERROR( + main_index.Init(index_filename, filesystem, icing_filesystem)); + return main_index; +} + +// TODO(b/139087650) : Migrate off of IcingFilesystem. +libtextclassifier3::Status MainIndex::Init( + const std::string& index_filename, const Filesystem* filesystem, + const IcingFilesystem* icing_filesystem) { + std::string flash_index_file = index_filename + "-main-index"; + ICING_ASSIGN_OR_RETURN( + FlashIndexStorage flash_index, + FlashIndexStorage::Create(flash_index_file, filesystem)); + flash_index_storage_ = + std::make_unique<FlashIndexStorage>(std::move(flash_index)); + + std::string lexicon_file = index_filename + "-main-lexicon"; + IcingDynamicTrie::RuntimeOptions runtime_options; + main_lexicon_ = std::make_unique<IcingDynamicTrie>( + lexicon_file, runtime_options, icing_filesystem); + IcingDynamicTrie::Options lexicon_options; + if (!main_lexicon_->CreateIfNotExist(lexicon_options) || + !main_lexicon_->Init()) { + return absl_ports::InternalError("Failed to initialize lexicon trie"); + } + return libtextclassifier3::Status::OK; +} + +libtextclassifier3::StatusOr<std::unique_ptr<PostingListAccessor>> +MainIndex::GetAccessorForExactTerm(const std::string& term) { + PostingListIdentifier posting_list_id = PostingListIdentifier::kInvalid; + if (!main_lexicon_->Find(term.c_str(), &posting_list_id)) { + return absl_ports::NotFoundError(IcingStringUtil::StringPrintf( + "Term %s is not present in main lexicon.", term.c_str())); + } + ICING_ASSIGN_OR_RETURN(PostingListAccessor accessor, + 
PostingListAccessor::CreateFromExisting( + flash_index_storage_.get(), posting_list_id)); + return std::make_unique<PostingListAccessor>(std::move(accessor)); +} + +libtextclassifier3::StatusOr<MainIndex::GetPrefixAccessorResult> +MainIndex::GetAccessorForPrefixTerm(const std::string& prefix) { + bool exact = false; + // For prefix indexing: when we are doing a prefix match for + // "prefix", find the tvi to the equivalent posting list. prefix's + // own posting list might not exist but its shortest child acts as a proxy. + // + // For example, if there are only two hits in the index are prefix hits for + // "bar" and "bat", then both will appear on a posting list for "ba". "b" + // won't have a posting list, but "ba" will suffice. + IcingDynamicTrie::PropertyReader hits_in_prefix_section( + *main_lexicon_, GetHasHitsInPrefixSectionPropertyId()); + IcingDynamicTrie::Iterator main_itr(*main_lexicon_, prefix.c_str()); + if (!main_itr.IsValid()) { + return absl_ports::NotFoundError(IcingStringUtil::StringPrintf( + "Term: %s is not present in the main lexicon.", prefix.c_str())); + } + exact = (prefix.length() == strlen(main_itr.GetKey())); + + if (!exact && !hits_in_prefix_section.HasProperty(main_itr.GetValueIndex())) { + // Found it, but it doesn't have prefix hits. Exit early. No need to + // retrieve the posting list because there's nothing there for us. 
+ return libtextclassifier3::Status::OK; + } + PostingListIdentifier posting_list_id = PostingListIdentifier::kInvalid; + memcpy(&posting_list_id, main_itr.GetValue(), sizeof(posting_list_id)); + ICING_ASSIGN_OR_RETURN(PostingListAccessor pl_accessor, + PostingListAccessor::CreateFromExisting( + flash_index_storage_.get(), posting_list_id)); + GetPrefixAccessorResult result = {std::make_unique<PostingListAccessor>(std::move(pl_accessor)), exact}; + return result; +} + +libtextclassifier3::StatusOr<MainIndex::LexiconMergeOutputs> +MainIndex::AddBackfillBranchPoints(const IcingDynamicTrie& other_lexicon) { + // Maps new branching points in main lexicon to the term such that + // branching_point_term is a prefix of term and there are no terms smaller + // than term and greater than branching_point_term. + std::string prefix; + LexiconMergeOutputs outputs; + for (IcingDynamicTrie::Iterator other_term_itr(other_lexicon, /*prefix=*/""); + other_term_itr.IsValid(); other_term_itr.Advance()) { + // If term were inserted in the main lexicon, what new branching would it + // create? (It always creates at most one.) + int prefix_len = main_lexicon_->FindNewBranchingPrefixLength( + other_term_itr.GetKey(), /*utf8=*/true); + if (prefix_len <= 0) { + continue; + } + prefix.assign(other_term_itr.GetKey(), prefix_len); + + // Figure out backfill tvi. Might not exist since all children terms could + // only contain hits from non-prefix sections. + // + // Ex. Suppose that the main lexicon contains "foot" and "fool" and that + // we're adding "foul". The new branching prefix will be "fo". The backfill + // prefix will be "foo" - all hits in prefix section on "foo" will need to + // be added to the new "fo" posting list later. + FindTermResult result = + FindShortestValidTermWithPrefixHits(main_lexicon_.get(), prefix); + if (!result.found || result.exact) { + continue; + } + + // This is a new prefix that will need backfilling from its next-in-line + // posting list. 
This new prefix will have to have a posting list eventually + // so insert a default PostingListIdentifier as a placeholder. + uint32_t branching_prefix_tvi; + bool new_key; + PostingListIdentifier posting_list_id = PostingListIdentifier::kInvalid; + if (!main_lexicon_->Insert(prefix.c_str(), &posting_list_id, + &branching_prefix_tvi, false, &new_key)) { + return absl_ports::InternalError("Could not insert branching prefix"); + } + + // Backfills only contain prefix hits by default. So set these here but + // could be overridden when adding hits from the other index later. + if (!main_lexicon_->SetProperty(branching_prefix_tvi, + GetHasNoExactHitsPropertyId()) || + !main_lexicon_->SetProperty(branching_prefix_tvi, + GetHasHitsInPrefixSectionPropertyId())) { + return absl_ports::InternalError("Setting prefix prop failed"); + } + + outputs.backfill_map[branching_prefix_tvi] = result.tvi; + } + return outputs; +} + +libtextclassifier3::StatusOr<MainIndex::LexiconMergeOutputs> +MainIndex::AddTerms(const IcingDynamicTrie& other_lexicon, + LexiconMergeOutputs&& outputs) { + IcingDynamicTrie::PropertyReadersAll new_term_prop_readers(other_lexicon); + for (IcingDynamicTrie::Iterator other_term_itr(other_lexicon, /*prefix=*/""); + other_term_itr.IsValid(); other_term_itr.Advance()) { + uint32_t new_main_tvi; + PostingListIdentifier posting_list_id = PostingListIdentifier::kInvalid; + if (!main_lexicon_->Insert(other_term_itr.GetKey(), &posting_list_id, + &new_main_tvi, + /*replace=*/false)) { + return absl_ports::InternalError(absl_ports::StrCat( + "Could not insert term: ", other_term_itr.GetKey())); + } + + // Copy the properties from the other lexicon over to the main lexicon. + uint32_t other_tvi = other_term_itr.GetValueIndex(); + if (!CopyProperties(new_term_prop_readers, other_lexicon, other_tvi, + new_main_tvi)) { + return absl_ports::InternalError(absl_ports::StrCat( + "Could not insert term: ", other_term_itr.GetKey())); + } + + // Add other to main mapping. 
+ outputs.other_tvi_to_main_tvi.emplace(other_tvi, new_main_tvi); + } + return std::move(outputs); +} + +libtextclassifier3::StatusOr<MainIndex::LexiconMergeOutputs> +MainIndex::AddBranchPoints(const IcingDynamicTrie& other_lexicon, + LexiconMergeOutputs&& outputs) { + IcingDynamicTrie::PropertyReader has_prefix_prop_reader( + other_lexicon, GetHasHitsInPrefixSectionPropertyId()); + if (!has_prefix_prop_reader.Exists()) { + return std::move(outputs); + } + std::string prefix; + for (IcingDynamicTrie::Iterator other_term_itr(other_lexicon, /*prefix=*/""); + other_term_itr.IsValid(); other_term_itr.Advance()) { + // Only expand terms that have hits in prefix sections. + if (!has_prefix_prop_reader.HasProperty(other_term_itr.GetValueIndex())) { + continue; + } + + // Get prefixes where there is already a branching point in the main + // lexicon. We skip prefixes which don't already have a branching point. + std::vector<int> prefix_lengths = main_lexicon_->FindBranchingPrefixLengths( + other_term_itr.GetKey(), /*utf8=*/true); + + int buf_start = outputs.prefix_tvis_buf.size(); + // Add prefixes. + for (int prefix_length : prefix_lengths) { + if (prefix_length <= 0) { + continue; + } + + prefix.assign(other_term_itr.GetKey(), prefix_length); + uint32_t prefix_tvi; + bool new_key; + PostingListIdentifier posting_list_identifier = + PostingListIdentifier::kInvalid; + if (!main_lexicon_->Insert(prefix.c_str(), &posting_list_identifier, + &prefix_tvi, /*replace=*/false, &new_key)) { + return absl_ports::InternalError( + absl_ports::StrCat("Could not insert prefix: ", prefix)); + } + + // Prefix tvi will have hits in prefix section. + if (!main_lexicon_->SetProperty(prefix_tvi, + GetHasHitsInPrefixSectionPropertyId())) { + return absl_ports::InternalError( + "Setting has hits in prefix section prop failed"); + } + + // If it hasn't been added by non-prefix term insertions in + // AddBackfillBranchPoints and AddTerms, it is a prefix-only term. 
+ if (new_key && !main_lexicon_->SetProperty( + prefix_tvi, GetHasNoExactHitsPropertyId())) { + return absl_ports::InternalError("Setting no exact hits prop failed"); + } + + outputs.prefix_tvis_buf.push_back(prefix_tvi); + } + + // Any prefixes added? Then add to map. + if (buf_start < outputs.prefix_tvis_buf.size()) { + outputs.other_tvi_to_prefix_main_tvis[other_term_itr.GetValueIndex()] = { + buf_start, outputs.prefix_tvis_buf.size() - buf_start}; + } + } + return std::move(outputs); +} + +bool MainIndex::CopyProperties( + const IcingDynamicTrie::PropertyReadersAll& prop_reader, + const IcingDynamicTrie& other_lexicon, uint32_t other_tvi, + uint32_t new_main_tvi) { + for (uint32_t property_id = 0; property_id < prop_reader.size(); + ++property_id) { + if (property_id == GetHasNoExactHitsPropertyId()) { + // HasNoExactHitsProperty is an inverse. If other_lexicon has exact hits + // for this term, then HasNoExactHits needs to be set to false in + // main_lexicon. If other_lexicon has no exact hits for this term, then + // HasNoExactHits in the main_lexicon should not be modified. + if (!prop_reader.HasProperty(property_id, other_tvi) && + !main_lexicon_->ClearProperty(new_main_tvi, property_id)) { + ICING_LOG(ERROR) << "Clearing HasNoExactHitsProperty failed"; + return false; + } + } else { + // If other_lexicon has this property set for this term, then that + // property needs to be set for the main_lexicon. If other_lexicon + // doesn't have this property set, then the property in the main lexicon + // should not be modified. 
+ if (prop_reader.HasProperty(property_id, other_tvi) && + !main_lexicon_->SetProperty(new_main_tvi, property_id)) { + return false; + } + } + } + return true; +} + +libtextclassifier3::Status MainIndex::AddHits( + const TermIdCodec& term_id_codec, + std::unordered_map<uint32_t, uint32_t>&& backfill_map, + std::vector<TermIdHitPair>&& hits) { + if (hits.empty()) { + return libtextclassifier3::Status::OK; + } + uint32_t cur_term_id = hits[0].term_id(); + ICING_ASSIGN_OR_RETURN(TermIdCodec::DecodedTermInfo cur_decoded_term, + term_id_codec.DecodeTermInfo(cur_term_id)); + // Iterate through all hits. If these hits are for a term that also needs + // backfill, then backfill first and then add the new hits. + size_t k_start = 0; + size_t k_end = 0; + while (k_start < hits.size()) { + uint32_t term_id = hits[k_end].term_id(); + while (term_id == cur_term_id && ++k_end < hits.size()) { + term_id = hits[k_end].term_id(); + } + + // Look for backfill. + PostingListIdentifier backfill_posting_list_id = + PostingListIdentifier::kInvalid; + auto itr = backfill_map.find(cur_decoded_term.tvi); + if (itr != backfill_map.end()) { + const void* value = main_lexicon_->GetValueAtIndex(itr->second); + memcpy(&backfill_posting_list_id, value, + sizeof(backfill_posting_list_id)); + backfill_map.erase(itr); + } + ICING_RETURN_IF_ERROR(AddHitsForTerm(cur_decoded_term.tvi, + backfill_posting_list_id, + &hits[k_start], k_end - k_start)); + cur_term_id = term_id; + ICING_ASSIGN_OR_RETURN(cur_decoded_term, + term_id_codec.DecodeTermInfo(cur_term_id)); + k_start = k_end; + } + + // Now copy remaining backfills. 
+ ICING_VLOG(2) << IcingStringUtil::StringPrintf("Remaining backfills %zu", + backfill_map.size()); + for (auto other_tvi_main_tvi_pair : backfill_map) { + PostingListIdentifier backfill_posting_list_id = + PostingListIdentifier::kInvalid; + memcpy(&backfill_posting_list_id, + main_lexicon_->GetValueAtIndex(other_tvi_main_tvi_pair.second), + sizeof(backfill_posting_list_id)); + ICING_ASSIGN_OR_RETURN( + PostingListAccessor hit_accum, + PostingListAccessor::Create(flash_index_storage_.get())); + ICING_RETURN_IF_ERROR( + AddPrefixBackfillHits(backfill_posting_list_id, &hit_accum)); + PostingListAccessor::FinalizeResult result = + PostingListAccessor::Finalize(std::move(hit_accum)); + if (result.id.is_valid()) { + main_lexicon_->SetValueAtIndex(other_tvi_main_tvi_pair.first, &result.id); + } + } + return libtextclassifier3::Status::OK; +} + +libtextclassifier3::Status MainIndex::AddHitsForTerm( + uint32_t tvi, PostingListIdentifier backfill_posting_list_id, + const TermIdHitPair* hit_elements, size_t len) { + // 1. Create a PostingListAccessor - either from the pre-existing block, if + // one exists, or from scratch. + PostingListIdentifier posting_list_id = PostingListIdentifier::kInvalid; + memcpy(&posting_list_id, main_lexicon_->GetValueAtIndex(tvi), + sizeof(posting_list_id)); + std::unique_ptr<PostingListAccessor> pl_accessor; + if (posting_list_id.is_valid()) { + if (posting_list_id.block_index() >= flash_index_storage_->num_blocks()) { + ICING_LOG(ERROR) << IcingStringUtil::StringPrintf( + "Index dropped hits. Invalid block index %u >= %u", + posting_list_id.block_index(), flash_index_storage_->num_blocks()); + // TODO(b/159918304) : Consider revising the checksumming strategy in the + // main index. Providing some mechanism to check for corruption - either + // during initialization or some later time would allow us to avoid + // whack-a-mole with odd corruption issues like this one (b/62820689). 
+ return absl_ports::InternalError( + "Valid posting list has an invalid block index!"); + } + ICING_ASSIGN_OR_RETURN(PostingListAccessor tmp, + PostingListAccessor::CreateFromExisting( + flash_index_storage_.get(), posting_list_id)); + pl_accessor = std::make_unique<PostingListAccessor>(std::move(tmp)); + } else { + // New posting list. + ICING_ASSIGN_OR_RETURN( + PostingListAccessor tmp, + PostingListAccessor::Create(flash_index_storage_.get())); + pl_accessor = std::make_unique<PostingListAccessor>(std::move(tmp)); + } + + // 2. Backfill any hits if necessary. + if (backfill_posting_list_id.is_valid()) { + ICING_RETURN_IF_ERROR( + AddPrefixBackfillHits(backfill_posting_list_id, pl_accessor.get())); + } + + // 3. Add all the new hits. + for (int i = len - 1; i >= 0; --i) { + Hit hit = hit_elements[i].hit(); + ICING_RETURN_IF_ERROR(pl_accessor->PrependHit(hit)); + } + + // 4. Finalize this posting list and put its identifier in the lexicon. + PostingListAccessor::FinalizeResult result = + PostingListAccessor::Finalize(std::move(*pl_accessor)); + if (result.id.is_valid()) { + main_lexicon_->SetValueAtIndex(tvi, &result.id); + } + return libtextclassifier3::Status::OK; +} + +libtextclassifier3::Status MainIndex::AddPrefixBackfillHits( + PostingListIdentifier backfill_posting_list_id, + PostingListAccessor* hit_accum) { + ICING_ASSIGN_OR_RETURN( + PostingListAccessor backfill_accessor, + PostingListAccessor::CreateFromExisting(flash_index_storage_.get(), + backfill_posting_list_id)); + std::vector<Hit> backfill_hits; + ICING_ASSIGN_OR_RETURN(std::vector<Hit> tmp, + backfill_accessor.GetNextHitsBatch()); + while (!tmp.empty()) { + std::copy(tmp.begin(), tmp.end(), std::back_inserter(backfill_hits)); + ICING_ASSIGN_OR_RETURN(tmp, backfill_accessor.GetNextHitsBatch()); + } + + Hit last_added_hit; + for (const Hit& hit : backfill_hits) { + // Skip hits from non-prefix-enabled sections. 
+ if (!hit.is_in_prefix_section()) { + continue; + } + + // A backfill hit is a prefix hit in a prefix section. + const Hit backfill_hit(hit.section_id(), hit.document_id(), hit.score(), + /*is_in_prefix_section=*/true, + /*is_prefix_hit=*/true); + if (backfill_hit == last_added_hit) { + // Skip duplicate values due to overriding of the is_prefix flag. + continue; + } + last_added_hit = backfill_hit; + ICING_RETURN_IF_ERROR(hit_accum->PrependHit(backfill_hit)); + } + return libtextclassifier3::Status::OK; +} + +} // namespace lib +} // namespace icing diff --git a/icing/index/main/main-index.h b/icing/index/main/main-index.h new file mode 100644 index 0000000..79378ea --- /dev/null +++ b/icing/index/main/main-index.h @@ -0,0 +1,235 @@ +// Copyright (C) 2019 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#ifndef ICING_INDEX_MAIN_MAIN_INDEX_H_ +#define ICING_INDEX_MAIN_MAIN_INDEX_H_ + +#include <memory> + +#include "icing/text_classifier/lib3/utils/base/status.h" +#include "icing/text_classifier/lib3/utils/base/statusor.h" +#include "icing/file/filesystem.h" +#include "icing/index/lite/term-id-hit-pair.h" +#include "icing/index/main/flash-index-storage.h" +#include "icing/index/main/posting-list-accessor.h" +#include "icing/index/term-id-codec.h" +#include "icing/legacy/index/icing-dynamic-trie.h" +#include "icing/legacy/index/icing-filesystem.h" +#include "icing/util/status-macros.h" + +namespace icing { +namespace lib { + +class MainIndex { + public: + // RETURNS: + // - valid instance of MainIndex, on success. + // - INTERNAL error if unable to create the lexicon or flash storage. + static libtextclassifier3::StatusOr<MainIndex> Create( + const std::string& index_filename, const Filesystem* filesystem, + const IcingFilesystem* icing_filesystem); + + // Get a PostingListAccessor that holds the posting list chain for 'term'. + // + // RETURNS: + // - On success, a valid PostingListAccessor + // - NOT_FOUND if term is not present in the main index. + libtextclassifier3::StatusOr<std::unique_ptr<PostingListAccessor>> + GetAccessorForExactTerm(const std::string& term); + + // Get a PostingListAccessor for 'prefix'. + // + // RETURNS: + // - On success, a result containing a valid PostingListAccessor. + // - NOT_FOUND if neither 'prefix' nor any terms for which 'prefix' is a + // prefix are present in the main index. + struct GetPrefixAccessorResult { + // A PostingListAccessor that holds the posting list chain for the term + // that best represents 'prefix' in the main index. + std::unique_ptr<PostingListAccessor> accessor; + // True if the returned posting list chain is for 'prefix' or false if the + // returned posting list chain is for a term for which 'prefix' is a prefix. 
+ bool exact; + }; + libtextclassifier3::StatusOr<GetPrefixAccessorResult> + GetAccessorForPrefixTerm(const std::string& prefix); + + struct LexiconMergeOutputs { + // Maps from main_lexicon tvi for new branching point to the main_lexicon + // tvi for posting list whose hits must be backfilled. + std::unordered_map<uint32_t, uint32_t> backfill_map; + + // Maps from lexicon tvis to main_lexicon tvis. + std::unordered_map<uint32_t, uint32_t> other_tvi_to_main_tvi; + + // Maps from the lexicon tvi to the beginning position in + // prefix_tvis_buf and the length. + std::unordered_map<uint32_t, std::pair<int, int>> + other_tvi_to_prefix_main_tvis; + + // Stores tvis that are mapped to by other_tvi_to_prefix_tvis. + std::vector<uint32_t> prefix_tvis_buf; + }; + + // Merge the lexicon into the main lexicon and populate the data + // structures necessary to translate lite tvis to main tvis, track backfilling + // and expanding lite terms to prefix terms. + // + // RETURNS: + // - OK on success + // - INTERNAL on IO error while writing to the main lexicon. + libtextclassifier3::StatusOr<LexiconMergeOutputs> MergeLexicon( + const IcingDynamicTrie& other_lexicon) { + // Backfill branch points need to be added first so that the backfill_map + // can be correctly populated. + ICING_ASSIGN_OR_RETURN(LexiconMergeOutputs outputs, + AddBackfillBranchPoints(other_lexicon)); + ICING_ASSIGN_OR_RETURN(outputs, + AddTerms(other_lexicon, std::move(outputs))); + // Non-backfill branch points need to be added last so that the mapping of + // newly added terms to prefix terms can be correctly populated (prefix + // terms might be branch points between two new terms or between a + // pre-existing term and a new term). + ICING_ASSIGN_OR_RETURN(outputs, + AddBranchPoints(other_lexicon, std::move(outputs))); + return outputs; + } + + // Add hits to the main index and backfill from existing posting lists to new + // backfill branch points. 
+  //
+  // The backfill_map maps from main_lexicon tvi for a newly added branching
+  // point to the main_lexicon tvi for the posting list whose hits must be
+  // backfilled. backfill_map should be populated as part of LexiconMergeOutputs
+  // in MergeLexicon and be blindly passed to this function.
+  //
+  // RETURNS:
+  // - OK on success
+  // - INVALID_ARGUMENT if one of the elements in the lite index has a term_id
+  // that exceeds the max TermId, is not valid or is not less than pre-existing
+  // hits in the main index.
+  // - INTERNAL_ERROR if unable to mmap necessary IndexBlocks
+  // - RESOURCE_EXHAUSTED error if unable to grow the index
+  libtextclassifier3::Status AddHits(
+      const TermIdCodec& term_id_codec,
+      std::unordered_map<uint32_t, uint32_t>&& backfill_map,
+      std::vector<TermIdHitPair>&& hits);
+
+ private:
+  libtextclassifier3::Status Init(const std::string& index_filename,
+                                  const Filesystem* filesystem,
+                                  const IcingFilesystem* icing_filesystem);
+
+  // Helpers for merging the lexicon
+  // Add all 'backfill' branch points. Backfill branch points are prefix
+  // branch points that are a prefix of terms that existed in the lexicon
+  // prior to the merge.
+  //
+  // For example, if the main lexicon only contains "foot" and is then merged
+  // with a lite lexicon containing only "fool", then a backfill branch point
+  // for "foo" will be added to contain prefix hits from both the pre-existing
+  // posting list for "foot" and the new posting list for "fool".
+  //
+  // Populates LexiconMergeOutputs.backfill_map
+  //
+  // RETURNS:
+  // - OK on success
+  // - INTERNAL on IO error while writing to the main lexicon.
+  libtextclassifier3::StatusOr<LexiconMergeOutputs> AddBackfillBranchPoints(
+      const IcingDynamicTrie& other_lexicon);
+
+  // Add all terms from the lexicon.
+  //
+  // Populates LexiconMergeOutputs.other_tvi_to_main_tvi
+  //
+  // RETURNS:
+  // - OK on success
+  // - INTERNAL on IO error while writing to the main lexicon. 
+ libtextclassifier3::StatusOr<LexiconMergeOutputs> AddTerms( + const IcingDynamicTrie& other_lexicon, LexiconMergeOutputs&& outputs); + + // Add all branch points for terms added from the lexicon. + // For example, if the main lexicon is empty and is then merged with a + // lexicon containing only "foot" and "fool", then a branch point for "foo" + // will be added to contain prefix hits from both "foot" and "fool". + // + // Populates LexiconMergeOutputs.other_tvi_to_prefix_main_tvis and + // LexiconMergeOutputs.prefix_tvis_buf; + // + // RETURNS: + // - OK on success + // - INTERNAL on IO error while writing to the main lexicon. + libtextclassifier3::StatusOr<LexiconMergeOutputs> AddBranchPoints( + const IcingDynamicTrie& other_lexicon, LexiconMergeOutputs&& outputs); + + // Copies all properties from old_tvi in the other lexicon to the new_tvi in + // the main lexicon. + // Returns true on success, false if an IO error is encountered. + bool CopyProperties(const IcingDynamicTrie::PropertyReadersAll& prop_reader, + const IcingDynamicTrie& other_lexicon, uint32_t other_tvi, + uint32_t new_main_tvi); + + // Add all hits between [hit_elements, hit_elements + len) to main_index, + // updating the entry in the main lexicon at trie_value_index to point to the + // resulting posting list. Hits are sorted in descending document id order, so + // they should be to posting lists in reverse (starting at hit_elements + // + len - 1) and working backwards. Therefore, hit_elements must be in sorted + // order. + // + // trie_value_index may point to a valid posting list id if there is a + // pre-existing posting list to append to. + // + // If backfill_posting_list_id is valid, then the hits from the posting list + // identified by backfill_posting_list_id should be added to the new posting + // list before the hits in hit_elements. 
+ // + // RETURNS: + // - OK on success + // - INVALID_ARGUMENT if posting_list_id stored at trie_value_index is valid + // but points out of bounds in the IndexBlock referred to by + // id.block_index(), if one of the hits from [hit_elements,hit_elements+len) + // is not valid, or if one of the hits from [hit_elements,hit_elements+len) + // is not less than the previously added hits. + // - INTERNAL_ERROR if posting_list_id stored at trie_value_index is valid + // but points to an invalid block index or if unable to mmap the IndexBlock. + // - RESOURCE_EXHAUSTED error if unable to grow the index to allocate a new + // posting list. + libtextclassifier3::Status AddHitsForTerm( + uint32_t tvi, PostingListIdentifier backfill_posting_list_id, + const TermIdHitPair* hit_elements, size_t len); + + // Adds all prefix hits or hits from prefix sections present on the posting + // list identified by backfill_posting_list_id to hit_accum. + // + // RETURNS: + // - OK, on success + // - INVALID_ARGUMENT if backfill_posting_list_id points out of bounds in the + // IndexBlock referred to by id.block_index() + // - INTERNAL_ERROR if unable to mmap the block identified by + // backfill_posting_list_id or if the posting list identified by + // backfill_posting_list_id has been corrupted. + // - RESOURCE_EXHAUSTED error if unable to grow the index to allocate a new + // posting list. 
+ libtextclassifier3::Status AddPrefixBackfillHits( + PostingListIdentifier backfill_posting_list_id, + PostingListAccessor* hit_accum); + + std::unique_ptr<FlashIndexStorage> flash_index_storage_; + std::unique_ptr<IcingDynamicTrie> main_lexicon_; +}; + +} // namespace lib +} // namespace icing + +#endif // ICING_INDEX_MAIN_MAIN_INDEX_H_ diff --git a/icing/index/main/main-index_test.cc b/icing/index/main/main-index_test.cc new file mode 100644 index 0000000..019b588 --- /dev/null +++ b/icing/index/main/main-index_test.cc @@ -0,0 +1,536 @@ +// Copyright (C) 2019 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "icing/index/main/main-index.h" + +#include "gmock/gmock.h" +#include "gtest/gtest.h" +#include "icing/absl_ports/canonical_errors.h" +#include "icing/file/filesystem.h" +#include "icing/index/iterator/doc-hit-info-iterator.h" +#include "icing/index/lite/term-id-hit-pair.h" +#include "icing/index/main/doc-hit-info-iterator-term-main.h" +#include "icing/index/main/main-index-merger.h" +#include "icing/index/main/main-index.h" +#include "icing/index/term-id-codec.h" +#include "icing/index/term-property-id.h" +#include "icing/legacy/index/icing-dynamic-trie.h" +#include "icing/legacy/index/icing-filesystem.h" +#include "icing/legacy/index/icing-mock-filesystem.h" +#include "icing/schema/section.h" +#include "icing/store/namespace-id.h" +#include "icing/testing/common-matchers.h" +#include "icing/testing/tmp-directory.h" + +namespace icing { +namespace lib { + +namespace { + +using ::testing::ElementsAre; +using ::testing::IsEmpty; +using ::testing::NiceMock; +using ::testing::SizeIs; + +std::vector<DocHitInfo> GetHits(std::unique_ptr<DocHitInfoIterator> iterator) { + std::vector<DocHitInfo> infos; + while (iterator->Advance().ok()) { + infos.push_back(iterator->doc_hit_info()); + } + return infos; +} + +std::vector<DocHitInfo> GetExactHits( + MainIndex* main_index, const std::string& term, + SectionIdMask section_mask = kSectionIdMaskAll) { + auto iterator = std::make_unique<DocHitInfoIteratorTermMainExact>( + main_index, term, section_mask); + return GetHits(std::move(iterator)); +} + +std::vector<DocHitInfo> GetPrefixHits( + MainIndex* main_index, const std::string& term, + SectionIdMask section_mask = kSectionIdMaskAll) { + auto iterator = std::make_unique<DocHitInfoIteratorTermMainPrefix>( + main_index, term, section_mask); + return GetHits(std::move(iterator)); +} + +libtextclassifier3::Status Merge(const LiteIndex& lite_index, + const TermIdCodec& term_id_codec, + MainIndex* main_index) { + ICING_ASSIGN_OR_RETURN(MainIndex::LexiconMergeOutputs 
outputs, + main_index->MergeLexicon(lite_index.lexicon())); + ICING_ASSIGN_OR_RETURN(std::vector<TermIdHitPair> elts, + MainIndexMerger::TranslateAndExpandLiteHits( + lite_index, term_id_codec, outputs)); + return main_index->AddHits(term_id_codec, std::move(outputs.backfill_map), + std::move(elts)); +} + +class MainIndexTest : public testing::Test { + protected: + void SetUp() override { + index_dir_ = GetTestTempDir() + "/test_dir"; + ASSERT_TRUE(filesystem_.CreateDirectoryRecursively(index_dir_.c_str())); + + std::string lite_index_file_name = index_dir_ + "/test_file.lite-idx.index"; + LiteIndex::Options options(lite_index_file_name, + /*hit_buffer_want_merge_bytes=*/1024 * 1024); + ICING_ASSERT_OK_AND_ASSIGN(lite_index_, + LiteIndex::Create(options, &icing_filesystem_)); + + ICING_ASSERT_OK_AND_ASSIGN( + term_id_codec_, + TermIdCodec::Create( + IcingDynamicTrie::max_value_index(IcingDynamicTrie::Options()), + IcingDynamicTrie::max_value_index(options.lexicon_options))); + } + + void TearDown() override { + ASSERT_TRUE(filesystem_.DeleteDirectoryRecursively(index_dir_.c_str())); + } + + std::string index_dir_; + Filesystem filesystem_; + IcingFilesystem icing_filesystem_; + std::unique_ptr<LiteIndex> lite_index_; + std::unique_ptr<TermIdCodec> term_id_codec_; +}; + +constexpr NamespaceId kNamespace0 = 0; + +TEST_F(MainIndexTest, MainIndexCreateIOFailure) { + // Create the index with mock filesystem. By default, Mock will return false, + // so the first attempted file operation will fail. + NiceMock<IcingMockFilesystem> mock_filesystem; + std::string main_index_file_name = index_dir_ + "/test_file.idx.index"; + EXPECT_THAT( + MainIndex::Create(main_index_file_name, &filesystem_, &mock_filesystem), + StatusIs(libtextclassifier3::StatusCode::INTERNAL)); +} + +TEST_F(MainIndexTest, MainIndexGetAccessorForPrefixTermNotFound) { + // Create the main index. It should have no entries in its lexicon. 
+ std::string main_index_file_name = index_dir_ + "/test_file.idx.index"; + ICING_ASSERT_OK_AND_ASSIGN( + MainIndex main_index, + MainIndex::Create(main_index_file_name, &filesystem_, + &icing_filesystem_)); + EXPECT_THAT(main_index.GetAccessorForPrefixTerm("foo"), + StatusIs(libtextclassifier3::StatusCode::NOT_FOUND)); +} + +TEST_F(MainIndexTest, MainIndexGetAccessorForPrefixReturnsValidAccessor) { + // 1. Index one doc in the Lite Index: + // - Doc0 {"foot" is_in_prefix_section=true} + ICING_ASSERT_OK_AND_ASSIGN( + uint32_t tvi, + lite_index_->InsertTerm("foot", TermMatchType::PREFIX, kNamespace0)); + ICING_ASSERT_OK_AND_ASSIGN(uint32_t foot_term_id, + term_id_codec_->EncodeTvi(tvi, TviType::LITE)); + + Hit doc0_hit(/*section_id=*/0, /*document_id=*/0, Hit::kMaxHitScore, + /*is_in_prefix_section=*/true); + ICING_ASSERT_OK(lite_index_->AddHit(foot_term_id, doc0_hit)); + + // 2. Create the main index. It should have no entries in its lexicon. + std::string main_index_file_name = index_dir_ + "/test_file.idx.index"; + ICING_ASSERT_OK_AND_ASSIGN( + MainIndex main_index, + MainIndex::Create(main_index_file_name, &filesystem_, + &icing_filesystem_)); + + // 3. Merge the index. The main index should contain "foo". + ICING_ASSERT_OK(Merge(*lite_index_, *term_id_codec_, &main_index)); + // GetAccessorForPrefixTerm should return a valid accessor for "foo". + EXPECT_THAT(main_index.GetAccessorForPrefixTerm("foo"), IsOk()); +} + +TEST_F(MainIndexTest, MainIndexGetAccessorForExactTermNotFound) { + // Create the main index. It should have no entries in its lexicon. + std::string main_index_file_name = index_dir_ + "/test_file.idx.index"; + ICING_ASSERT_OK_AND_ASSIGN( + MainIndex main_index, + MainIndex::Create(main_index_file_name, &filesystem_, + &icing_filesystem_)); + EXPECT_THAT(main_index.GetAccessorForExactTerm("foo"), + StatusIs(libtextclassifier3::StatusCode::NOT_FOUND)); +} + +TEST_F(MainIndexTest, MainIndexGetAccessorForExactReturnsValidAccessor) { + // 1. 
Index one doc in the Lite Index:
+  // - Doc0 {"foo" is_in_prefix_section=false}
+  ICING_ASSERT_OK_AND_ASSIGN(
+      uint32_t tvi,
+      lite_index_->InsertTerm("foo", TermMatchType::EXACT_ONLY, kNamespace0));
+  ICING_ASSERT_OK_AND_ASSIGN(uint32_t foot_term_id,
+                             term_id_codec_->EncodeTvi(tvi, TviType::LITE));
+
+  Hit doc0_hit(/*section_id=*/0, /*document_id=*/0, Hit::kMaxHitScore,
+               /*is_in_prefix_section=*/false);
+  ICING_ASSERT_OK(lite_index_->AddHit(foot_term_id, doc0_hit));
+
+  // 2. Create the main index. It should have no entries in its lexicon.
+  std::string main_index_file_name = index_dir_ + "/test_file.idx.index";
+  ICING_ASSERT_OK_AND_ASSIGN(
+      MainIndex main_index,
+      MainIndex::Create(main_index_file_name, &filesystem_,
+                        &icing_filesystem_));
+
+  // 3. Merge the index. The main index should contain "foo".
+  ICING_ASSERT_OK(Merge(*lite_index_, *term_id_codec_, &main_index));
+  // GetAccessorForExactTerm should return a valid accessor for "foo".
+  EXPECT_THAT(main_index.GetAccessorForExactTerm("foo"), IsOk());
+}
+
+TEST_F(MainIndexTest, MergeIndexToEmpty) {
+  // 1. 
Index three docs in the Lite Index: + // - Doc0 {"foot", "fool", "far" is_in_prefix_section=false} + // - Doc1 {"foot", "fool" is_in_prefix_section=true} + // - Doc2 {"fool", "far" is_in_prefix_section=false} + ICING_ASSERT_OK_AND_ASSIGN( + uint32_t tvi, + lite_index_->InsertTerm("foot", TermMatchType::PREFIX, kNamespace0)); + ICING_ASSERT_OK_AND_ASSIGN(uint32_t foot_term_id, + term_id_codec_->EncodeTvi(tvi, TviType::LITE)); + ICING_ASSERT_OK_AND_ASSIGN( + tvi, lite_index_->InsertTerm("fool", TermMatchType::PREFIX, kNamespace0)); + ICING_ASSERT_OK_AND_ASSIGN(uint32_t fool_term_id, + term_id_codec_->EncodeTvi(tvi, TviType::LITE)); + ICING_ASSERT_OK_AND_ASSIGN( + tvi, + lite_index_->InsertTerm("far", TermMatchType::EXACT_ONLY, kNamespace0)); + ICING_ASSERT_OK_AND_ASSIGN(uint32_t far_term_id, + term_id_codec_->EncodeTvi(tvi, TviType::LITE)); + + Hit doc0_hit(/*section_id=*/0, /*document_id=*/0, Hit::kMaxHitScore, + /*is_in_prefix_section=*/false); + ICING_ASSERT_OK(lite_index_->AddHit(foot_term_id, doc0_hit)); + ICING_ASSERT_OK(lite_index_->AddHit(fool_term_id, doc0_hit)); + ICING_ASSERT_OK(lite_index_->AddHit(far_term_id, doc0_hit)); + + Hit doc1_hit(/*section_id=*/0, /*document_id=*/1, Hit::kMaxHitScore, + /*is_in_prefix_section=*/true); + ICING_ASSERT_OK(lite_index_->AddHit(foot_term_id, doc1_hit)); + ICING_ASSERT_OK(lite_index_->AddHit(fool_term_id, doc1_hit)); + + Hit doc2_hit(/*section_id=*/0, /*document_id=*/2, Hit::kMaxHitScore, + /*is_in_prefix_section=*/false); + ICING_ASSERT_OK(lite_index_->AddHit(fool_term_id, doc2_hit)); + ICING_ASSERT_OK(lite_index_->AddHit(far_term_id, doc2_hit)); + + // 2. Create the main index. It should have no entries in its lexicon. 
+ std::string main_index_file_name = index_dir_ + "/test_file.idx.index"; + ICING_ASSERT_OK_AND_ASSIGN( + MainIndex main_index, + MainIndex::Create(main_index_file_name, &filesystem_, + &icing_filesystem_)); + + std::vector<DocHitInfo> hits = GetExactHits(&main_index, "foot"); + EXPECT_THAT(hits, IsEmpty()); + hits = GetPrefixHits(&main_index, "fo"); + EXPECT_THAT(hits, IsEmpty()); + + // 3. Merge the index. The main index should contain "fool", "foot" + // and "far" as well as a branch points for "foo" and "f". "fa" and "fo" + // should not be present because it is not a branch point. + ICING_ASSERT_OK(Merge(*lite_index_, *term_id_codec_, &main_index)); + // Get hits from an exact posting list. + hits = GetExactHits(&main_index, "foot"); + // We should get hits for "foot" in doc1 and doc0 + EXPECT_THAT( + hits, + ElementsAre( + EqualsDocHitInfo(doc1_hit.document_id(), + std::vector<SectionId>{doc1_hit.section_id()}), + EqualsDocHitInfo(doc0_hit.document_id(), + std::vector<SectionId>{doc0_hit.section_id()}))); + + // Get hits from a branching point posting list. "fo" should redirect to "foo" + hits = GetPrefixHits(&main_index, "fo"); + // We should get hits for "foot" in doc1 and "fool" in doc1. We shouldn't get + // the hits for "foot" in doc0 and "fool" in doc0 and doc2 because they + // weren't hits in prefix sections. + EXPECT_THAT(hits, ElementsAre(EqualsDocHitInfo( + doc1_hit.document_id(), + std::vector<SectionId>{doc1_hit.section_id()}))); +} + +TEST_F(MainIndexTest, MergeIndexToPreexisting) { + // 1. 
Index three docs in the Lite Index: + // - Doc0 {"foot", "fool", "far" is_in_prefix_section=false} + // - Doc1 {"foot", "fool" is_in_prefix_section=true} + // - Doc2 {"fool", "far" is_in_prefix_section=false} + ICING_ASSERT_OK_AND_ASSIGN( + uint32_t tvi, + lite_index_->InsertTerm("foot", TermMatchType::PREFIX, kNamespace0)); + ICING_ASSERT_OK_AND_ASSIGN(uint32_t foot_term_id, + term_id_codec_->EncodeTvi(tvi, TviType::LITE)); + ICING_ASSERT_OK_AND_ASSIGN( + tvi, lite_index_->InsertTerm("fool", TermMatchType::PREFIX, kNamespace0)); + ICING_ASSERT_OK_AND_ASSIGN(uint32_t fool_term_id, + term_id_codec_->EncodeTvi(tvi, TviType::LITE)); + ICING_ASSERT_OK_AND_ASSIGN( + tvi, + lite_index_->InsertTerm("far", TermMatchType::EXACT_ONLY, kNamespace0)); + ICING_ASSERT_OK_AND_ASSIGN(uint32_t far_term_id, + term_id_codec_->EncodeTvi(tvi, TviType::LITE)); + + Hit doc0_hit(/*section_id=*/0, /*document_id=*/0, Hit::kMaxHitScore, + /*is_in_prefix_section=*/false); + ICING_ASSERT_OK(lite_index_->AddHit(foot_term_id, doc0_hit)); + ICING_ASSERT_OK(lite_index_->AddHit(fool_term_id, doc0_hit)); + ICING_ASSERT_OK(lite_index_->AddHit(far_term_id, doc0_hit)); + + Hit doc1_hit(/*section_id=*/0, /*document_id=*/1, Hit::kMaxHitScore, + /*is_in_prefix_section=*/true); + ICING_ASSERT_OK(lite_index_->AddHit(foot_term_id, doc1_hit)); + ICING_ASSERT_OK(lite_index_->AddHit(fool_term_id, doc1_hit)); + + Hit doc2_hit(/*section_id=*/0, /*document_id=*/2, Hit::kMaxHitScore, + /*is_in_prefix_section=*/false); + ICING_ASSERT_OK(lite_index_->AddHit(fool_term_id, doc2_hit)); + ICING_ASSERT_OK(lite_index_->AddHit(far_term_id, doc2_hit)); + + // 2. Create the main index. It should have no entries in its lexicon. + std::string main_index_file_name = index_dir_ + "/test_file.idx.index"; + ICING_ASSERT_OK_AND_ASSIGN( + MainIndex main_index, + MainIndex::Create(main_index_file_name, &filesystem_, + &icing_filesystem_)); + + // 3. Merge the index. 
The main index should contain "fool", "foot" + // and "far" as well as a branch points for "foo" and "f". "fa" and "fo" + // should not be present because it is not a branch point. + ICING_ASSERT_OK(Merge(*lite_index_, *term_id_codec_, &main_index)); + + // 4. Index two docs in a new Lite Index: + // - Doc3 {"foot", "four", "foul", "fall" is_in_prefix_section=false} + // - Doc4 {"four", "foul" is_in_prefix_section=true} + std::string lite_index_file_name2 = index_dir_ + "/test_file.lite-idx.index2"; + LiteIndex::Options options(lite_index_file_name2, + /*hit_buffer_want_merge_bytes=*/1024 * 1024); + ICING_ASSERT_OK_AND_ASSIGN(lite_index_, + LiteIndex::Create(options, &icing_filesystem_)); + ICING_ASSERT_OK_AND_ASSIGN( + tvi, + lite_index_->InsertTerm("foot", TermMatchType::EXACT_ONLY, kNamespace0)); + ICING_ASSERT_OK_AND_ASSIGN( + tvi, lite_index_->InsertTerm("four", TermMatchType::PREFIX, kNamespace0)); + ICING_ASSERT_OK_AND_ASSIGN(uint32_t four_term_id, + term_id_codec_->EncodeTvi(tvi, TviType::LITE)); + ICING_ASSERT_OK_AND_ASSIGN( + tvi, lite_index_->InsertTerm("foul", TermMatchType::PREFIX, kNamespace0)); + ICING_ASSERT_OK_AND_ASSIGN(uint32_t foul_term_id, + term_id_codec_->EncodeTvi(tvi, TviType::LITE)); + ICING_ASSERT_OK_AND_ASSIGN( + tvi, + lite_index_->InsertTerm("fall", TermMatchType::EXACT_ONLY, kNamespace0)); + ICING_ASSERT_OK_AND_ASSIGN(uint32_t fall_term_id, + term_id_codec_->EncodeTvi(tvi, TviType::LITE)); + + Hit doc3_hit(/*section_id=*/0, /*document_id=*/3, Hit::kMaxHitScore, + /*is_in_prefix_section=*/false); + ICING_ASSERT_OK(lite_index_->AddHit(foot_term_id, doc3_hit)); + ICING_ASSERT_OK(lite_index_->AddHit(four_term_id, doc3_hit)); + ICING_ASSERT_OK(lite_index_->AddHit(foul_term_id, doc3_hit)); + ICING_ASSERT_OK(lite_index_->AddHit(fall_term_id, doc3_hit)); + + Hit doc4_hit(/*section_id=*/0, /*document_id=*/4, Hit::kMaxHitScore, + /*is_in_prefix_section=*/true); + ICING_ASSERT_OK(lite_index_->AddHit(four_term_id, doc4_hit)); + 
ICING_ASSERT_OK(lite_index_->AddHit(foul_term_id, doc4_hit));
+
+  // 5. Merge the index. The main index should now contain "foul", "four"
+  // and "fall", a branch point for "fou" and backfill points for "fo".
+  ICING_ASSERT_OK(Merge(*lite_index_, *term_id_codec_, &main_index));
+  // Get hits from an exact posting list that existed before the merge.
+  std::vector<DocHitInfo> hits = GetExactHits(&main_index, "foot");
+
+  // We should get hits for "foot" in doc3, doc1 and doc0
+  EXPECT_THAT(
+      hits,
+      ElementsAre(
+          EqualsDocHitInfo(doc3_hit.document_id(),
+                           std::vector<SectionId>{doc3_hit.section_id()}),
+          EqualsDocHitInfo(doc1_hit.document_id(),
+                           std::vector<SectionId>{doc1_hit.section_id()}),
+          EqualsDocHitInfo(doc0_hit.document_id(),
+                           std::vector<SectionId>{doc0_hit.section_id()})));
+  // Get hits from backfill posting list.
+  hits = GetPrefixHits(&main_index, "fo");
+  // We should get hits for "four" and "foul" in doc4 and hits for "foot" and
+  // "fool" in doc1. We shouldn't get the hits for "foot" in doc0 and doc3,
+  // "fool" in doc0 and doc2 or the hits for "four" and "foul" in doc3 because
+  // they weren't hits in prefix sections.
+  EXPECT_THAT(
+      hits,
+      ElementsAre(
+          EqualsDocHitInfo(doc4_hit.document_id(),
+                           std::vector<SectionId>{doc4_hit.section_id()}),
+          EqualsDocHitInfo(doc1_hit.document_id(),
+                           std::vector<SectionId>{doc1_hit.section_id()})));
+}
+
+TEST_F(MainIndexTest, ExactRetrievedInPrefixSearch) {
+  // 1. 
Index two docs in the Lite Index: + // - Doc0 {"foot" is_in_prefix_section=true} + // - Doc1 {"foo" is_in_prefix_section=false} + // - Doc2 {"foot" is_in_prefix_section=false} + ICING_ASSERT_OK_AND_ASSIGN( + uint32_t tvi, + lite_index_->InsertTerm("foot", TermMatchType::PREFIX, kNamespace0)); + ICING_ASSERT_OK_AND_ASSIGN(uint32_t foot_term_id, + term_id_codec_->EncodeTvi(tvi, TviType::LITE)); + ICING_ASSERT_OK_AND_ASSIGN( + tvi, + lite_index_->InsertTerm("foo", TermMatchType::EXACT_ONLY, kNamespace0)); + ICING_ASSERT_OK_AND_ASSIGN(uint32_t foo_term_id, + term_id_codec_->EncodeTvi(tvi, TviType::LITE)); + + Hit doc0_hit(/*section_id=*/0, /*document_id=*/0, Hit::kMaxHitScore, + /*is_in_prefix_section=*/true); + ICING_ASSERT_OK(lite_index_->AddHit(foot_term_id, doc0_hit)); + + Hit doc1_hit(/*section_id=*/0, /*document_id=*/1, Hit::kMaxHitScore, + /*is_in_prefix_section=*/false); + ICING_ASSERT_OK(lite_index_->AddHit(foo_term_id, doc1_hit)); + + Hit doc2_hit(/*section_id=*/0, /*document_id=*/2, Hit::kMaxHitScore, + /*is_in_prefix_section=*/false); + ICING_ASSERT_OK(lite_index_->AddHit(foot_term_id, doc2_hit)); + + // 2. Create the main index. It should have no entries in its lexicon. + std::string main_index_file_name = index_dir_ + "/test_file.idx.index"; + ICING_ASSERT_OK_AND_ASSIGN( + MainIndex main_index, + MainIndex::Create(main_index_file_name, &filesystem_, + &icing_filesystem_)); + + // 3. Merge the lite lexicon. The main lexicon should contain "foot" and + // "foo". + ICING_ASSERT_OK(Merge(*lite_index_, *term_id_codec_, &main_index)); + std::vector<DocHitInfo> hits = GetPrefixHits(&main_index, "foo"); + // We should get hits for "foo" in doc1 and doc0, but not in doc2 because it + // is not a prefix hit. 
+  EXPECT_THAT(
+      hits,
+      ElementsAre(
+          EqualsDocHitInfo(doc1_hit.document_id(),
+                           std::vector<SectionId>{doc1_hit.section_id()}),
+          EqualsDocHitInfo(doc0_hit.document_id(),
+                           std::vector<SectionId>{doc0_hit.section_id()})));
+}
+
+TEST_F(MainIndexTest, PrefixNotRetrievedInExactSearch) {
+  // 1. Index three docs in the Lite Index:
+  // - Doc0 {"foot" is_in_prefix_section=true}
+  // - Doc1 {"foo" is_in_prefix_section=false}
+  // - Doc2 {"foo" is_in_prefix_section=true}
+  ICING_ASSERT_OK_AND_ASSIGN(
+      uint32_t tvi,
+      lite_index_->InsertTerm("foot", TermMatchType::PREFIX, kNamespace0));
+  ICING_ASSERT_OK_AND_ASSIGN(uint32_t foot_term_id,
+                             term_id_codec_->EncodeTvi(tvi, TviType::LITE));
+  ICING_ASSERT_OK_AND_ASSIGN(
+      tvi, lite_index_->InsertTerm("foo", TermMatchType::PREFIX, kNamespace0));
+  ICING_ASSERT_OK_AND_ASSIGN(uint32_t foo_term_id,
+                             term_id_codec_->EncodeTvi(tvi, TviType::LITE));
+
+  Hit doc0_hit(/*section_id=*/0, /*document_id=*/0, Hit::kMaxHitScore,
+               /*is_in_prefix_section=*/true);
+  ICING_ASSERT_OK(lite_index_->AddHit(foot_term_id, doc0_hit));
+
+  Hit doc1_hit(/*section_id=*/0, /*document_id=*/1, Hit::kMaxHitScore,
+               /*is_in_prefix_section=*/false);
+  ICING_ASSERT_OK(lite_index_->AddHit(foo_term_id, doc1_hit));
+
+  Hit doc2_hit(/*section_id=*/0, /*document_id=*/2, Hit::kMaxHitScore,
+               /*is_in_prefix_section=*/true);
+  ICING_ASSERT_OK(lite_index_->AddHit(foo_term_id, doc2_hit));
+
+  // 2. Create the main index. It should have no entries in its lexicon.
+  std::string main_index_file_name = index_dir_ + "/test_file.idx.index";
+  ICING_ASSERT_OK_AND_ASSIGN(
+      MainIndex main_index,
+      MainIndex::Create(main_index_file_name, &filesystem_,
+                        &icing_filesystem_));
+
+  // 3. Merge the lite lexicon. The main lexicon should contain "foot" and
+  // "foo". 
+ ICING_ASSERT_OK(Merge(*lite_index_, *term_id_codec_, &main_index)); + std::vector<DocHitInfo> hits = GetExactHits(&main_index, "foo"); + + // We should get hits for "foo" in doc2 and doc1, but not in doc0 because it + // is not an exact hit. + EXPECT_THAT( + hits, + ElementsAre( + EqualsDocHitInfo(doc2_hit.document_id(), + std::vector<SectionId>{doc2_hit.section_id()}), + EqualsDocHitInfo(doc1_hit.document_id(), + std::vector<SectionId>{doc1_hit.section_id()}))); +} + +TEST_F(MainIndexTest, SearchChainedPostingLists) { + // Index 2048 document with 3 hits in each document. When merged into the main + // index, this will 1) lead to a chained posting list and 2) split at least + // one document's hits across multiple posting lists. + ICING_ASSERT_OK_AND_ASSIGN( + uint32_t tvi, + lite_index_->InsertTerm("foot", TermMatchType::EXACT_ONLY, kNamespace0)); + ICING_ASSERT_OK_AND_ASSIGN(uint32_t foot_term_id, + term_id_codec_->EncodeTvi(tvi, TviType::LITE)); + + for (DocumentId document_id = 0; document_id < 2048; ++document_id) { + Hit doc_hit0(/*section_id=*/0, /*document_id=*/document_id, + Hit::kMaxHitScore, + /*is_in_prefix_section=*/false); + ICING_ASSERT_OK(lite_index_->AddHit(foot_term_id, doc_hit0)); + + Hit doc_hit1(/*section_id=*/1, /*document_id=*/document_id, + Hit::kMaxHitScore, + /*is_in_prefix_section=*/false); + ICING_ASSERT_OK(lite_index_->AddHit(foot_term_id, doc_hit1)); + + Hit doc_hit2(/*section_id=*/2, /*document_id=*/document_id, + Hit::kMaxHitScore, + /*is_in_prefix_section=*/false); + ICING_ASSERT_OK(lite_index_->AddHit(foot_term_id, doc_hit2)); + } + + // 2. Create the main index. It should have no entries in its lexicon. + std::string main_index_file_name = index_dir_ + "/test_file.idx.index"; + ICING_ASSERT_OK_AND_ASSIGN( + MainIndex main_index, + MainIndex::Create(main_index_file_name, &filesystem_, + &icing_filesystem_)); + + // 3. Merge the lite index. 
+ ICING_ASSERT_OK(Merge(*lite_index_, *term_id_codec_, &main_index)); + // Get hits for all documents containing "foot" - which should be all of them. + std::vector<DocHitInfo> hits = GetExactHits(&main_index, "foot"); + + EXPECT_THAT(hits, SizeIs(2048)); + EXPECT_THAT(hits.front(), + EqualsDocHitInfo(2047, std::vector<SectionId>{0, 1, 2})); + EXPECT_THAT(hits.back(), + EqualsDocHitInfo(0, std::vector<SectionId>{0, 1, 2})); +} + +} // namespace + +} // namespace lib +} // namespace icing diff --git a/icing/index/main/posting-list-accessor.cc b/icing/index/main/posting-list-accessor.cc new file mode 100644 index 0000000..a4f8ca7 --- /dev/null +++ b/icing/index/main/posting-list-accessor.cc @@ -0,0 +1,194 @@ +// Copyright (C) 2019 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "icing/index/main/posting-list-accessor.h" + +#include <memory> + +#include "icing/absl_ports/canonical_errors.h" +#include "icing/index/main/flash-index-storage.h" +#include "icing/index/main/index-block.h" +#include "icing/index/main/posting-list-identifier.h" +#include "icing/index/main/posting-list-used.h" +#include "icing/util/status-macros.h" + +namespace icing { +namespace lib { + +libtextclassifier3::StatusOr<PostingListAccessor> PostingListAccessor::Create( + FlashIndexStorage *storage) { + uint32_t max_posting_list_bytes = + IndexBlock::CalculateMaxPostingListBytes(storage->block_size()); + std::unique_ptr<uint8_t[]> posting_list_buffer_array = + std::make_unique<uint8_t[]>(max_posting_list_bytes); + ICING_ASSIGN_OR_RETURN( + PostingListUsed posting_list_buffer, + PostingListUsed::CreateFromUnitializedRegion( + posting_list_buffer_array.get(), max_posting_list_bytes)); + return PostingListAccessor(storage, std::move(posting_list_buffer_array), + std::move(posting_list_buffer)); +} + +libtextclassifier3::StatusOr<PostingListAccessor> +PostingListAccessor::CreateFromExisting( + FlashIndexStorage *storage, + PostingListIdentifier existing_posting_list_id) { + // Our posting_list_buffer_ will start as empty. + ICING_ASSIGN_OR_RETURN(PostingListAccessor pl_accessor, Create(storage)); + ICING_ASSIGN_OR_RETURN(PostingListHolder holder, + storage->GetPostingList(existing_posting_list_id)); + pl_accessor.preexisting_posting_list_ = + std::make_unique<PostingListHolder>(std::move(holder)); + return pl_accessor; +} + +// Returns the next batch of hits for the provided posting list. 
+libtextclassifier3::StatusOr<std::vector<Hit>>
+PostingListAccessor::GetNextHitsBatch() {
+  if (preexisting_posting_list_ == nullptr) {
+    if (has_reached_posting_list_chain_end_) {
+      return std::vector<Hit>();
+    }
+    return absl_ports::FailedPreconditionError(
+        "Cannot retrieve hits from a PostingListAccessor that was not created "
+        "from a preexisting posting list.");
+  }
+  ICING_ASSIGN_OR_RETURN(std::vector<Hit> batch,
+                         preexisting_posting_list_->posting_list.GetHits());
+  uint32_t block_index = preexisting_posting_list_->block.next_block_index();
+  if (block_index != kInvalidBlockIndex) {
+    PostingListIdentifier next_posting_list_id(
+        block_index, /*posting_list_index=*/0,
+        preexisting_posting_list_->block.posting_list_index_bits());
+    ICING_ASSIGN_OR_RETURN(PostingListHolder holder,
+                           storage_->GetPostingList(next_posting_list_id));
+    preexisting_posting_list_ =
+        std::make_unique<PostingListHolder>(std::move(holder));
+  } else {
+    has_reached_posting_list_chain_end_ = true;
+    preexisting_posting_list_.reset();
+  }
+  return batch;
+}
+
+libtextclassifier3::Status PostingListAccessor::PrependHit(const Hit &hit) {
+  PostingListUsed &active_pl = (preexisting_posting_list_ != nullptr)
+                                   ? preexisting_posting_list_->posting_list
+                                   : posting_list_buffer_;
+  libtextclassifier3::Status status = active_pl.PrependHit(hit);
+  if (!absl_ports::IsResourceExhausted(status)) {
+    return status;
+  }
+  // There is no more room to add hits to this current posting list! Therefore,
+  // we need to either move those hits to a larger posting list or flush this
+  // posting list and create another max-sized posting list in the chain.
+  if (preexisting_posting_list_ != nullptr) {
+    FlushPreexistingPostingList();
+  } else {
+    ICING_RETURN_IF_ERROR(FlushInMemoryPostingList());
+  }
+
+  // Re-add hit. Should always fit since we just cleared posting_list_buffer_. 
+ // It's fine to explicitly reference posting_list_buffer_ here because there's + // no way of reaching this line while preexisting_posting_list_ is still in + // use. + return posting_list_buffer_.PrependHit(hit); +} + +void PostingListAccessor::FlushPreexistingPostingList() { + if (preexisting_posting_list_->block.max_num_posting_lists() == 1) { + // If this is a max-sized posting list, then just keep track of the id for + // chaining. It'll be flushed to disk when preexisting_posting_list_ is + // destructed. + prev_block_identifier_ = preexisting_posting_list_->id; + } else { + // If this is NOT a max-sized posting list, then our hits have outgrown this + // particular posting list. Move the hits into the in-memory posting list + // and free this posting list. + // + // Move will always succeed since posting_list_buffer_ is max_pl_bytes. + posting_list_buffer_.MoveFrom(&preexisting_posting_list_->posting_list); + + // Now that all the contents of this posting list have been copied, there's + // no more use for it. Make it available to be used for another posting + // list. + storage_->FreePostingList(std::move(*preexisting_posting_list_)); + } + preexisting_posting_list_.reset(); +} + +libtextclassifier3::Status PostingListAccessor::FlushInMemoryPostingList() { + // We exceeded max_pl_bytes(). Need to flush posting_list_buffer_ and update + // the chain. + uint32_t max_posting_list_bytes = + IndexBlock::CalculateMaxPostingListBytes(storage_->block_size()); + ICING_ASSIGN_OR_RETURN(PostingListHolder holder, + storage_->AllocatePostingList(max_posting_list_bytes)); + holder.block.set_next_block_index(prev_block_identifier_.block_index()); + prev_block_identifier_ = holder.id; + return holder.posting_list.MoveFrom(&posting_list_buffer_); +} + +PostingListAccessor::FinalizeResult PostingListAccessor::Finalize( + PostingListAccessor accessor) { + if (accessor.preexisting_posting_list_ != nullptr) { + // Our hits are already in an existing posting list. 
Nothing else to do, but
+    // return its id.
+    FinalizeResult result = {libtextclassifier3::Status::OK,
+                             accessor.preexisting_posting_list_->id};
+    return result;
+  }
+  if (accessor.posting_list_buffer_.BytesUsed() <= 0) {
+    FinalizeResult result = {absl_ports::InvalidArgumentError(
+                                 "Can't finalize an empty PostingListAccessor. "
+                                 "There's nothing to Finalize!"),
+                             PostingListIdentifier::kInvalid};
+    return result;
+  }
+  uint32_t posting_list_bytes =
+      accessor.posting_list_buffer_.MinPostingListSizeToFit();
+  if (accessor.prev_block_identifier_.is_valid()) {
+    posting_list_bytes = IndexBlock::CalculateMaxPostingListBytes(
+        accessor.storage_->block_size());
+  }
+  auto holder_or = accessor.storage_->AllocatePostingList(posting_list_bytes);
+  if (!holder_or.ok()) {
+    FinalizeResult result = {holder_or.status(),
+                             accessor.prev_block_identifier_};
+    return result;
+  }
+  PostingListHolder holder = std::move(holder_or).ValueOrDie();
+  if (accessor.prev_block_identifier_.is_valid()) {
+    holder.block.set_next_block_index(
+        accessor.prev_block_identifier_.block_index());
+  }
+
+  // Move to allocated area. This should never actually return an error. We know
+  // that holder.posting_list is valid because it wouldn't have successfully
+  // returned by AllocatePostingList if it wasn't. We know posting_list_buffer_
+  // is valid because we created it in-memory. And finally, we know that the
+  // hits from posting_list_buffer_ will fit in holder.posting_list because we
+  // requested it be at least posting_list_bytes large. 
+ auto status = holder.posting_list.MoveFrom(&accessor.posting_list_buffer_); + if (!status.ok()) { + FinalizeResult result = {std::move(status), + accessor.prev_block_identifier_}; + return result; + } + FinalizeResult result = {libtextclassifier3::Status::OK, holder.id}; + return result; +} + +} // namespace lib +} // namespace icing diff --git a/icing/index/main/posting-list-accessor.h b/icing/index/main/posting-list-accessor.h new file mode 100644 index 0000000..e1bb3c0 --- /dev/null +++ b/icing/index/main/posting-list-accessor.h @@ -0,0 +1,168 @@ +// Copyright (C) 2019 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef ICING_INDEX_POSTING_LIST_ACCESSOR_H_ +#define ICING_INDEX_POSTING_LIST_ACCESSOR_H_ + +#include <memory> + +#include "icing/text_classifier/lib3/utils/base/status.h" +#include "icing/text_classifier/lib3/utils/base/statusor.h" +#include "icing/index/hit/hit.h" +#include "icing/index/main/flash-index-storage.h" +#include "icing/index/main/posting-list-identifier.h" +#include "icing/index/main/posting-list-used.h" + +namespace icing { +namespace lib { + +// This class serves to: +// 1. Expose PostingListUseds to clients of FlashIndexStorage +// 2. Ensure the corresponding instance of IndexBlock has the same lifecycle as +// the instance of PostingListUsed that the client has access to, while +// not exposing IndexBlock's api surface. +// 3. 
Ensure that PostingListUseds can only be freed by calling methods which +// will also properly maintain the FlashIndexStorage free list and prevent +// callers from modifying the Posting List after freeing. + +// This class is used to provide a simple abstraction for adding hits to posting +// lists. PostingListAccessor handles 1) selection of properly-sized posting +// lists for the accumulated hits during Finalize() and 2) chaining of max-sized +// posting lists. +class PostingListAccessor { + public: + // Creates an empty PostingListAccessor. + // + // RETURNS: + // - On success, a valid instance of PostingListAccessor + // - INVALID_ARGUMENT error if storage has an invalid block_size. + static libtextclassifier3::StatusOr<PostingListAccessor> Create( + FlashIndexStorage* storage); + + // Create a PostingListAccessor with an existing posting list identified by + // existing_posting_list_id. + // + // The PostingListAccessor will add hits to this posting list until it is + // necessary either to 1) chain the posting list (if it is max-sized) or 2) + // move its hits to a larger posting list. + // + // RETURNS: + // - On success, a valid instance of PostingListAccessor + // - INVALID_ARGUMENT if storage has an invalid block_size. + static libtextclassifier3::StatusOr<PostingListAccessor> CreateFromExisting( + FlashIndexStorage* storage, + PostingListIdentifier existing_posting_list_id); + + // Retrieve the next batch of hits for the posting list chain + // + // RETURNS: + // - On success, a vector of hits in the posting list chain + // - INTERNAL if called on an instance of PostingListAccessor that was + // created via PostingListAccessor::Create, if unable to read the next + // posting list in the chain or if the posting list has been corrupted + // somehow. + libtextclassifier3::StatusOr<std::vector<Hit>> GetNextHitsBatch(); + + // Prepend one hit. 
This may result in flushing the posting list to disk (if + // the PostingListAccessor holds a max-sized posting list that is full) or + // freeing a pre-existing posting list if it is too small to fit all hits + // necessary. + // + // RETURNS: + // - OK, on success + // - INVALID_ARGUMENT if !hit.is_valid() or if hit is not less than the + // previously added hit. + // - RESOURCE_EXHAUSTED error if unable to grow the index to allocate a new + // posting list. + libtextclassifier3::Status PrependHit(const Hit& hit); + + struct FinalizeResult { + // - OK on success + // - INVALID_ARGUMENT if there was no pre-existing posting list and no + // hits were added + // - RESOURCE_EXHAUSTED error if unable to grow the index to allocate a + // new posting list. + libtextclassifier3::Status status; + // Id of the posting list chain that was finalized. Guaranteed to be valid + // if status is OK. May be valid if status is non-OK, but previous blocks + // were written. + PostingListIdentifier id; + }; + // Write all accumulated hits to storage. + // + // If accessor points to a posting list chain with multiple posting lists in + // the chain and unable to write the last posting list in the chain, Finalize + // will return the error and also populate id with the id of the + // second-to-last posting list. + static FinalizeResult Finalize(PostingListAccessor accessor); + + private: + explicit PostingListAccessor( + FlashIndexStorage* storage, + std::unique_ptr<uint8_t[]> posting_list_buffer_array, + PostingListUsed posting_list_buffer) + : storage_(storage), + prev_block_identifier_(PostingListIdentifier::kInvalid), + posting_list_buffer_array_(std::move(posting_list_buffer_array)), + posting_list_buffer_(std::move(posting_list_buffer)), + has_reached_posting_list_chain_end_(false) {} + + // Flushes preexisting_posting_list_ to disk if it's a max-sized posting list + // and populates prev_block_identifier. 
+ // If it's not a max-sized posting list, moves the contents of + // preexisting_posting_list_ to posting_list_buffer_ and frees + // preexisting_posting_list_. + // Sets preexisting_posting_list_ to nullptr. + void FlushPreexistingPostingList(); + + // Flushes posting_list_buffer_ to a max-sized posting list on disk, setting + // its next pointer to prev_block_identifier_ and updating + // prev_block_identifier_ to point to the just-written posting list. + libtextclassifier3::Status FlushInMemoryPostingList(); + + // Frees all posting lists in the posting list chain starting at + // prev_block_identifier_. + libtextclassifier3::Status FreePostingListChain(); + + FlashIndexStorage* storage_; // Does not own. + + // The PostingListIdentifier of the first max-sized posting list in the + // posting list chain or PostingListIdentifier::kInvalid if there is no + // posting list chain. + PostingListIdentifier prev_block_identifier_; + + // An editor to an existing posting list on disk. If available (non-NULL), + // we'll try to add all hits to this posting list. Once this posting list + // fills up, we'll either 1) chain it (if a max-sized posting list) and put + // future hits in posting_list_buffer_ or 2) copy all of its hits into + // posting_list_buffer_ and free this pl (if not a max-sized posting list). + // TODO(tjbarron) provide a benchmark to demonstrate the effects that re-using + // existing posting lists has on latency. + std::unique_ptr<PostingListHolder> preexisting_posting_list_; + + // In-memory posting list used to buffer hits before writing them to the + // smallest on-disk posting list that will fit them. + // posting_list_buffer_array_ owns the memory region that posting_list_buffer_ + // interprets. Therefore, posting_list_buffer_array_ must have the same + // lifecycle as posting_list_buffer_. 
+ std::unique_ptr<uint8_t[]> posting_list_buffer_array_; + PostingListUsed posting_list_buffer_; + + bool has_reached_posting_list_chain_end_; +}; + +} // namespace lib +} // namespace icing + +#endif // ICING_INDEX_POSTING_LIST_ACCESSOR_H_ diff --git a/icing/index/main/posting-list-accessor_test.cc b/icing/index/main/posting-list-accessor_test.cc new file mode 100644 index 0000000..8a5ef07 --- /dev/null +++ b/icing/index/main/posting-list-accessor_test.cc @@ -0,0 +1,384 @@ +// Copyright (C) 2019 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "icing/index/main/posting-list-accessor.h" + +#include <cstdint> + +#include "gmock/gmock.h" +#include "gtest/gtest.h" +#include "icing/file/filesystem.h" +#include "icing/index/hit/hit.h" +#include "icing/index/main/flash-index-storage.h" +#include "icing/index/main/index-block.h" +#include "icing/index/main/posting-list-identifier.h" +#include "icing/index/main/posting-list-used.h" +#include "icing/testing/common-matchers.h" +#include "icing/testing/hit-test-utils.h" +#include "icing/testing/tmp-directory.h" + +namespace icing { +namespace lib { + +namespace { + +using ::testing::ElementsAre; +using ::testing::ElementsAreArray; +using ::testing::Eq; +using ::testing::Lt; +using ::testing::SizeIs; + +TEST(PostingListAccessorStorageTest, HitsAddAndRetrieveProperly) { + std::string test_dir = GetTestTempDir() + "/test_dir"; + std::string file_name = test_dir + "/test_file.idx.index"; + Filesystem filesystem; + ASSERT_TRUE(filesystem.DeleteDirectoryRecursively(test_dir.c_str())); + ASSERT_TRUE(filesystem.CreateDirectoryRecursively(test_dir.c_str())); + + ICING_ASSERT_OK_AND_ASSIGN(FlashIndexStorage flash_index_storage, + FlashIndexStorage::Create(file_name, &filesystem)); + // Add some hits! Any hits! + std::vector<Hit> hits1 = + CreateHits(/*num_hits=*/5, /*desired_byte_length=*/1); + ICING_ASSERT_OK_AND_ASSIGN(PostingListAccessor pl_accessor, + PostingListAccessor::Create(&flash_index_storage)); + for (const Hit& hit : hits1) { + ICING_ASSERT_OK(pl_accessor.PrependHit(hit)); + } + PostingListAccessor::FinalizeResult result = + PostingListAccessor::Finalize(std::move(pl_accessor)); + ICING_EXPECT_OK(result.status); + EXPECT_THAT(result.id.block_index(), Eq(1)); + EXPECT_THAT(result.id.posting_list_index(), Eq(0)); + + // Retrieve some hits. 
+ ICING_ASSERT_OK_AND_ASSIGN(PostingListHolder pl_holder, + flash_index_storage.GetPostingList(result.id)); + EXPECT_THAT(pl_holder.posting_list.GetHits(), + IsOkAndHolds(ElementsAreArray(hits1.rbegin(), hits1.rend()))); + EXPECT_THAT(pl_holder.block.next_block_index(), Eq(kInvalidBlockIndex)); +} + +TEST(PostingListAccessorStorageTest, PreexistingPLKeepOnSameBlock) { + std::string test_dir = GetTestTempDir() + "/test_dir"; + std::string file_name = test_dir + "/test_file.idx.index"; + Filesystem filesystem; + ASSERT_TRUE(filesystem.DeleteDirectoryRecursively(test_dir.c_str())); + ASSERT_TRUE(filesystem.CreateDirectoryRecursively(test_dir.c_str())); + + ICING_ASSERT_OK_AND_ASSIGN(FlashIndexStorage flash_index_storage, + FlashIndexStorage::Create(file_name, &filesystem)); + ICING_ASSERT_OK_AND_ASSIGN(PostingListAccessor pl_accessor, + PostingListAccessor::Create(&flash_index_storage)); + // Add a single hit. This will fit in a min-sized posting list. + Hit hit1(/*section_id=*/1, /*document_id=*/0, Hit::kMaxHitScore); + ICING_ASSERT_OK(pl_accessor.PrependHit(hit1)); + PostingListAccessor::FinalizeResult result1 = + PostingListAccessor::Finalize(std::move(pl_accessor)); + ICING_EXPECT_OK(result1.status); + // Should have been allocated to the first block. + EXPECT_THAT(result1.id.block_index(), Eq(1)); + EXPECT_THAT(result1.id.posting_list_index(), Eq(0)); + + // Add one more hit. The minimum size for a posting list must be able to fit + // at least two hits, so this should NOT cause the previous pl to be + // reallocated. + ICING_ASSERT_OK_AND_ASSIGN( + pl_accessor, PostingListAccessor::CreateFromExisting(&flash_index_storage, + result1.id)); + Hit hit2 = CreateHit(hit1, /*desired_byte_length=*/1); + ICING_ASSERT_OK(pl_accessor.PrependHit(hit2)); + PostingListAccessor::FinalizeResult result2 = + PostingListAccessor::Finalize(std::move(pl_accessor)); + ICING_EXPECT_OK(result2.status); + // Should have been allocated to the same posting list as the first hit. 
+ EXPECT_THAT(result2.id, Eq(result1.id)); + + // The posting list at result2.id should hold all of the hits that have been + // added. + ICING_ASSERT_OK_AND_ASSIGN(PostingListHolder pl_holder, + flash_index_storage.GetPostingList(result2.id)); + EXPECT_THAT(pl_holder.posting_list.GetHits(), + IsOkAndHolds(ElementsAre(hit2, hit1))); +} + +TEST(PostingListAccessorStorageTest, PreexistingPLReallocateToLargerPL) { + std::string test_dir = GetTestTempDir() + "/test_dir"; + std::string file_name = test_dir + "/test_file.idx.index"; + Filesystem filesystem; + ASSERT_TRUE(filesystem.DeleteDirectoryRecursively(test_dir.c_str())); + ASSERT_TRUE(filesystem.CreateDirectoryRecursively(test_dir.c_str())); + + ICING_ASSERT_OK_AND_ASSIGN(FlashIndexStorage flash_index_storage, + FlashIndexStorage::Create(file_name, &filesystem)); + ICING_ASSERT_OK_AND_ASSIGN(PostingListAccessor pl_accessor, + PostingListAccessor::Create(&flash_index_storage)); + // The smallest posting list size is 15 bytes. The first four hits will be + // compressed to one byte each and will be able to fit in the 5 byte padded + // region. The last hit will fit in one of the special hits. The posting list + // will be ALMOST_FULL and can fit at most 2 more hits. + std::vector<Hit> hits1 = + CreateHits(/*num_hits=*/5, /*desired_byte_length=*/1); + for (const Hit& hit : hits1) { + ICING_ASSERT_OK(pl_accessor.PrependHit(hit)); + } + PostingListAccessor::FinalizeResult result1 = + PostingListAccessor::Finalize(std::move(pl_accessor)); + ICING_EXPECT_OK(result1.status); + // Should have been allocated to the first block. + EXPECT_THAT(result1.id.block_index(), Eq(1)); + EXPECT_THAT(result1.id.posting_list_index(), Eq(0)); + + // Now let's add some more hits! + ICING_ASSERT_OK_AND_ASSIGN( + pl_accessor, PostingListAccessor::CreateFromExisting(&flash_index_storage, + result1.id)); + // The current posting list can fit at most 2 more hits. 
Adding 12 more hits + // should result in these hits being moved to a larger posting list. + std::vector<Hit> hits2 = CreateHits( + /*start_docid=*/hits1.back().document_id() + 1, /*num_hits=*/12, + /*desired_byte_length=*/1); + + for (const Hit& hit : hits2) { + ICING_ASSERT_OK(pl_accessor.PrependHit(hit)); + } + PostingListAccessor::FinalizeResult result2 = + PostingListAccessor::Finalize(std::move(pl_accessor)); + ICING_EXPECT_OK(result2.status); + // Should have been allocated to the second (new) block because the posting + // list should have grown beyond the size that the first block maintains. + EXPECT_THAT(result2.id.block_index(), Eq(2)); + EXPECT_THAT(result2.id.posting_list_index(), Eq(0)); + + // The posting list at result2.id should hold all of the hits that have been + // added. + for (const Hit& hit : hits2) { + hits1.push_back(hit); + } + ICING_ASSERT_OK_AND_ASSIGN(PostingListHolder pl_holder, + flash_index_storage.GetPostingList(result2.id)); + EXPECT_THAT(pl_holder.posting_list.GetHits(), + IsOkAndHolds(ElementsAreArray(hits1.rbegin(), hits1.rend()))); +} + +TEST(PostingListAccessorStorageTest, MultiBlockChainsBlocksProperly) { + std::string test_dir = GetTestTempDir() + "/test_dir"; + std::string file_name = test_dir + "/test_file.idx.index"; + Filesystem filesystem; + ASSERT_TRUE(filesystem.DeleteDirectoryRecursively(test_dir.c_str())); + ASSERT_TRUE(filesystem.CreateDirectoryRecursively(test_dir.c_str())); + + ICING_ASSERT_OK_AND_ASSIGN(FlashIndexStorage flash_index_storage, + FlashIndexStorage::Create(file_name, &filesystem)); + ICING_ASSERT_OK_AND_ASSIGN(PostingListAccessor pl_accessor, + PostingListAccessor::Create(&flash_index_storage)); + // Add some hits! Any hits! 
+ std::vector<Hit> hits1 = + CreateHits(/*num_hits=*/5000, /*desired_byte_length=*/1); + for (const Hit& hit : hits1) { + ICING_ASSERT_OK(pl_accessor.PrependHit(hit)); + } + PostingListAccessor::FinalizeResult result1 = + PostingListAccessor::Finalize(std::move(pl_accessor)); + ICING_EXPECT_OK(result1.status); + PostingListIdentifier second_block_id = result1.id; + // Should have been allocated to the second block, which holds a max-sized + // posting list. + EXPECT_THAT(second_block_id, Eq(PostingListIdentifier( + /*block_index=*/2, /*posting_list_index=*/0, + /*posting_list_index_bits=*/0))); + + // Now let's retrieve them! + ICING_ASSERT_OK_AND_ASSIGN( + PostingListHolder pl_holder, + flash_index_storage.GetPostingList(second_block_id)); + // This pl_holder will only hold a posting list with the hits that didn't fit + // on the first block. + ICING_ASSERT_OK_AND_ASSIGN(std::vector<Hit> second_block_hits, + pl_holder.posting_list.GetHits()); + ASSERT_THAT(second_block_hits, SizeIs(Lt(hits1.size()))); + auto first_block_hits_start = hits1.rbegin() + second_block_hits.size(); + EXPECT_THAT(second_block_hits, + ElementsAreArray(hits1.rbegin(), first_block_hits_start)); + + // Now retrieve all of the hits that were on the first block. 
+ uint32_t first_block_id = pl_holder.block.next_block_index(); + EXPECT_THAT(first_block_id, Eq(1)); + + PostingListIdentifier pl_id(first_block_id, /*posting_list_index=*/0, + /*posting_list_index_bits=*/0); + ICING_ASSERT_OK_AND_ASSIGN(pl_holder, + flash_index_storage.GetPostingList(pl_id)); + EXPECT_THAT( + pl_holder.posting_list.GetHits(), + IsOkAndHolds(ElementsAreArray(first_block_hits_start, hits1.rend()))); +} + +TEST(PostingListAccessorStorageTest, + PreexistingMultiBlockReusesBlocksProperly) { + std::string test_dir = GetTestTempDir() + "/test_dir"; + std::string file_name = test_dir + "/test_file.idx.index"; + Filesystem filesystem; + ASSERT_TRUE(filesystem.DeleteDirectoryRecursively(test_dir.c_str())); + ASSERT_TRUE(filesystem.CreateDirectoryRecursively(test_dir.c_str())); + + ICING_ASSERT_OK_AND_ASSIGN(FlashIndexStorage flash_index_storage, + FlashIndexStorage::Create(file_name, &filesystem)); + ICING_ASSERT_OK_AND_ASSIGN(PostingListAccessor pl_accessor, + PostingListAccessor::Create(&flash_index_storage)); + // Add some hits! Any hits! + std::vector<Hit> hits1 = + CreateHits(/*num_hits=*/5000, /*desired_byte_length=*/1); + for (const Hit& hit : hits1) { + ICING_ASSERT_OK(pl_accessor.PrependHit(hit)); + } + PostingListAccessor::FinalizeResult result1 = + PostingListAccessor::Finalize(std::move(pl_accessor)); + ICING_EXPECT_OK(result1.status); + PostingListIdentifier first_add_id = result1.id; + EXPECT_THAT(first_add_id, Eq(PostingListIdentifier( + /*block_index=*/2, /*posting_list_index=*/0, + /*posting_list_index_bits=*/0))); + + // Now add a couple more hits. These should fit on the existing, not full + // second block. 
+ ICING_ASSERT_OK_AND_ASSIGN( + pl_accessor, PostingListAccessor::CreateFromExisting(&flash_index_storage, + first_add_id)); + std::vector<Hit> hits2 = CreateHits( + /*start_docid=*/hits1.back().document_id() + 1, /*num_hits=*/50, + /*desired_byte_length=*/1); + + for (const Hit& hit : hits2) { + ICING_ASSERT_OK(pl_accessor.PrependHit(hit)); + } + PostingListAccessor::FinalizeResult result2 = + PostingListAccessor::Finalize(std::move(pl_accessor)); + ICING_EXPECT_OK(result2.status); + PostingListIdentifier second_add_id = result2.id; + EXPECT_THAT(second_add_id, Eq(first_add_id)); + + // We should be able to retrieve all 5050 hits. + for (const Hit& hit : hits2) { + hits1.push_back(hit); + } + ICING_ASSERT_OK_AND_ASSIGN(PostingListHolder pl_holder, + flash_index_storage.GetPostingList(second_add_id)); + // This pl_holder will only hold a posting list with the hits that didn't fit + // on the first block. + ICING_ASSERT_OK_AND_ASSIGN(std::vector<Hit> second_block_hits, + pl_holder.posting_list.GetHits()); + ASSERT_THAT(second_block_hits, SizeIs(Lt(hits1.size()))); + auto first_block_hits_start = hits1.rbegin() + second_block_hits.size(); + EXPECT_THAT(second_block_hits, + ElementsAreArray(hits1.rbegin(), first_block_hits_start)); + + // Now retrieve all of the hits that were on the first block. 
+ uint32_t first_block_id = pl_holder.block.next_block_index(); + EXPECT_THAT(first_block_id, Eq(1)); + + PostingListIdentifier pl_id(first_block_id, /*posting_list_index=*/0, + /*posting_list_index_bits=*/0); + ICING_ASSERT_OK_AND_ASSIGN(pl_holder, + flash_index_storage.GetPostingList(pl_id)); + EXPECT_THAT( + pl_holder.posting_list.GetHits(), + IsOkAndHolds(ElementsAreArray(first_block_hits_start, hits1.rend()))); +} + +TEST(PostingListAccessorStorageTest, InvalidHitReturnsInvalidArgument) { + std::string test_dir = GetTestTempDir() + "/test_dir"; + std::string file_name = test_dir + "/test_file.idx.index"; + Filesystem filesystem; + ASSERT_TRUE(filesystem.DeleteDirectoryRecursively(test_dir.c_str())); + ASSERT_TRUE(filesystem.CreateDirectoryRecursively(test_dir.c_str())); + + ICING_ASSERT_OK_AND_ASSIGN(FlashIndexStorage flash_index_storage, + FlashIndexStorage::Create(file_name, &filesystem)); + ICING_ASSERT_OK_AND_ASSIGN(PostingListAccessor pl_accessor, + PostingListAccessor::Create(&flash_index_storage)); + Hit invalid_hit; + EXPECT_THAT(pl_accessor.PrependHit(invalid_hit), + StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT)); +} + +TEST(PostingListAccessorStorageTest, HitsNotDecreasingReturnsInvalidArgument) { + std::string test_dir = GetTestTempDir() + "/test_dir"; + std::string file_name = test_dir + "/test_file.idx.index"; + Filesystem filesystem; + ASSERT_TRUE(filesystem.DeleteDirectoryRecursively(test_dir.c_str())); + ASSERT_TRUE(filesystem.CreateDirectoryRecursively(test_dir.c_str())); + + ICING_ASSERT_OK_AND_ASSIGN(FlashIndexStorage flash_index_storage, + FlashIndexStorage::Create(file_name, &filesystem)); + ICING_ASSERT_OK_AND_ASSIGN(PostingListAccessor pl_accessor, + PostingListAccessor::Create(&flash_index_storage)); + Hit hit1(/*section_id=*/3, /*document_id=*/1, Hit::kMaxHitScore); + ICING_ASSERT_OK(pl_accessor.PrependHit(hit1)); + + Hit hit2(/*section_id=*/6, /*document_id=*/1, Hit::kMaxHitScore); + 
EXPECT_THAT(pl_accessor.PrependHit(hit2), + StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT)); + + Hit hit3(/*section_id=*/2, /*document_id=*/0, Hit::kMaxHitScore); + EXPECT_THAT(pl_accessor.PrependHit(hit3), + StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT)); +} + +TEST(PostingListAccessorStorageTest, NewPostingListNoHitsAdded) { + std::string test_dir = GetTestTempDir() + "/test_dir"; + std::string file_name = test_dir + "/test_file.idx.index"; + Filesystem filesystem; + ASSERT_TRUE(filesystem.DeleteDirectoryRecursively(test_dir.c_str())); + ASSERT_TRUE(filesystem.CreateDirectoryRecursively(test_dir.c_str())); + + ICING_ASSERT_OK_AND_ASSIGN(FlashIndexStorage flash_index_storage, + FlashIndexStorage::Create(file_name, &filesystem)); + ICING_ASSERT_OK_AND_ASSIGN(PostingListAccessor pl_accessor, + PostingListAccessor::Create(&flash_index_storage)); + PostingListAccessor::FinalizeResult result1 = + PostingListAccessor::Finalize(std::move(pl_accessor)); + EXPECT_THAT(result1.status, + StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT)); +} + +TEST(PostingListAccessorStorageTest, PreexistingPostingListNoHitsAdded) { + std::string test_dir = GetTestTempDir() + "/test_dir"; + std::string file_name = test_dir + "/test_file.idx.index"; + Filesystem filesystem; + ASSERT_TRUE(filesystem.DeleteDirectoryRecursively(test_dir.c_str())); + ASSERT_TRUE(filesystem.CreateDirectoryRecursively(test_dir.c_str())); + + ICING_ASSERT_OK_AND_ASSIGN(FlashIndexStorage flash_index_storage, + FlashIndexStorage::Create(file_name, &filesystem)); + ICING_ASSERT_OK_AND_ASSIGN(PostingListAccessor pl_accessor, + PostingListAccessor::Create(&flash_index_storage)); + Hit hit1(/*section_id=*/3, /*document_id=*/1, Hit::kMaxHitScore); + ICING_ASSERT_OK(pl_accessor.PrependHit(hit1)); + PostingListAccessor::FinalizeResult result1 = + PostingListAccessor::Finalize(std::move(pl_accessor)); + ICING_ASSERT_OK(result1.status); + + ICING_ASSERT_OK_AND_ASSIGN(PostingListAccessor 
pl_accessor2, + PostingListAccessor::CreateFromExisting( + &flash_index_storage, result1.id)); + PostingListAccessor::FinalizeResult result2 = + PostingListAccessor::Finalize(std::move(pl_accessor2)); + ICING_ASSERT_OK(result2.status); +} + +} // namespace + +} // namespace lib +} // namespace icing diff --git a/icing/index/main/posting-list-identifier.cc b/icing/index/main/posting-list-identifier.cc new file mode 100644 index 0000000..1cdac65 --- /dev/null +++ b/icing/index/main/posting-list-identifier.cc @@ -0,0 +1,25 @@ +// Copyright (C) 2019 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "icing/index/main/posting-list-identifier.h" + +namespace icing { +namespace lib { + +PostingListIdentifier PostingListIdentifier::kInvalid( + kInvalidBlockIndex, /*posting_list_index=*/0, + PostingListIdentifier::kEncodedPostingListIndexBits - 1); + +} // namespace lib +} // namespace icing diff --git a/icing/index/main/posting-list-identifier.h b/icing/index/main/posting-list-identifier.h new file mode 100644 index 0000000..4953865 --- /dev/null +++ b/icing/index/main/posting-list-identifier.h @@ -0,0 +1,116 @@ +// Copyright (C) 2019 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef ICING_INDEX_POSTING_LIST_IDENTIFIER_H_ +#define ICING_INDEX_POSTING_LIST_IDENTIFIER_H_ + +#include "icing/index/main/index-block.h" +#include "icing/index/main/posting-list-free.h" +#include "icing/legacy/index/icing-bit-util.h" + +namespace icing { +namespace lib { + +// 1M blocks * 4K page size = 4GB index +inline constexpr int kBlockIndexBits = 20; +inline constexpr int kMaxBlockIndex = (1u << kBlockIndexBits) - 1; + +// Class used to store information necessary to identify any posting list within +// the index. +// +// The 20 leftmost bits in this identifier encode the block index. The 12 +// rightmost bits encode both the posting list index and the maximum number of +// bits required to encode a posting list index on that block. +// +// Ex. An index block containing a max of 68 posting lists each of size 60 +// bytes (and thus 7 posting list bits), with a block index of 13 and a posting +// list index of 5. +// 0000 0000 0000 0000 1101 1111 0000 0101 +// |__________block-index_______|__pad__|_pl-index_| +// +// "pad" is some region starting at kEncodedPostingListIndexBits (12) bit and +// continuing rightward until reaching a terminating "0". This padding encodes +// the posting list bits value - posting list bits value is the number of bits +// after the terminating '0' of the "pad" region. +// +// This value will eventually be stored in the Main Lexicon. +class PostingListIdentifier { + // 1 bit is wasted to encode max pl index bits so there can be at most 2^11 + // posting lists per block. 
Block size would have to be >=40020 bytes for + // there to be more than 2K+ posting lists in a block. + static constexpr int kEncodedPostingListIndexBits = 12; + static_assert(kEncodedPostingListIndexBits + kBlockIndexBits <= + 8 * sizeof(uint32_t), + "Not enough room in PostingListIdentifier value to encode " + "block index and posting list index."); + + public: + static PostingListIdentifier kInvalid; + + // 1. block_index - the index of this block within the FlashIndexStorage file + // 2. posting_list_index - the index of this posting list within the block + // 3. posting_list_index_bits - the number of bits needed to encode the + // largest posting_list_index that this block can have. + PostingListIdentifier(uint32_t block_index, + PostingListIndex posting_list_index, + int posting_list_index_bits) { + val_ = 0; + BITFIELD_OR(val_, /*offset=*/0, /*len=*/posting_list_index_bits, + /*val=*/static_cast<uint64_t>(posting_list_index)); + BITFIELD_OR( + val_, /*offset=*/posting_list_index_bits + 1, + /*len=*/kEncodedPostingListIndexBits - posting_list_index_bits - 1, + /*val=*/~0u); + BITFIELD_OR(val_, /*offset=*/kEncodedPostingListIndexBits, + /*len=*/kBlockIndexBits, + /*val=*/block_index); + } + + int block_index() const { + return BITFIELD_GET(val_, kEncodedPostingListIndexBits, kBlockIndexBits); + } + + PostingListIndex posting_list_index() const { + return BITFIELD_GET(val_, 0, posting_list_index_bits()); + } + + // Returns the maximum number of bits that a posting list index on the block + // referred to by block_index could use. + int posting_list_index_bits() const { + for (int bits = kEncodedPostingListIndexBits - 1; bits >= 0; --bits) { + if (((1u << bits) & val_) == 0) { + // Got to the zero bit. This is the start of pl index. 
+ return bits; + } + } + return -1; + } + + bool is_valid() const { return *this != kInvalid; } + + bool operator==(const PostingListIdentifier& rhs) const { + return val_ == rhs.val_; + } + bool operator!=(const PostingListIdentifier& rhs) const { + return !(*this == rhs); + } + + private: + uint32_t val_; +}; + +} // namespace lib +} // namespace icing + +#endif // ICING_INDEX_POSTING_LIST_IDENTIFIER_H_ diff --git a/icing/jni/icing-search-engine-jni.cc b/icing/jni/icing-search-engine-jni.cc index 4396007..71752dd 100644 --- a/icing/jni/icing-search-engine-jni.cc +++ b/icing/jni/icing-search-engine-jni.cc @@ -302,6 +302,24 @@ Java_com_google_android_icing_IcingSearchEngine_nativeDeleteBySchemaType( } JNIEXPORT jbyteArray JNICALL +Java_com_google_android_icing_IcingSearchEngine_nativeDeleteByQuery( + JNIEnv* env, jclass clazz, jlong native_pointer, + jbyteArray search_spec_bytes) { + icing::lib::IcingSearchEngine* icing = + GetIcingSearchEnginePointer(native_pointer); + + icing::lib::SearchSpecProto search_spec_proto; + if (!ParseProtoFromJniByteArray(env, search_spec_bytes, &search_spec_proto)) { + ICING_LOG(ERROR) << "Failed to parse SearchSpecProto in nativeSearch"; + return nullptr; + } + icing::lib::DeleteResultProto delete_result_proto = + icing->DeleteByQuery(search_spec_proto); + + return SerializeProtoToJniByteArray(env, delete_result_proto); +} + +JNIEXPORT jbyteArray JNICALL Java_com_google_android_icing_IcingSearchEngine_nativePersistToDisk( JNIEnv* env, jclass clazz, jlong native_pointer) { icing::lib::IcingSearchEngine* icing = diff --git a/icing/legacy/core/icing-string-util.cc b/icing/legacy/core/icing-string-util.cc index 1954cd3..2eb64ac 100644 --- a/icing/legacy/core/icing-string-util.cc +++ b/icing/legacy/core/icing-string-util.cc @@ -11,13 +11,6 @@ // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. 
- -// Copyright 2011 Google Inc. All Rights Reserved. -// Author: ulas@google.com (Ulas Kirazci) -// sbanacho@google.com (Scott Banachowski) -// -// This is a list of IsGoogleLetter letters. It is copied from -// google3/util/utf8/proptables/letters.txt CL 19164202. #include "icing/legacy/core/icing-string-util.h" #include <stdarg.h> @@ -34,7 +27,6 @@ namespace icing { namespace lib { -namespace {} // namespace uint32_t IcingStringUtil::UpdateCrc32(uint32_t crc, const char *str, int len) { if (len > 0) { crc = ~crc32(~crc, reinterpret_cast<const Bytef *>(str), len); diff --git a/icing/legacy/core/icing-string-util.h b/icing/legacy/core/icing-string-util.h index 4ea93ec..767e581 100644 --- a/icing/legacy/core/icing-string-util.h +++ b/icing/legacy/core/icing-string-util.h @@ -12,10 +12,6 @@ // See the License for the specific language governing permissions and // limitations under the License. -// Copyright 2011 Google Inc. All Rights Reserved. -// Author: ulas@google.com (Ulas Kirazci) -// sbanacho@google.com (Scott Banachowski) - #ifndef ICING_LEGACY_CORE_ICING_STRING_UTIL_H_ #define ICING_LEGACY_CORE_ICING_STRING_UTIL_H_ diff --git a/icing/legacy/index/icing-dynamic-trie.cc b/icing/legacy/index/icing-dynamic-trie.cc index ee3d3a2..29843ba 100644 --- a/icing/legacy/index/icing-dynamic-trie.cc +++ b/icing/legacy/index/icing-dynamic-trie.cc @@ -96,14 +96,28 @@ using std::vector; namespace icing { namespace lib { +namespace { +constexpr uint32_t kInvalidNodeIndex = (1U << 24) - 1; +constexpr uint32_t kInvalidNextIndex = ~0U; + +// Returns the number of valid nexts in the array. +int GetValidNextsSize(IcingDynamicTrie::Next *next_array_start, + int next_array_length) { + int valid_nexts_length = 0; + for (; valid_nexts_length < next_array_length && + next_array_start[valid_nexts_length].node_index() != kInvalidNodeIndex; + ++valid_nexts_length) { + } + return valid_nexts_length; +} +} // namespace + // Based on the bit field widths. 
const uint32_t IcingDynamicTrie::Options::kMaxNodes = (1U << 24) - 1; const uint32_t IcingDynamicTrie::Options::kMaxNexts = (1U << 27) - 1; const uint32_t IcingDynamicTrie::Options::kMaxSuffixesSize = 1U << 27; const uint32_t IcingDynamicTrie::Options::kMaxValueSize = 1U << 16; -const uint32_t IcingDynamicTrie::kInvalidNodeIndex = (1U << 24) - 1; -const uint32_t IcingDynamicTrie::kInvalidNextIndex = ~0U; const uint32_t IcingDynamicTrie::kInvalidSuffixIndex = ~0U; const int IcingDynamicTrie::kMaxNextArraySize; @@ -891,7 +905,7 @@ bool IcingDynamicTrie::IcingDynamicTrieStorage::Header::Init( bool IcingDynamicTrie::IcingDynamicTrieStorage::Header::SerializeToArray( uint8_t *buf, uint32_t buf_size) const { - uint32_t size = hdr.ByteSize(); + uint32_t size = hdr.ByteSizeLong(); if (size + sizeof(kMagic) + sizeof(uint32_t) > buf_size) return false; memcpy(buf, &kMagic, sizeof(kMagic)); memcpy(buf + sizeof(kMagic), &size, sizeof(uint32_t)); @@ -1502,6 +1516,53 @@ void IcingDynamicTrie::Clear() { deleted_bitmap_->Truncate(0); } +bool IcingDynamicTrie::ClearSuffixAndValue(uint32_t suffix_value_index) { + // The size 1 below is for a '\0' between the suffix and the value. + size_t suffix_and_value_length = + strlen(this->storage_->GetSuffix(suffix_value_index)) + 1 + + this->value_size(); + char *mutable_suffix_and_value = this->storage_->GetMutableSuffix( + suffix_value_index, suffix_and_value_length); + + if (mutable_suffix_and_value == nullptr) { + return false; + } + + memset(mutable_suffix_and_value, 0, suffix_and_value_length); + return true; +} + +bool IcingDynamicTrie::ResetNext(uint32_t next_index) { + Next *mutable_next = + this->storage_->GetMutableNextArray(next_index, /*len=*/1); + + if (mutable_next == nullptr) { + return false; + } + + mutable_next->set_val(0); + mutable_next->set_node_index(kInvalidNodeIndex); + return true; +} + +bool IcingDynamicTrie::SortNextArray(const Node *node) { + if (node == nullptr) { + // Nothing to sort, return success directly. 
+ return true; + } + + uint32_t next_array_buffer_size = 1u << node->log2_num_children(); + Next *next_array_start = this->storage_->GetMutableNextArray( + node->next_index(), next_array_buffer_size); + + if (next_array_start == nullptr) { + return false; + } + + std::sort(next_array_start, next_array_start + next_array_buffer_size - 1); + return true; +} + bool IcingDynamicTrie::Insert(const char *key, const void *value, uint32_t *value_index, bool replace, bool *pnew_key) { @@ -1641,15 +1702,12 @@ bool IcingDynamicTrie::Insert(const char *key, const void *value, new_leaf_node->set_log2_num_children(0); // Figure out the real length of the existing next array. - Next *cur_next = storage_->GetMutableNextArray( - best_node->next_index(), 1 << best_node->log2_num_children()); - int next_len = 0; - for (; next_len < (1 << best_node->log2_num_children()) && - cur_next[next_len].node_index() != kInvalidNodeIndex; - next_len++) { - } + uint32_t next_array_buffer_size = 1u << best_node->log2_num_children(); + Next *cur_next = storage_->GetMutableNextArray(best_node->next_index(), + next_array_buffer_size); + int next_len = GetValidNextsSize(cur_next, next_array_buffer_size); Next *new_next = cur_next; - if (next_len == (1 << best_node->log2_num_children())) { + if (next_len == (next_array_buffer_size)) { // Allocate a new, larger, array. 
new_next = storage_->AllocNextArray(next_len + 1); memcpy(new_next, cur_next, sizeof(Next) * next_len); @@ -2072,7 +2130,8 @@ const IcingDynamicTrie::Next *IcingDynamicTrie::LowerBound( } void IcingDynamicTrie::FindBestNode(const char *key, uint32_t *best_node_index, - int *key_offset, bool prefix) const { + int *key_offset, bool prefix, + bool utf8) const { // Find the best node such that: // // - If key is NOT in the trie, key[0..key_offset) is a prefix to @@ -2093,6 +2152,8 @@ void IcingDynamicTrie::FindBestNode(const char *key, uint32_t *best_node_index, const Node *cur_node = storage_->GetRootNode(); const char *cur_key = key; + const Node *utf8_node = cur_node; + const char *utf8_key = cur_key; while (!cur_node->is_leaf()) { const Next *found = GetNextByChar(cur_node, *cur_key); if (!found) break; @@ -2108,12 +2169,101 @@ void IcingDynamicTrie::FindBestNode(const char *key, uint32_t *best_node_index, break; } cur_key++; + + if (utf8 && i18n_utils::IsLeadUtf8Byte(*cur_key)) { + utf8_node = cur_node; + utf8_key = cur_key; + } + } + + if (utf8) { + // Rewind. + cur_node = utf8_node; + cur_key = utf8_key; } *best_node_index = storage_->GetNodeIndex(cur_node); *key_offset = reinterpret_cast<const char *>(cur_key) - key; } +int IcingDynamicTrie::FindNewBranchingPrefixLength(const char *key, + bool utf8) const { + if (storage_->empty()) { + return kNoBranchFound; + } + + uint32_t best_node_index; + int key_offset; + FindBestNode(key, &best_node_index, &key_offset, /*prefix=*/true, utf8); + const Node *cur_node = storage_->GetNode(best_node_index); + const char *cur_key = key + key_offset; + if (cur_node->is_leaf()) { + // Prefix in the trie. Split at leaf. + const char *prev_suffix = storage_->GetSuffix(cur_node->next_index()); + while (*prev_suffix != '\0' && *prev_suffix == *cur_key) { + prev_suffix++; + cur_key++; + } + + // Equal strings? No branching. 
+ if (*prev_suffix == '\0' && *cur_key == '\0') { + return kNoBranchFound; + } + + if (utf8) { + // Rewind to utf8 boundary. + size_t offset = i18n_utils::SafeTruncateUtf8Length(key, cur_key - key); + cur_key = key + offset; + } + + return cur_key - key; + } else if (cur_node->log2_num_children() == 0) { + // Intermediate node going from no branching to branching. + return cur_key - key; + } + + // If we've reached this point, then we're already at a branch point. So there + // is no *new* branch point. + return kNoBranchFound; +} + +std::vector<int> IcingDynamicTrie::FindBranchingPrefixLengths(const char *key, + bool utf8) const { + std::vector<int> prefix_lengths; + + if (storage_->empty()) { + return prefix_lengths; + } + + const Node *cur_node = storage_->GetRootNode(); + const char *cur_key = key; + while (*cur_key && !cur_node->is_leaf()) { + // Branching prefix? + if (cur_node->log2_num_children() > 0) { + int len = cur_key - key; + if (utf8) { + // Do not cut mid-utf8. Walk up to utf8 boundary. + len = i18n_utils::SafeTruncateUtf8Length(key, len); + if (prefix_lengths.empty() || len != prefix_lengths.back()) { + prefix_lengths.push_back(len); + } + } else { + prefix_lengths.push_back(len); + } + } + + // Move to next. + const Next *found = GetNextByChar(cur_node, *cur_key); + if (found == nullptr) { + break; + } + cur_node = storage_->GetNode(found->node_index()); + + ++cur_key; + } + return prefix_lengths; +} + void IcingDynamicTrie::GetDebugInfo(int verbosity, std::string *out) const { Stats stats; CollectStats(&stats); @@ -2248,6 +2398,102 @@ bool IcingDynamicTrie::ClearDeleted(uint32_t value_index) { return deleted_bitmap_->SetBit(idx, false); } +// Steps: +// 1. Find the key in the trie. +// 2. Remove the suffix and the value. +// 3. Reset the nexts that point to the nodes to be removed. +// 4. Sort any next array if needed. 
+bool IcingDynamicTrie::Delete(const std::string_view key) { + if (!is_initialized()) { + ICING_LOG(ERROR) << "DynamicTrie not initialized"; + return false; + } + + if (storage_->empty()) { + // Nothing to delete. + return true; + } + + // Tries to find the key in the trie, starting from the root. + const Node *current_node = storage_->GetRootNode(); + + // The node after which we start to remove data. + const Node *last_multichild_node = nullptr; + + // While visiting the trie nodes, we store the indices of Nexts that point + // to all the nodes after last_multichild_node. Those nodes must be + // consecutive and all have only one child. Resetting those Nexts means that + // we remove the data of the key. + std::vector<uint32_t> nexts_to_reset; + nexts_to_reset.reserve(key.length()); + + // Iterates through chars in the key, finds nodes in the trie until a leaf + // node is reached. The max number of loops is key.length() + 1 because we + // start from the root. + for (size_t i = 0; i <= key.length(); ++i) { + if (current_node->is_leaf()) { + // Leaf node, now check the suffix. + if (key.substr(i) != storage_->GetSuffix(current_node->next_index())) { + // Key does not exist in the trie, nothing to delete. + return true; + } + // Otherwise, key is found. + break; + } + + // Finds the next char. + const Next *next; + if (i == key.length()) { + // When we're at the end of the key, the next char is the termination char + // '\0'. + next = GetNextByChar(current_node, '\0'); + } else { + next = GetNextByChar(current_node, key[i]); + } + + if (next == nullptr) { + // Key does not exist in the trie, nothing to delete. + return true; + } + + // Checks the real size of next array. 
+ uint32_t next_array_buffer_size = 1u << current_node->log2_num_children(); + Next *next_array_start = storage_->GetMutableNextArray( + current_node->next_index(), next_array_buffer_size); + int valid_next_array_size = + GetValidNextsSize(next_array_start, next_array_buffer_size); + if (valid_next_array_size == 0) { + // Key does not exist in the trie, nothing to delete. + // This shouldn't happen, but we put a sanity check here in case something + // is wrong. + return true; + } else if (valid_next_array_size == 1) { + // Single-child branch will be deleted. + nexts_to_reset.push_back(storage_->GetNextArrayIndex(next)); + } else { + // We see a new node with multiple children, all the previously seen nodes + // shouldn't be removed. + last_multichild_node = current_node; + nexts_to_reset.clear(); + nexts_to_reset.push_back(storage_->GetNextArrayIndex(next)); + } + + // Updates current_node. + current_node = storage_->GetNode(next->node_index()); + } + // Now we've found the key in the trie. + + ClearSuffixAndValue(current_node->next_index()); + + // Resets nexts to remove key information. + for (uint32_t next_index : nexts_to_reset) { + ResetNext(next_index); + } + SortNextArray(last_multichild_node); + + return true; +} + bool IcingDynamicTrie::ClearPropertyForAllValues(uint32_t property_id) { if (!is_initialized()) { ICING_LOG(FATAL) << "DynamicTrie not initialized"; diff --git a/icing/legacy/index/icing-dynamic-trie.h b/icing/legacy/index/icing-dynamic-trie.h index c33be96..7fe290b 100644 --- a/icing/legacy/index/icing-dynamic-trie.h +++ b/icing/legacy/index/icing-dynamic-trie.h @@ -288,6 +288,16 @@ class IcingDynamicTrie : public IIcingStorage { // Empty out the trie without closing or removing. void Clear(); + // Clears the suffix and value at the given index. Returns true on success. + bool ClearSuffixAndValue(uint32_t suffix_value_index); + + // Resets the next at the given index so that it points to no node. + // Returns true on success. 
+ bool ResetNext(uint32_t next_index); + + // Sorts the next array of the node. Returns true on success. + bool SortNextArray(const Node *node); + // Sync to disk. bool Sync() override; @@ -375,6 +385,16 @@ class IcingDynamicTrie : public IIcingStorage { bool is_full_match() const { return value_index != kInvalidValueIndex; } }; + static constexpr int kNoBranchFound = -1; + // Return prefix of any new branches created if key were inserted. If utf8 is + // true, does not cut key mid-utf8. Returns kNoBranchFound if no branches + // would be created. + int FindNewBranchingPrefixLength(const char *key, bool utf8) const; + + // Find all prefixes of key where the trie branches. Excludes the key + // itself. If utf8 is true, does not cut key mid-utf8. + std::vector<int> FindBranchingPrefixLengths(const char *key, bool utf8) const; + void GetDebugInfo(int verbosity, std::string *out) const override; double min_free_fraction() const; @@ -402,6 +422,10 @@ class IcingDynamicTrie : public IIcingStorage { // Clears the deleted property for each value. bool ClearDeleted(uint32_t value_index); + // Deletes the entry associated with the key. Data can not be recovered after + // the deletion. Returns true on success. + bool Delete(std::string_view key); + // Clear a specific property id from all values. For each value that has this // property cleared, also check to see if it was the only property set; if // so, set the deleted property for the value to indicate it no longer has any @@ -575,8 +599,6 @@ class IcingDynamicTrie : public IIcingStorage { void GetHeader(IcingDynamicTrieHeader *hdr) const; void SetHeader(const IcingDynamicTrieHeader &new_hdr); - static const uint32_t kInvalidNodeIndex; - static const uint32_t kInvalidNextIndex; static const uint32_t kInvalidSuffixIndex; // Stats helpers. 
@@ -587,7 +609,7 @@ class IcingDynamicTrie : public IIcingStorage { const Next *LowerBound(const Next *start, const Next *end, uint8_t key_char) const; void FindBestNode(const char *key, uint32_t *best_node_index, int *key_offset, - bool prefix) const; + bool prefix, bool utf8 = false) const; // For value properties. This truncates the data by clearing it, but leaving // the storage intact. diff --git a/icing/legacy/index/icing-dynamic-trie_test.cc b/icing/legacy/index/icing-dynamic-trie_test.cc index 4fae52a..193765b 100644 --- a/icing/legacy/index/icing-dynamic-trie_test.cc +++ b/icing/legacy/index/icing-dynamic-trie_test.cc @@ -746,6 +746,222 @@ TEST_F(IcingDynamicTrieTest, Compact) { } } +TEST_F(IcingDynamicTrieTest, DeletionShouldWorkWhenRootIsLeaf) { + IcingFilesystem filesystem; + IcingDynamicTrie trie(trie_files_prefix_, IcingDynamicTrie::RuntimeOptions(), + &filesystem); + ASSERT_TRUE(trie.CreateIfNotExist(IcingDynamicTrie::Options())); + ASSERT_TRUE(trie.Init()); + + // Inserts a key, the root is a leaf. + uint32_t value = 1; + ASSERT_TRUE(trie.Insert("foo", &value)); + ASSERT_TRUE(trie.Find("foo", &value)); + + // Deletes the key. + EXPECT_TRUE(trie.Delete("foo")); + EXPECT_FALSE(trie.Find("foo", &value)); +} + +TEST_F(IcingDynamicTrieTest, DeletionShouldWorkWhenLastCharIsLeaf) { + IcingFilesystem filesystem; + IcingDynamicTrie trie(trie_files_prefix_, IcingDynamicTrie::RuntimeOptions(), + &filesystem); + ASSERT_TRUE(trie.CreateIfNotExist(IcingDynamicTrie::Options())); + ASSERT_TRUE(trie.Init()); + + // Inserts "bar" and "ba", the trie structure looks like: + // root + // | + // b + // | + // a + // / \ + // null r + uint32_t value = 1; + ASSERT_TRUE(trie.Insert("bar", &value)); + ASSERT_TRUE(trie.Insert("ba", &value)); + ASSERT_TRUE(trie.Find("bar", &value)); + ASSERT_TRUE(trie.Find("ba", &value)); + + // Deletes "bar". "r" is a leaf node in the trie. 
+ EXPECT_TRUE(trie.Delete("bar")); + EXPECT_FALSE(trie.Find("bar", &value)); + EXPECT_TRUE(trie.Find("ba", &value)); +} + +TEST_F(IcingDynamicTrieTest, DeletionShouldWorkWithTerminationNode) { + IcingFilesystem filesystem; + IcingDynamicTrie trie(trie_files_prefix_, IcingDynamicTrie::RuntimeOptions(), + &filesystem); + ASSERT_TRUE(trie.CreateIfNotExist(IcingDynamicTrie::Options())); + ASSERT_TRUE(trie.Init()); + + // Inserts "bar" and "ba", the trie structure looks like: + // root + // | + // b + // | + // a + // / \ + // null r + uint32_t value = 1; + ASSERT_TRUE(trie.Insert("bar", &value)); + ASSERT_TRUE(trie.Insert("ba", &value)); + ASSERT_TRUE(trie.Find("bar", &value)); + ASSERT_TRUE(trie.Find("ba", &value)); + + // Deletes "ba" which is a key with termination node in the trie. + EXPECT_TRUE(trie.Delete("ba")); + EXPECT_FALSE(trie.Find("ba", &value)); + EXPECT_TRUE(trie.Find("bar", &value)); +} + +TEST_F(IcingDynamicTrieTest, DeletionShouldWorkWithMultipleNexts) { + IcingFilesystem filesystem; + IcingDynamicTrie trie(trie_files_prefix_, IcingDynamicTrie::RuntimeOptions(), + &filesystem); + ASSERT_TRUE(trie.CreateIfNotExist(IcingDynamicTrie::Options())); + ASSERT_TRUE(trie.Init()); + + // Inserts "ba", "bb", "bc", and "bd", the trie structure looks like: + // root + // | + // b + // / | | \ + // a b c d + uint32_t value = 1; + ASSERT_TRUE(trie.Insert("ba", &value)); + ASSERT_TRUE(trie.Insert("bb", &value)); + ASSERT_TRUE(trie.Insert("bc", &value)); + ASSERT_TRUE(trie.Insert("bd", &value)); + ASSERT_TRUE(trie.Find("ba", &value)); + ASSERT_TRUE(trie.Find("bb", &value)); + ASSERT_TRUE(trie.Find("bc", &value)); + ASSERT_TRUE(trie.Find("bd", &value)); + + // Deletes "bc". 
+ EXPECT_TRUE(trie.Delete("bc")); + EXPECT_FALSE(trie.Find("bc", &value)); + EXPECT_TRUE(trie.Find("ba", &value)); + EXPECT_TRUE(trie.Find("bb", &value)); + EXPECT_TRUE(trie.Find("bd", &value)); +} + +TEST_F(IcingDynamicTrieTest, DeletionShouldWorkWithMultipleTrieBranches) { + IcingFilesystem filesystem; + IcingDynamicTrie trie(trie_files_prefix_, IcingDynamicTrie::RuntimeOptions(), + &filesystem); + ASSERT_TRUE(trie.CreateIfNotExist(IcingDynamicTrie::Options())); + ASSERT_TRUE(trie.Init()); + + // Inserts "batter", "battle", and "bar", the trie structure looks like: + // root + // | + // b + // | + // a + // / \ + // t r + // | + // t + // / \ + // e l + // | | + // r e + uint32_t value = 1; + ASSERT_TRUE(trie.Insert("batter", &value)); + ASSERT_TRUE(trie.Insert("battle", &value)); + ASSERT_TRUE(trie.Insert("bar", &value)); + ASSERT_TRUE(trie.Find("batter", &value)); + ASSERT_TRUE(trie.Find("battle", &value)); + ASSERT_TRUE(trie.Find("bar", &value)); + + // Deletes "batter". + EXPECT_TRUE(trie.Delete("batter")); + EXPECT_FALSE(trie.Find("batter", &value)); + EXPECT_TRUE(trie.Find("battle", &value)); + EXPECT_TRUE(trie.Find("bar", &value)); +} + +TEST_F(IcingDynamicTrieTest, InsertionShouldWorkAfterDeletion) { + IcingFilesystem filesystem; + IcingDynamicTrie trie(trie_files_prefix_, IcingDynamicTrie::RuntimeOptions(), + &filesystem); + ASSERT_TRUE(trie.CreateIfNotExist(IcingDynamicTrie::Options())); + ASSERT_TRUE(trie.Init()); + + // Inserts some keys. 
+ uint32_t value = 1; + ASSERT_TRUE(trie.Insert("bar", &value)); + ASSERT_TRUE(trie.Insert("bed", &value)); + ASSERT_TRUE(trie.Insert("foo", &value)); + + // Deletes a key + ASSERT_TRUE(trie.Delete("bed")); + ASSERT_FALSE(trie.Find("bed", &value)); + + // Inserts after deletion + EXPECT_TRUE(trie.Insert("bed", &value)); + EXPECT_TRUE(trie.Insert("bedroom", &value)); + EXPECT_TRUE(trie.Find("bed", &value)); + EXPECT_TRUE(trie.Find("bedroom", &value)); +} + +TEST_F(IcingDynamicTrieTest, IteratorShouldWorkAfterDeletion) { + IcingFilesystem filesystem; + IcingDynamicTrie trie(trie_files_prefix_, IcingDynamicTrie::RuntimeOptions(), + &filesystem); + ASSERT_TRUE(trie.CreateIfNotExist(IcingDynamicTrie::Options())); + ASSERT_TRUE(trie.Init()); + + // Inserts some keys. + uint32_t value = 1; + ASSERT_TRUE(trie.Insert("bar", &value)); + ASSERT_TRUE(trie.Insert("bed", &value)); + ASSERT_TRUE(trie.Insert("foo", &value)); + + // Deletes a key + ASSERT_TRUE(trie.Delete("bed")); + + // Iterates through all keys + IcingDynamicTrie::Iterator iterator_all(trie, ""); + std::vector<std::string> results; + for (; iterator_all.IsValid(); iterator_all.Advance()) { + results.emplace_back(iterator_all.GetKey()); + } + EXPECT_THAT(results, ElementsAre("bar", "foo")); + + // Iterates through keys that start with "b" + IcingDynamicTrie::Iterator iterator_b(trie, "b"); + results.clear(); + for (; iterator_b.IsValid(); iterator_b.Advance()) { + results.emplace_back(iterator_b.GetKey()); + } + EXPECT_THAT(results, ElementsAre("bar")); +} + +TEST_F(IcingDynamicTrieTest, DeletingNonExistingKeyShouldReturnTrue) { + IcingFilesystem filesystem; + IcingDynamicTrie trie(trie_files_prefix_, IcingDynamicTrie::RuntimeOptions(), + &filesystem); + ASSERT_TRUE(trie.CreateIfNotExist(IcingDynamicTrie::Options())); + ASSERT_TRUE(trie.Init()); + + // Inserts some keys. 
+ uint32_t value = 1; + ASSERT_TRUE(trie.Insert("bar", &value)); + ASSERT_TRUE(trie.Insert("bed", &value)); + + // "ba" and bedroom are not keys in the trie. + EXPECT_TRUE(trie.Delete("ba")); + EXPECT_TRUE(trie.Delete("bedroom")); + + // The original keys are not affected. + EXPECT_TRUE(trie.Find("bar", &value)); + EXPECT_TRUE(trie.Find("bed", &value)); +} + } // namespace // The tests below are accessing private methods and fields of IcingDynamicTrie diff --git a/icing/legacy/index/icing-mock-filesystem.h b/icing/legacy/index/icing-mock-filesystem.h index 31e012a..5a064ea 100644 --- a/icing/legacy/index/icing-mock-filesystem.h +++ b/icing/legacy/index/icing-mock-filesystem.h @@ -31,65 +31,78 @@ namespace lib { class IcingMockFilesystem : public IcingFilesystem { public: - MOCK_CONST_METHOD1(DeleteFile, bool(const char *file_name)); + MOCK_METHOD(bool, DeleteFile, (const char *file_name), (const, override)); - MOCK_CONST_METHOD1(DeleteDirectory, bool(const char *dir_name)); + MOCK_METHOD(bool, DeleteDirectory, (const char *dir_name), (const, override)); - MOCK_CONST_METHOD1(DeleteDirectoryRecursively, bool(const char *dir_name)); + MOCK_METHOD(bool, DeleteDirectoryRecursively, (const char *dir_name), + (const, override)); - MOCK_CONST_METHOD1(FileExists, bool(const char *file_name)); + MOCK_METHOD(bool, FileExists, (const char *file_name), (const, override)); - MOCK_CONST_METHOD1(DirectoryExists, bool(const char *dir_name)); + MOCK_METHOD(bool, DirectoryExists, (const char *dir_name), (const, override)); - MOCK_CONST_METHOD1(GetBasenameIndex, int(const char *file_name)); + MOCK_METHOD(int, GetBasenameIndex, (const char *file_name), + (const, override)); - MOCK_CONST_METHOD1(GetBasename, std::string(const char *file_name)); + MOCK_METHOD(std::string, GetBasename, (const char *file_name), + (const, override)); - MOCK_CONST_METHOD1(GetDirname, std::string(const char *file_name)); + MOCK_METHOD(std::string, GetDirname, (const char *file_name), + (const, override)); - 
MOCK_CONST_METHOD2(ListDirectory, bool(const char *dir_name, - std::vector<std::string> *entries)); + MOCK_METHOD(bool, ListDirectory, + (const char *dir_name, std::vector<std::string> *entries), + (const, override)); - MOCK_CONST_METHOD2(GetMatchingFiles, - bool(const char *glob, std::vector<std::string> *matches)); + MOCK_METHOD(bool, GetMatchingFiles, + (const char *glob, std::vector<std::string> *matches), + (const, override)); - MOCK_CONST_METHOD1(OpenForWrite, int(const char *file_name)); + MOCK_METHOD(int, OpenForWrite, (const char *file_name), (const, override)); - MOCK_CONST_METHOD1(OpenForAppend, int(const char *file_name)); + MOCK_METHOD(int, OpenForAppend, (const char *file_name), (const, override)); - MOCK_CONST_METHOD1(OpenForRead, int(const char *file_name)); + MOCK_METHOD(int, OpenForRead, (const char *file_name), (const, override)); - MOCK_CONST_METHOD1(GetFileSize, uint64_t(int fd)); + MOCK_METHOD(uint64_t, GetFileSize, (int fd), (const, override)); - MOCK_CONST_METHOD1(GetFileSize, uint64_t(const char *filename)); + MOCK_METHOD(uint64_t, GetFileSize, (const char *filename), (const, override)); - MOCK_CONST_METHOD2(Truncate, bool(int fd, uint64_t new_size)); + MOCK_METHOD(bool, Truncate, (int fd, uint64_t new_size), (const, override)); - MOCK_CONST_METHOD2(Truncate, bool(const char *filename, uint64_t new_size)); + MOCK_METHOD(bool, Truncate, (const char *filename, uint64_t new_size), + (const, override)); - MOCK_CONST_METHOD2(Grow, bool(int fd, uint64_t new_size)); + MOCK_METHOD(bool, Grow, (int fd, uint64_t new_size), (const, override)); - MOCK_CONST_METHOD3(Write, bool(int fd, const void *data, size_t data_size)); - MOCK_CONST_METHOD4(PWrite, bool(int fd, off_t offset, const void *data, - size_t data_size)); + MOCK_METHOD(bool, Write, (int fd, const void *data, size_t data_size), + (const, override)); + MOCK_METHOD(bool, PWrite, + (int fd, off_t offset, const void *data, size_t data_size), + (const, override)); - MOCK_CONST_METHOD1(DataSync, 
bool(int fd)); + MOCK_METHOD(bool, DataSync, (int fd), (const, override)); - MOCK_CONST_METHOD2(RenameFile, - bool(const char *old_name, const char *new_name)); + MOCK_METHOD(bool, RenameFile, (const char *old_name, const char *new_name), + (const, override)); - MOCK_CONST_METHOD2(SwapFiles, bool(const char *one, const char *two)); + MOCK_METHOD(bool, SwapFiles, (const char *one, const char *two), + (const, override)); - MOCK_CONST_METHOD1(CreateDirectory, bool(const char *dir_name)); + MOCK_METHOD(bool, CreateDirectory, (const char *dir_name), (const, override)); - MOCK_CONST_METHOD1(CreateDirectoryRecursively, bool(const char *dir_name)); + MOCK_METHOD(bool, CreateDirectoryRecursively, (const char *dir_name), + (const, override)); - MOCK_CONST_METHOD2(CopyFile, bool(const char *src, const char *dst)); + MOCK_METHOD(bool, CopyFile, (const char *src, const char *dst), + (const, override)); - MOCK_CONST_METHOD4(ComputeChecksum, bool(int fd, uint32_t *checksum, - uint64_t offset, uint64_t length)); + MOCK_METHOD(bool, ComputeChecksum, + (int fd, uint32_t *checksum, uint64_t offset, uint64_t length), + (const, override)); - MOCK_CONST_METHOD1(GetDiskUsage, uint64_t(const char *path)); + MOCK_METHOD(uint64_t, GetDiskUsage, (const char *path), (const, override)); }; } // namespace lib diff --git a/icing/query/query-processor_benchmark.cc b/icing/query/query-processor_benchmark.cc index 000bf3a..29404d9 100644 --- a/icing/query/query-processor_benchmark.cc +++ b/icing/query/query-processor_benchmark.cc @@ -30,6 +30,7 @@ #include "icing/tokenization/language-segmenter-factory.h" #include "icing/transform/normalizer-factory.h" #include "icing/util/logging.h" +#include "unicode/uloc.h" // Run on a Linux workstation: // $ blaze build -c opt --dynamic_mode=off --copt=-gmlt @@ -107,8 +108,9 @@ void BM_QueryOneTerm(benchmark::State& state) { } std::unique_ptr<Index> index = CreateIndex(icing_filesystem, index_dir); + language_segmenter_factory::SegmenterOptions 
options(ULOC_US); std::unique_ptr<LanguageSegmenter> language_segmenter = - language_segmenter_factory::Create().ValueOrDie(); + language_segmenter_factory::Create(std::move(options)).ValueOrDie(); std::unique_ptr<Normalizer> normalizer = CreateNormalizer(); FakeClock fake_clock; @@ -219,8 +221,9 @@ void BM_QueryFiveTerms(benchmark::State& state) { } std::unique_ptr<Index> index = CreateIndex(icing_filesystem, index_dir); + language_segmenter_factory::SegmenterOptions options(ULOC_US); std::unique_ptr<LanguageSegmenter> language_segmenter = - language_segmenter_factory::Create().ValueOrDie(); + language_segmenter_factory::Create(std::move(options)).ValueOrDie(); std::unique_ptr<Normalizer> normalizer = CreateNormalizer(); FakeClock fake_clock; @@ -349,8 +352,9 @@ void BM_QueryDiacriticTerm(benchmark::State& state) { } std::unique_ptr<Index> index = CreateIndex(icing_filesystem, index_dir); + language_segmenter_factory::SegmenterOptions options(ULOC_US); std::unique_ptr<LanguageSegmenter> language_segmenter = - language_segmenter_factory::Create().ValueOrDie(); + language_segmenter_factory::Create(std::move(options)).ValueOrDie(); std::unique_ptr<Normalizer> normalizer = CreateNormalizer(); FakeClock fake_clock; @@ -464,8 +468,9 @@ void BM_QueryHiragana(benchmark::State& state) { } std::unique_ptr<Index> index = CreateIndex(icing_filesystem, index_dir); + language_segmenter_factory::SegmenterOptions options(ULOC_US); std::unique_ptr<LanguageSegmenter> language_segmenter = - language_segmenter_factory::Create().ValueOrDie(); + language_segmenter_factory::Create(std::move(options)).ValueOrDie(); std::unique_ptr<Normalizer> normalizer = CreateNormalizer(); FakeClock fake_clock; diff --git a/icing/result/result-retriever_test.cc b/icing/result/result-retriever_test.cc index 36dbfd9..0d2c2c5 100644 --- a/icing/result/result-retriever_test.cc +++ b/icing/result/result-retriever_test.cc @@ -36,6 +36,7 @@ #include "icing/tokenization/language-segmenter-factory.h" #include 
"icing/transform/normalizer-factory.h" #include "icing/transform/normalizer.h" +#include "unicode/uloc.h" namespace icing { namespace lib { @@ -59,8 +60,10 @@ class ResultRetrieverTest : public testing::Test { // File generated via icu_data_file rule in //icing/BUILD. icu_data_file_helper::SetUpICUDataFile( GetTestFilePath("icing/icu.dat"))); - ICING_ASSERT_OK_AND_ASSIGN(language_segmenter_, - language_segmenter_factory::Create()); + language_segmenter_factory::SegmenterOptions options(ULOC_US); + ICING_ASSERT_OK_AND_ASSIGN( + language_segmenter_, + language_segmenter_factory::Create(std::move(options))); ICING_ASSERT_OK_AND_ASSIGN(schema_store_, SchemaStore::Create(&filesystem_, test_dir_)); diff --git a/icing/result/snippet-retriever_test.cc b/icing/result/snippet-retriever_test.cc index 3b3bf61..676ea92 100644 --- a/icing/result/snippet-retriever_test.cc +++ b/icing/result/snippet-retriever_test.cc @@ -40,6 +40,7 @@ #include "icing/tokenization/language-segmenter.h" #include "icing/transform/normalizer-factory.h" #include "icing/transform/normalizer.h" +#include "unicode/uloc.h" namespace icing { namespace lib { @@ -60,8 +61,10 @@ class SnippetRetrieverTest : public testing::Test { // File generated via icu_data_file rule in //icing/BUILD. 
icu_data_file_helper::SetUpICUDataFile( GetTestFilePath("icing/icu.dat"))); - ICING_ASSERT_OK_AND_ASSIGN(language_segmenter_, - language_segmenter_factory::Create()); + language_segmenter_factory::SegmenterOptions options(ULOC_US); + ICING_ASSERT_OK_AND_ASSIGN( + language_segmenter_, + language_segmenter_factory::Create(std::move(options))); // Setup the schema ICING_ASSERT_OK_AND_ASSIGN(schema_store_, diff --git a/icing/store/document-filter-data.h b/icing/store/document-filter-data.h index 198bc49..3970132 100644 --- a/icing/store/document-filter-data.h +++ b/icing/store/document-filter-data.h @@ -25,6 +25,7 @@ namespace icing { namespace lib { using SchemaTypeId = int16_t; +inline constexpr SchemaTypeId kInvalidSchemaTypeId = -1; class DocumentFilterData { public: diff --git a/icing/store/document-store.cc b/icing/store/document-store.cc index 93cebaa..7577f6b 100644 --- a/icing/store/document-store.cc +++ b/icing/store/document-store.cc @@ -329,8 +329,21 @@ libtextclassifier3::Status DocumentStore::RegenerateDerivedFiles() { auto iterator = document_log_->GetIterator(); auto iterator_status = iterator.Advance(); while (iterator_status.ok()) { - ICING_ASSIGN_OR_RETURN(DocumentWrapper document_wrapper, - document_log_->ReadProto(iterator.GetOffset())); + libtextclassifier3::StatusOr<DocumentWrapper> document_wrapper_or = + document_log_->ReadProto(iterator.GetOffset()); + + if (absl_ports::IsNotFound(document_wrapper_or.status())) { + // The erased document still occupies 1 document id. + DocumentId new_document_id = document_id_mapper_->num_elements(); + ICING_RETURN_IF_ERROR(ClearDerivedData(new_document_id)); + iterator_status = iterator.Advance(); + continue; + } else if (!document_wrapper_or.ok()) { + return document_wrapper_or.status(); + } + + DocumentWrapper document_wrapper = + std::move(document_wrapper_or).ValueOrDie(); if (document_wrapper.deleted()) { if (!document_wrapper.document().uri().empty()) { // Individual document deletion. 
@@ -351,17 +364,22 @@ libtextclassifier3::Status DocumentStore::RegenerateDerivedFiles() { } } else if (!document_wrapper.document().namespace_().empty()) { // Namespace deletion. - ICING_RETURN_IF_ERROR(UpdateDerivedFilesNamespaceDeleted( - document_wrapper.document().namespace_())); - + ICING_ASSIGN_OR_RETURN( + NamespaceId namespace_id, + namespace_mapper_->Get(document_wrapper.document().namespace_())); + // Tombstone indicates it's a soft delete. + ICING_RETURN_IF_ERROR(BatchDelete(namespace_id, kInvalidSchemaTypeId, + /*soft_delete=*/true)); } else if (!document_wrapper.document().schema().empty()) { // SchemaType deletion. auto schema_type_id_or = schema_store_->GetSchemaTypeId( document_wrapper.document().schema()); if (schema_type_id_or.ok()) { - ICING_RETURN_IF_ERROR(UpdateDerivedFilesSchemaTypeDeleted( - schema_type_id_or.ValueOrDie())); + // Tombstone indicates it's a soft delete. + ICING_RETURN_IF_ERROR(BatchDelete(kInvalidNamespaceId, + schema_type_id_or.ValueOrDie(), + /*soft_delete=*/true)); } else { // The deleted schema type doesn't have a SchemaTypeId we can refer // to in the FilterCache. @@ -845,7 +863,8 @@ bool DocumentStore::DoesDocumentExist(DocumentId document_id) const { } libtextclassifier3::Status DocumentStore::Delete( - const std::string_view name_space, const std::string_view uri) { + const std::string_view name_space, const std::string_view uri, + bool soft_delete) { // Try to get the DocumentId first auto document_id_or = GetDocumentId(name_space, uri); if (!document_id_or.ok()) { @@ -865,25 +884,61 @@ libtextclassifier3::Status DocumentStore::Delete( ", uri: ", uri)); } + if (soft_delete) { + return SoftDelete(name_space, uri, document_id); + } else { + uint64_t document_log_offset = file_offset_or.ValueOrDie(); + return HardDelete(document_id, document_log_offset); + } +} + +libtextclassifier3::Status DocumentStore::Delete(DocumentId document_id, + bool soft_delete) { + // Copy out the document to get namespace and uri. 
+ ICING_ASSIGN_OR_RETURN(int64_t document_log_offset, + DoesDocumentExistAndGetFileOffset(document_id)); + + if (soft_delete) { + auto document_wrapper_or = document_log_->ReadProto(document_log_offset); + if (!document_wrapper_or.ok()) { + ICING_LOG(ERROR) << document_wrapper_or.status().error_message() + << "Failed to read from document log"; + return document_wrapper_or.status(); + } + DocumentWrapper document_wrapper = + std::move(document_wrapper_or).ValueOrDie(); + + return SoftDelete(document_wrapper.document().namespace_(), + document_wrapper.document().uri(), document_id); + } else { + return HardDelete(document_id, document_log_offset); + } +} + +libtextclassifier3::Status DocumentStore::SoftDelete( + std::string_view name_space, std::string_view uri, DocumentId document_id) { // Update ground truth first. - // To delete a proto we don't directly remove it. Instead, we mark it as - // deleted first by appending a tombstone of it and actually remove it from - // file later in Optimize() - // TODO(b/144458732): Implement a more robust version of ICING_RETURN_IF_ERROR - // that can support error logging. + // Mark the document as deleted by appending a tombstone of it and actually + // remove it from file later in Optimize() + // TODO(b/144458732): Implement a more robust version of + // ICING_RETURN_IF_ERROR that can support error logging. libtextclassifier3::Status status = document_log_->WriteProto(CreateDocumentTombstone(name_space, uri)) .status(); if (!status.ok()) { return absl_ports::Annotate( - status, absl_ports::StrCat("Failed to delete Document. namespace: ", + status, absl_ports::StrCat("Failed to delete Document. 
namespace:", name_space, ", uri: ", uri)); } - ICING_RETURN_IF_ERROR( - document_id_mapper_->Set(document_id_or.ValueOrDie(), kDocDeletedFlag)); + return document_id_mapper_->Set(document_id, kDocDeletedFlag); +} - return libtextclassifier3::Status::OK; +libtextclassifier3::Status DocumentStore::HardDelete( + DocumentId document_id, uint64_t document_log_offset) { + // Erases document proto. + ICING_RETURN_IF_ERROR(document_log_->EraseProto(document_log_offset)); + return ClearDerivedData(document_id); } libtextclassifier3::StatusOr<NamespaceId> DocumentStore::GetNamespaceId( @@ -899,7 +954,14 @@ DocumentStore::GetDocumentAssociatedScoreData(DocumentId document_id) const { << " from score_cache_"; return score_data_or.status(); } - return *std::move(score_data_or).ValueOrDie(); + + DocumentAssociatedScoreData document_associated_score_data = + *std::move(score_data_or).ValueOrDie(); + if (document_associated_score_data.document_score() < 0) { + // An negative / invalid score means that the score data has been deleted. + return absl_ports::NotFoundError("Document score data not found."); + } + return document_associated_score_data; } libtextclassifier3::StatusOr<DocumentFilterData> @@ -910,135 +972,157 @@ DocumentStore::GetDocumentFilterData(DocumentId document_id) const { << " from filter_cache_"; return filter_data_or.status(); } - return *std::move(filter_data_or).ValueOrDie(); + DocumentFilterData document_filter_data = + *std::move(filter_data_or).ValueOrDie(); + if (document_filter_data.namespace_id() == kInvalidNamespaceId) { + // An invalid namespace id means that the filter data has been deleted. 
+ return absl_ports::NotFoundError("Document filter data not found."); + } + return document_filter_data; } libtextclassifier3::Status DocumentStore::DeleteByNamespace( - std::string_view name_space) { + std::string_view name_space, bool soft_delete) { auto namespace_id_or = namespace_mapper_->Get(name_space); if (!namespace_id_or.ok()) { return absl_ports::Annotate( namespace_id_or.status(), - absl_ports::StrCat("Failed to delete by namespace. namespace: ", - name_space)); + absl_ports::StrCat("Failed to find namespace: ", name_space)); } + NamespaceId namespace_id = namespace_id_or.ValueOrDie(); - // Update ground truth first. - // To delete an entire namespace, we append a tombstone that only contains - // the deleted bit and the name of the deleted namespace. - // TODO(b/144458732): Implement a more robust version of - // ICING_RETURN_IF_ERROR that can support error logging. - libtextclassifier3::Status status = - document_log_->WriteProto(CreateNamespaceTombstone(name_space)).status(); - if (!status.ok()) { - ICING_LOG(ERROR) << status.error_message() - << "Failed to delete namespace. namespace = " - << name_space; - return status; + int num_updated_documents = 0; + if (soft_delete) { + // To delete an entire namespace, we append a tombstone that only contains + // the deleted bit and the name of the deleted namespace. + // TODO(b/144458732): Implement a more robust version of + // ICING_RETURN_IF_ERROR that can support error logging. + libtextclassifier3::Status status = + document_log_->WriteProto(CreateNamespaceTombstone(name_space)) + .status(); + if (!status.ok()) { + ICING_LOG(ERROR) << status.error_message() + << "Failed to delete namespace. 
namespace = " + << name_space; + return status; + } } - ICING_ASSIGN_OR_RETURN(bool updated_existing_document, - UpdateDerivedFilesNamespaceDeleted(name_space)); - if (!updated_existing_document) { + ICING_ASSIGN_OR_RETURN( + num_updated_documents, + BatchDelete(namespace_id, kInvalidSchemaTypeId, soft_delete)); + + if (num_updated_documents <= 0) { // Treat the fact that no existing documents had this namespace to be the // same as this namespace not existing at all. return absl_ports::NotFoundError( absl_ports::StrCat("Namespace '", name_space, "' doesn't exist")); } - return libtextclassifier3::Status::OK; -} -libtextclassifier3::StatusOr<bool> -DocumentStore::UpdateDerivedFilesNamespaceDeleted(std::string_view name_space) { - auto namespace_id_or = namespace_mapper_->Get(name_space); - if (!namespace_id_or.ok()) { - return namespace_id_or.status(); - } - - // Guaranteed to have a NamespaceId now. - NamespaceId namespace_id = namespace_id_or.ValueOrDie(); - - // Tracks if there were any existing documents with this namespace that we - // will mark as deleted. - bool updated_existing_document = false; - - // Traverse FilterCache and delete all docs that match namespace_id - for (DocumentId document_id = 0; document_id < filter_cache_->num_elements(); - ++document_id) { - // filter_cache_->Get can only fail if document_id is < 0 - // or >= filter_cache_->num_elements. So, this error SHOULD NEVER HAPPEN. - ICING_ASSIGN_OR_RETURN(const DocumentFilterData* data, - filter_cache_->Get(document_id)); - if (data->namespace_id() == namespace_id) { - if (DoesDocumentExist(document_id)) { - updated_existing_document = true; - } - - // docid_mapper_->Set can only fail if document_id is < 0 - // or >= docid_mapper_->num_elements. So the only possible way to get an - // error here would be if filter_cache_->num_elements > - // docid_mapper_->num_elements, which SHOULD NEVER HAPPEN. 
- ICING_RETURN_IF_ERROR( - document_id_mapper_->Set(document_id, kDocDeletedFlag)); - } - } - - return updated_existing_document; + return libtextclassifier3::Status::OK; } libtextclassifier3::Status DocumentStore::DeleteBySchemaType( - std::string_view schema_type) { + std::string_view schema_type, bool soft_delete) { auto schema_type_id_or = schema_store_->GetSchemaTypeId(schema_type); if (!schema_type_id_or.ok()) { return absl_ports::Annotate( schema_type_id_or.status(), - absl_ports::StrCat("Failed to delete by schema type. schema_type: ", + absl_ports::StrCat("Failed to find schema type. schema_type: ", schema_type)); } + SchemaTypeId schema_type_id = schema_type_id_or.ValueOrDie(); - // Update ground truth first. - // To delete an entire schema type, we append a tombstone that only contains - // the deleted bit and the name of the deleted schema type. - // TODO(b/144458732): Implement a more robust version of - // ICING_RETURN_IF_ERROR that can support error logging. - libtextclassifier3::Status status = - document_log_->WriteProto(CreateSchemaTypeTombstone(schema_type)) - .status(); - if (!status.ok()) { - ICING_LOG(ERROR) << status.error_message() - << "Failed to delete schema_type. schema_type = " - << schema_type; - return status; + int num_updated_documents = 0; + if (soft_delete) { + // To soft-delete an entire schema type, we append a tombstone that only + // contains the deleted bit and the name of the deleted schema type. + // TODO(b/144458732): Implement a more robust version of + // ICING_RETURN_IF_ERROR that can support error logging. + libtextclassifier3::Status status = + document_log_->WriteProto(CreateSchemaTypeTombstone(schema_type)) + .status(); + if (!status.ok()) { + ICING_LOG(ERROR) << status.error_message() + << "Failed to delete schema_type. 
schema_type = " + << schema_type; + return status; + } } - // Guaranteed to have a SchemaTypeId now - SchemaTypeId schema_type_id = schema_type_id_or.ValueOrDie(); + ICING_ASSIGN_OR_RETURN( + num_updated_documents, + BatchDelete(kInvalidNamespaceId, schema_type_id, soft_delete)); - ICING_RETURN_IF_ERROR(UpdateDerivedFilesSchemaTypeDeleted(schema_type_id)); + if (num_updated_documents <= 0) { + return absl_ports::NotFoundError(absl_ports::StrCat( + "No documents found with schema type '", schema_type, "'")); + } return libtextclassifier3::Status::OK; } -libtextclassifier3::Status DocumentStore::UpdateDerivedFilesSchemaTypeDeleted( - SchemaTypeId schema_type_id) { - // Traverse FilterCache and delete all docs that match schema_type_id. +libtextclassifier3::StatusOr<int> DocumentStore::BatchDelete( + NamespaceId namespace_id, SchemaTypeId schema_type_id, bool soft_delete) { + // Tracks if there were any existing documents with this namespace that we + // will mark as deleted. + int num_updated_documents = 0; + + // Traverse FilterCache and delete all docs that match namespace_id and + // schema_type_id. for (DocumentId document_id = 0; document_id < filter_cache_->num_elements(); ++document_id) { // filter_cache_->Get can only fail if document_id is < 0 // or >= filter_cache_->num_elements. So, this error SHOULD NEVER HAPPEN. ICING_ASSIGN_OR_RETURN(const DocumentFilterData* data, filter_cache_->Get(document_id)); - if (data->schema_type_id() == schema_type_id) { + + // Check namespace only when the input namespace id is valid. + if (namespace_id != kInvalidNamespaceId && + (data->namespace_id() == kInvalidNamespaceId || + data->namespace_id() != namespace_id)) { + // The document has already been hard-deleted or isn't from the desired + // namespace. + continue; + } + + // Check schema type only when the input schema type id is valid. 
+ if (schema_type_id != kInvalidSchemaTypeId && + (data->schema_type_id() == kInvalidSchemaTypeId || + data->schema_type_id() != schema_type_id)) { + // The document has already been hard-deleted or doesn't have the + // desired schema type. + continue; + } + + // The document has the desired namespace and schema type, it either exists + // or has been soft-deleted / expired. + if (soft_delete) { + if (DoesDocumentExist(document_id)) { + ++num_updated_documents; + } + // docid_mapper_->Set can only fail if document_id is < 0 // or >= docid_mapper_->num_elements. So the only possible way to get an // error here would be if filter_cache_->num_elements > // docid_mapper_->num_elements, which SHOULD NEVER HAPPEN. ICING_RETURN_IF_ERROR( document_id_mapper_->Set(document_id, kDocDeletedFlag)); + } else { + // Hard delete. + libtextclassifier3::Status delete_status = + Delete(document_id, /*soft_delete=*/false); + if (absl_ports::IsNotFound(delete_status)) { + continue; + } else if (!delete_status.ok()) { + // Real error, pass up. + return delete_status; + } + ++num_updated_documents; } } - return libtextclassifier3::Status::OK; + return num_updated_documents; } libtextclassifier3::Status DocumentStore::PersistToDisk() { @@ -1328,5 +1412,26 @@ libtextclassifier3::Status DocumentStore::UpdateFilterCache( return filter_cache_->Set(document_id, filter_data); } +libtextclassifier3::Status DocumentStore::ClearDerivedData( + DocumentId document_id) { + // We intentionally leave the data in key_mapper_ because locating that data + // requires fetching namespace and uri. Leaving data in key_mapper_ should be + // fine because the data is hashed. 
+ + ICING_RETURN_IF_ERROR(document_id_mapper_->Set(document_id, kDocDeletedFlag)); + + // Resets the score cache entry + ICING_RETURN_IF_ERROR(UpdateDocumentAssociatedScoreCache( + document_id, DocumentAssociatedScoreData(/*document_score=*/-1, + /*creation_timestamp_ms=*/-1))); + + // Resets the filter cache entry + ICING_RETURN_IF_ERROR(UpdateFilterCache( + document_id, DocumentFilterData(kInvalidNamespaceId, kInvalidSchemaTypeId, + /*expiration_timestamp_ms=*/-1))); + + return libtextclassifier3::Status::OK; +} + } // namespace lib } // namespace icing diff --git a/icing/store/document-store.h b/icing/store/document-store.h index 3f4b72f..2ac1c71 100644 --- a/icing/store/document-store.h +++ b/icing/store/document-store.h @@ -147,17 +147,41 @@ class DocumentStore { // boolean whether a document exists or not bool DoesDocumentExist(DocumentId document_id) const; - // Deletes the document identified by the given namespace and uri + // Deletes the document identified by the given namespace and uri. The + // document proto will be marked as deleted if 'soft_delete' is true, + // otherwise the document proto will be erased immediately. // - // NOTE: Space is not reclaimed for deleted documents until Optimize() is - // called. + // NOTE: + // 1. The soft deletion uses less CPU power, it can be applied on + // non-sensitive data. + // 2. Space is not reclaimed for deleted documents until Optimize() is + // called. // // Returns: // OK on success // NOT_FOUND if no document exists with namespace, uri // INTERNAL_ERROR on IO error libtextclassifier3::Status Delete(std::string_view name_space, - std::string_view uri); + std::string_view uri, + bool soft_delete = false); + + // Deletes the document identified by the given document_id. The + // document proto will be marked as deleted if 'soft_delete' is true, + // otherwise the document proto will be erased immediately. + // + // NOTE: + // 1. 
If possible, please use the other method Delete(name_space, uri, + // soft_delete) for soft deletes because we need namespace and uri to + // perform soft deletes. + // 2. Space is not reclaimed for deleted documents until Optimize() is + // called. + // + // Returns: + // OK on success + // INTERNAL_ERROR on IO error + // INVALID_ARGUMENT if document_id is invalid. + libtextclassifier3::Status Delete(DocumentId document_id, + bool soft_delete = false); // Returns the NamespaceId of the string namespace // @@ -180,6 +204,7 @@ class DocumentStore { // DocumentAssociatedScoreData on success // OUT_OF_RANGE if document_id is negative or exceeds previously seen // DocumentIds + // NOT_FOUND if no score data is found libtextclassifier3::StatusOr<DocumentAssociatedScoreData> GetDocumentAssociatedScoreData(DocumentId document_id) const; @@ -194,30 +219,43 @@ class DocumentStore { // DocumentFilterData on success // OUT_OF_RANGE if document_id is negative or exceeds previously seen // DocumentIds + // NOT_FOUND if no filter data is found libtextclassifier3::StatusOr<DocumentFilterData> GetDocumentFilterData( DocumentId document_id) const; - // Deletes all documents belonging to the given namespace. + // Deletes all documents belonging to the given namespace. The documents will + // be marked as deleted if 'soft_delete' is true, otherwise they will be + // erased immediately. // - // NOTE: Space is not reclaimed for deleted documents until Optimize() is - // called. + // NOTE: + // 1. The soft deletion uses less CPU power, it can be applied on + // non-sensitive data. + // 2. Space is not reclaimed for deleted documents until Optimize() is + // called. 
// // Returns: // OK on success // NOT_FOUND if namespace doesn't exist // INTERNAL_ERROR on IO error - libtextclassifier3::Status DeleteByNamespace(std::string_view name_space); + libtextclassifier3::Status DeleteByNamespace(std::string_view name_space, + bool soft_delete = false); - // Deletes all documents belonging to the given schema type + // Deletes all documents belonging to the given schema type. The documents + // will be marked as deleted if 'soft_delete' is true, otherwise they will be + // erased immediately. // - // NOTE: Space is not reclaimed for deleted documents until Optimize() is - // called. + // NOTE: + // 1. The soft deletion uses less CPU power, it can be applied on + // non-sensitive data. + // 2. Space is not reclaimed for deleted documents until Optimize() is + // called. // // Returns: // OK on success // NOT_FOUND if schema_type doesn't exist // INTERNAL_ERROR on IO error - libtextclassifier3::Status DeleteBySchemaType(std::string_view schema_type); + libtextclassifier3::Status DeleteBySchemaType(std::string_view schema_type, + bool soft_delete = false); // Syncs all the data and metadata changes to disk. // @@ -424,32 +462,42 @@ class DocumentStore { // INTERNAL on I/O error libtextclassifier3::Status UpdateHeader(const Crc32& checksum); - // Update derived files that `name_space` has been deleted. This is primarily - // useful if we're trying to update derived files when we've already seen a - // namespace tombstone, and don't need to write another tombstone. + // Helper function to do batch deletes. Documents with the given + // "namespace_id" and "schema_type_id" will be deleted. If callers don't need + // to specify the namespace or schema type, pass in kInvalidNamespaceId or + // kInvalidSchemaTypeId. The document protos will be marked as deleted if + // 'soft_delete' is true, otherwise the document protos with their derived + // data will be erased / cleared immediately. 
// // NOTE: Space is not reclaimed in the derived files until Optimize() is // called. // // Returns: - // bool on whether an existing document was actually updated to be deleted + // Number of documents that were actually updated to be deleted // INTERNAL_ERROR on IO error - libtextclassifier3::StatusOr<bool> UpdateDerivedFilesNamespaceDeleted( - std::string_view name_space); + libtextclassifier3::StatusOr<int> BatchDelete(NamespaceId namespace_id, + SchemaTypeId schema_type_id, + bool soft_delete); - // Update derived files that the schema type schema_type_id has been deleted. - // This is primarily useful if we're trying to update derived files when we've - // already seen a schema type tombstone, and don't need to write another - // tombstone. + // Marks the document identified by the given name_space, uri and document_id + // as deleted, to be removed later during Optimize(). // - // NOTE: Space is not reclaimed in the derived files until Optimize() is - // called. + // Returns: + // OK on success + // INTERNAL_ERROR on IO error + libtextclassifier3::Status SoftDelete(std::string_view name_space, + std::string_view uri, + DocumentId document_id); + + // Erases the document at the given document_log_offset from the document_log + // and clears the derived data identified by the given document_id. The space + // will be reclaimed later during Optimize(). // // Returns: // OK on success // INTERNAL_ERROR on IO error - libtextclassifier3::Status UpdateDerivedFilesSchemaTypeDeleted( - SchemaTypeId schema_type_id); + libtextclassifier3::Status HardDelete(DocumentId document_id, + uint64_t document_log_offset); // Helper method to find a DocumentId that is associated with the given // namespace and uri. @@ -488,6 +536,9 @@ class DocumentStore { // Updates the entry in the filter cache for document_id. 
libtextclassifier3::Status UpdateFilterCache( DocumentId document_id, const DocumentFilterData& filter_data); + + // Helper method to clear the derived data of a document + libtextclassifier3::Status ClearDerivedData(DocumentId document_id); }; } // namespace lib diff --git a/icing/store/document-store_test.cc b/icing/store/document-store_test.cc index ad56b9a..f857481 100644 --- a/icing/store/document-store_test.cc +++ b/icing/store/document-store_test.cc @@ -60,9 +60,6 @@ class DocumentStoreTest : public ::testing::Test { : test_dir_(GetTestTempDir() + "/icing"), document_store_dir_(test_dir_ + "/document_store"), schema_store_dir_(test_dir_ + "/schema_store") { - filesystem_.CreateDirectoryRecursively(test_dir_.c_str()); - filesystem_.CreateDirectoryRecursively(document_store_dir_.c_str()); - filesystem_.CreateDirectoryRecursively(schema_store_dir_.c_str()); test_document1_ = DocumentBuilder() .SetKey("icing", "email/1") @@ -88,6 +85,11 @@ class DocumentStoreTest : public ::testing::Test { } void SetUp() override { + filesystem_.DeleteDirectoryRecursively(test_dir_.c_str()); + filesystem_.CreateDirectoryRecursively(test_dir_.c_str()); + filesystem_.CreateDirectoryRecursively(document_store_dir_.c_str()); + filesystem_.CreateDirectoryRecursively(schema_store_dir_.c_str()); + SchemaProto schema; auto type_config = schema.add_types(); type_config->set_schema_type("email"); @@ -270,7 +272,7 @@ TEST_F(DocumentStoreTest, IsDocumentExisting) { IsFalse()); } -TEST_F(DocumentStoreTest, GetDeletedDocumentNotFound) { +TEST_F(DocumentStoreTest, GetSoftDeletedDocumentNotFound) { ICING_ASSERT_OK_AND_ASSIGN( std::unique_ptr<DocumentStore> document_store, DocumentStore::Create(&filesystem_, document_store_dir_, &fake_clock_, @@ -281,7 +283,26 @@ TEST_F(DocumentStoreTest, GetDeletedDocumentNotFound) { IsOkAndHolds(EqualsProto(test_document1_))); ICING_EXPECT_OK(document_store->Delete(test_document1_.namespace_(), - test_document1_.uri())); + test_document1_.uri(), + 
/*soft_delete=*/true)); + EXPECT_THAT( + document_store->Get(test_document1_.namespace_(), test_document1_.uri()), + StatusIs(libtextclassifier3::StatusCode::NOT_FOUND)); +} + +TEST_F(DocumentStoreTest, GetHardDeletedDocumentNotFound) { + ICING_ASSERT_OK_AND_ASSIGN( + std::unique_ptr<DocumentStore> document_store, + DocumentStore::Create(&filesystem_, document_store_dir_, &fake_clock_, + schema_store_.get())); + ICING_EXPECT_OK(document_store->Put(DocumentProto(test_document1_))); + EXPECT_THAT( + document_store->Get(test_document1_.namespace_(), test_document1_.uri()), + IsOkAndHolds(EqualsProto(test_document1_))); + + ICING_EXPECT_OK(document_store->Delete(test_document1_.namespace_(), + test_document1_.uri(), + /*soft_delete=*/false)); EXPECT_THAT( document_store->Get(test_document1_.namespace_(), test_document1_.uri()), StatusIs(libtextclassifier3::StatusCode::NOT_FOUND)); @@ -343,20 +364,6 @@ TEST_F(DocumentStoreTest, GetInvalidDocumentId) { StatusIs(libtextclassifier3::StatusCode::NOT_FOUND)); } -TEST_F(DocumentStoreTest, DeleteOk) { - ICING_ASSERT_OK_AND_ASSIGN( - std::unique_ptr<DocumentStore> doc_store, - DocumentStore::Create(&filesystem_, document_store_dir_, &fake_clock_, - schema_store_.get())); - - // Get() after Delete() returns NOT_FOUND - ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id, - doc_store->Put(DocumentProto(test_document1_))); - EXPECT_THAT(doc_store->Delete("icing", "email/1"), IsOk()); - EXPECT_THAT(doc_store->Get(document_id), - StatusIs(libtextclassifier3::StatusCode::NOT_FOUND)); -} - TEST_F(DocumentStoreTest, DeleteNonexistentDocumentNotFound) { ICING_ASSERT_OK_AND_ASSIGN( std::unique_ptr<DocumentStore> document_store, @@ -394,7 +401,7 @@ TEST_F(DocumentStoreTest, DeleteAlreadyDeletedDocumentNotFound) { StatusIs(libtextclassifier3::StatusCode::NOT_FOUND)); } -TEST_F(DocumentStoreTest, DeleteByNamespaceOk) { +TEST_F(DocumentStoreTest, SoftDeleteByNamespaceOk) { ICING_ASSERT_OK_AND_ASSIGN( std::unique_ptr<DocumentStore> doc_store, 
DocumentStore::Create(&filesystem_, document_store_dir_, &fake_clock_, @@ -422,7 +429,8 @@ TEST_F(DocumentStoreTest, DeleteByNamespaceOk) { // DELETE namespace.1. document1 and document 4 should be deleted. document2 // and document3 should still be retrievable. - ICING_EXPECT_OK(doc_store->DeleteByNamespace("namespace.1")); + ICING_EXPECT_OK( + doc_store->DeleteByNamespace("namespace.1", /*soft_delete=*/true)); EXPECT_THAT(doc_store->Get(document1.namespace_(), document1.uri()), StatusIs(libtextclassifier3::StatusCode::NOT_FOUND)); EXPECT_THAT(doc_store->Get(document2.namespace_(), document2.uri()), @@ -433,7 +441,67 @@ TEST_F(DocumentStoreTest, DeleteByNamespaceOk) { StatusIs(libtextclassifier3::StatusCode::NOT_FOUND)); } -TEST_F(DocumentStoreTest, DeleteByNamespaceNonexistentNamespaceNotFound) { +TEST_F(DocumentStoreTest, HardDeleteByNamespaceOk) { + ICING_ASSERT_OK_AND_ASSIGN( + std::unique_ptr<DocumentStore> doc_store, + DocumentStore::Create(&filesystem_, document_store_dir_, &fake_clock_, + schema_store_.get())); + + DocumentProto document1 = test_document1_; + document1.set_namespace_("namespace.1"); + document1.set_uri("uri1"); + ICING_ASSERT_OK(doc_store->Put(document1)); + + DocumentProto document2 = test_document1_; + document2.set_namespace_("namespace.2"); + document2.set_uri("uri1"); + ICING_ASSERT_OK(doc_store->Put(document2)); + + DocumentProto document3 = test_document1_; + document3.set_namespace_("namespace.3"); + document3.set_uri("uri1"); + ICING_ASSERT_OK(doc_store->Put(document3)); + + DocumentProto document4 = test_document1_; + document4.set_namespace_("namespace.1"); + document4.set_uri("uri2"); + ICING_ASSERT_OK(doc_store->Put(document4)); + + // DELETE namespace.1. document1 and document 4 should be deleted. document2 + // and document3 should still be retrievable. 
+ ICING_EXPECT_OK( + doc_store->DeleteByNamespace("namespace.1", /*soft_delete=*/false)); + EXPECT_THAT(doc_store->Get(document1.namespace_(), document1.uri()), + StatusIs(libtextclassifier3::StatusCode::NOT_FOUND)); + EXPECT_THAT(doc_store->Get(document2.namespace_(), document2.uri()), + IsOkAndHolds(EqualsProto(document2))); + EXPECT_THAT(doc_store->Get(document3.namespace_(), document3.uri()), + IsOkAndHolds(EqualsProto(document3))); + EXPECT_THAT(doc_store->Get(document4.namespace_(), document4.uri()), + StatusIs(libtextclassifier3::StatusCode::NOT_FOUND)); +} + +TEST_F(DocumentStoreTest, SoftDeleteByNamespaceNonexistentNamespaceNotFound) { + ICING_ASSERT_OK_AND_ASSIGN( + std::unique_ptr<DocumentStore> doc_store, + DocumentStore::Create(&filesystem_, document_store_dir_, &fake_clock_, + schema_store_.get())); + + // Validates that deleting something non-existing won't append anything to + // ground truth + int64_t ground_truth_size_before = filesystem_.GetFileSize( + absl_ports::StrCat(document_store_dir_, "/document_log").c_str()); + + EXPECT_THAT(doc_store->DeleteByNamespace("nonexistent_namespace", + /*soft_delete=*/true), + StatusIs(libtextclassifier3::StatusCode::NOT_FOUND)); + + int64_t ground_truth_size_after = filesystem_.GetFileSize( + absl_ports::StrCat(document_store_dir_, "/document_log").c_str()); + EXPECT_THAT(ground_truth_size_before, Eq(ground_truth_size_after)); +} + +TEST_F(DocumentStoreTest, HardDeleteByNamespaceNonexistentNamespaceNotFound) { ICING_ASSERT_OK_AND_ASSIGN( std::unique_ptr<DocumentStore> doc_store, DocumentStore::Create(&filesystem_, document_store_dir_, &fake_clock_, @@ -444,7 +512,8 @@ TEST_F(DocumentStoreTest, DeleteByNamespaceNonexistentNamespaceNotFound) { int64_t ground_truth_size_before = filesystem_.GetFileSize( absl_ports::StrCat(document_store_dir_, "/document_log").c_str()); - EXPECT_THAT(doc_store->DeleteByNamespace("nonexistent_namespace"), + EXPECT_THAT(doc_store->DeleteByNamespace("nonexistent_namespace", + 
/*soft_delete=*/false), StatusIs(libtextclassifier3::StatusCode::NOT_FOUND)); int64_t ground_truth_size_after = filesystem_.GetFileSize( @@ -452,7 +521,7 @@ TEST_F(DocumentStoreTest, DeleteByNamespaceNonexistentNamespaceNotFound) { EXPECT_THAT(ground_truth_size_before, Eq(ground_truth_size_after)); } -TEST_F(DocumentStoreTest, DeleteByNamespaceNoExistingDocumentsNotFound) { +TEST_F(DocumentStoreTest, SoftDeleteByNamespaceNoExistingDocumentsNotFound) { ICING_ASSERT_OK_AND_ASSIGN( std::unique_ptr<DocumentStore> document_store, DocumentStore::Create(&filesystem_, document_store_dir_, &fake_clock_, @@ -464,7 +533,25 @@ TEST_F(DocumentStoreTest, DeleteByNamespaceNoExistingDocumentsNotFound) { // At this point, there are no existing documents with the namespace, even // though Icing's derived files know about this namespace. We should still // return NOT_FOUND since nothing existing has this namespace. - EXPECT_THAT(document_store->DeleteByNamespace(test_document1_.namespace_()), + EXPECT_THAT(document_store->DeleteByNamespace(test_document1_.namespace_(), + /*soft_delete=*/true), + StatusIs(libtextclassifier3::StatusCode::NOT_FOUND)); +} + +TEST_F(DocumentStoreTest, HardDeleteByNamespaceNoExistingDocumentsNotFound) { + ICING_ASSERT_OK_AND_ASSIGN( + std::unique_ptr<DocumentStore> document_store, + DocumentStore::Create(&filesystem_, document_store_dir_, &fake_clock_, + schema_store_.get())); + ICING_EXPECT_OK(document_store->Put(test_document1_)); + ICING_EXPECT_OK(document_store->Delete(test_document1_.namespace_(), + test_document1_.uri())); + + // At this point, there are no existing documents with the namespace, even + // though Icing's derived files know about this namespace. We should still + // return NOT_FOUND since nothing existing has this namespace. 
+ EXPECT_THAT(document_store->DeleteByNamespace(test_document1_.namespace_(), + /*soft_delete=*/false), StatusIs(libtextclassifier3::StatusCode::NOT_FOUND)); } @@ -536,7 +623,7 @@ TEST_F(DocumentStoreTest, DeleteByNamespaceRecoversOk) { StatusIs(libtextclassifier3::StatusCode::NOT_FOUND)); } -TEST_F(DocumentStoreTest, DeleteBySchemaTypeOk) { +TEST_F(DocumentStoreTest, SoftDeleteBySchemaTypeOk) { SchemaProto schema; auto type_config = schema.add_types(); type_config->set_schema_type("email"); @@ -593,7 +680,8 @@ TEST_F(DocumentStoreTest, DeleteBySchemaTypeOk) { // Delete the "email" type and ensure that it works across both // email_document's namespaces. And that other documents aren't affected. - ICING_EXPECT_OK(document_store->DeleteBySchemaType("email")); + ICING_EXPECT_OK( + document_store->DeleteBySchemaType("email", /*soft_delete=*/true)); EXPECT_THAT(document_store->Get(email_1_document_id), StatusIs(libtextclassifier3::StatusCode::NOT_FOUND)); EXPECT_THAT(document_store->Get(email_2_document_id), @@ -604,7 +692,8 @@ TEST_F(DocumentStoreTest, DeleteBySchemaTypeOk) { IsOkAndHolds(EqualsProto(person_document))); // Delete the "message" type and check that other documents aren't affected - ICING_EXPECT_OK(document_store->DeleteBySchemaType("message")); + ICING_EXPECT_OK( + document_store->DeleteBySchemaType("message", /*soft_delete=*/true)); EXPECT_THAT(document_store->Get(email_1_document_id), StatusIs(libtextclassifier3::StatusCode::NOT_FOUND)); EXPECT_THAT(document_store->Get(email_2_document_id), @@ -615,7 +704,109 @@ TEST_F(DocumentStoreTest, DeleteBySchemaTypeOk) { IsOkAndHolds(EqualsProto(person_document))); } -TEST_F(DocumentStoreTest, DeleteBySchemaTypeNonexistentSchemaTypeNotFound) { +TEST_F(DocumentStoreTest, HardDeleteBySchemaTypeOk) { + SchemaProto schema; + auto type_config = schema.add_types(); + type_config->set_schema_type("email"); + type_config = schema.add_types(); + type_config->set_schema_type("message"); + type_config = 
schema.add_types(); + type_config->set_schema_type("person"); + + std::string schema_store_dir = schema_store_dir_ + "_custom"; + filesystem_.DeleteDirectoryRecursively(schema_store_dir.c_str()); + filesystem_.CreateDirectoryRecursively(schema_store_dir.c_str()); + ICING_ASSERT_OK_AND_ASSIGN( + std::unique_ptr<SchemaStore> schema_store, + SchemaStore::Create(&filesystem_, schema_store_dir)); + + ICING_ASSERT_OK(schema_store->SetSchema(schema)); + + ICING_ASSERT_OK_AND_ASSIGN( + std::unique_ptr<DocumentStore> document_store, + DocumentStore::Create(&filesystem_, document_store_dir_, &fake_clock_, + schema_store.get())); + + DocumentProto email_document_1 = DocumentBuilder() + .SetKey("namespace1", "1") + .SetSchema("email") + .SetCreationTimestampMs(1) + .Build(); + ICING_ASSERT_OK_AND_ASSIGN(DocumentId email_1_document_id, + document_store->Put(email_document_1)); + + DocumentProto email_document_2 = DocumentBuilder() + .SetKey("namespace2", "2") + .SetSchema("email") + .SetCreationTimestampMs(1) + .Build(); + ICING_ASSERT_OK_AND_ASSIGN(DocumentId email_2_document_id, + document_store->Put(email_document_2)); + + DocumentProto message_document = DocumentBuilder() + .SetKey("namespace", "3") + .SetSchema("message") + .SetCreationTimestampMs(1) + .Build(); + ICING_ASSERT_OK_AND_ASSIGN(DocumentId message_document_id, + document_store->Put(message_document)); + + DocumentProto person_document = DocumentBuilder() + .SetKey("namespace", "4") + .SetSchema("person") + .SetCreationTimestampMs(1) + .Build(); + ICING_ASSERT_OK_AND_ASSIGN(DocumentId person_document_id, + document_store->Put(person_document)); + + // Delete the "email" type and ensure that it works across both + // email_document's namespaces. And that other documents aren't affected. 
+ ICING_EXPECT_OK( + document_store->DeleteBySchemaType("email", /*soft_delete=*/false)); + EXPECT_THAT(document_store->Get(email_1_document_id), + StatusIs(libtextclassifier3::StatusCode::NOT_FOUND)); + EXPECT_THAT(document_store->Get(email_2_document_id), + StatusIs(libtextclassifier3::StatusCode::NOT_FOUND)); + EXPECT_THAT(document_store->Get(message_document_id), + IsOkAndHolds(EqualsProto(message_document))); + EXPECT_THAT(document_store->Get(person_document_id), + IsOkAndHolds(EqualsProto(person_document))); + + // Delete the "message" type and check that other documents aren't affected + ICING_EXPECT_OK( + document_store->DeleteBySchemaType("message", /*soft_delete=*/false)); + EXPECT_THAT(document_store->Get(email_1_document_id), + StatusIs(libtextclassifier3::StatusCode::NOT_FOUND)); + EXPECT_THAT(document_store->Get(email_2_document_id), + StatusIs(libtextclassifier3::StatusCode::NOT_FOUND)); + EXPECT_THAT(document_store->Get(message_document_id), + StatusIs(libtextclassifier3::StatusCode::NOT_FOUND)); + EXPECT_THAT(document_store->Get(person_document_id), + IsOkAndHolds(EqualsProto(person_document))); +} + +TEST_F(DocumentStoreTest, SoftDeleteBySchemaTypeNonexistentSchemaTypeNotFound) { + ICING_ASSERT_OK_AND_ASSIGN( + std::unique_ptr<DocumentStore> document_store, + DocumentStore::Create(&filesystem_, document_store_dir_, &fake_clock_, + schema_store_.get())); + + // Validates that deleting something non-existing won't append anything to + // ground truth + int64_t ground_truth_size_before = filesystem_.GetFileSize( + absl_ports::StrCat(document_store_dir_, "/document_log").c_str()); + + EXPECT_THAT(document_store->DeleteBySchemaType("nonexistent_type", + /*soft_delete=*/true), + StatusIs(libtextclassifier3::StatusCode::NOT_FOUND)); + + int64_t ground_truth_size_after = filesystem_.GetFileSize( + absl_ports::StrCat(document_store_dir_, "/document_log").c_str()); + + EXPECT_THAT(ground_truth_size_before, Eq(ground_truth_size_after)); +} + 
+TEST_F(DocumentStoreTest, HardDeleteBySchemaTypeNonexistentSchemaTypeNotFound) { ICING_ASSERT_OK_AND_ASSIGN( std::unique_ptr<DocumentStore> document_store, DocumentStore::Create(&filesystem_, document_store_dir_, &fake_clock_, @@ -626,7 +817,8 @@ TEST_F(DocumentStoreTest, DeleteBySchemaTypeNonexistentSchemaTypeNotFound) { int64_t ground_truth_size_before = filesystem_.GetFileSize( absl_ports::StrCat(document_store_dir_, "/document_log").c_str()); - EXPECT_THAT(document_store->DeleteBySchemaType("nonexistent_type"), + EXPECT_THAT(document_store->DeleteBySchemaType("nonexistent_type", + /*soft_delete=*/false), StatusIs(libtextclassifier3::StatusCode::NOT_FOUND)); int64_t ground_truth_size_after = filesystem_.GetFileSize( @@ -635,7 +827,21 @@ TEST_F(DocumentStoreTest, DeleteBySchemaTypeNonexistentSchemaTypeNotFound) { EXPECT_THAT(ground_truth_size_before, Eq(ground_truth_size_after)); } -TEST_F(DocumentStoreTest, DeleteBySchemaTypeNoExistingDocumentsOk) { +TEST_F(DocumentStoreTest, SoftDeleteBySchemaTypeNoExistingDocumentsNotFound) { + ICING_ASSERT_OK_AND_ASSIGN( + std::unique_ptr<DocumentStore> document_store, + DocumentStore::Create(&filesystem_, document_store_dir_, &fake_clock_, + schema_store_.get())); + ICING_EXPECT_OK(document_store->Put(test_document1_)); + ICING_EXPECT_OK(document_store->Delete(test_document1_.namespace_(), + test_document1_.uri())); + + EXPECT_THAT(document_store->DeleteBySchemaType(test_document1_.schema(), + /*soft_delete=*/true), + StatusIs(libtextclassifier3::StatusCode::NOT_FOUND)); +} + +TEST_F(DocumentStoreTest, HardDeleteBySchemaTypeNoExistingDocumentsNotFound) { ICING_ASSERT_OK_AND_ASSIGN( std::unique_ptr<DocumentStore> document_store, DocumentStore::Create(&filesystem_, document_store_dir_, &fake_clock_, @@ -644,10 +850,9 @@ TEST_F(DocumentStoreTest, DeleteBySchemaTypeNoExistingDocumentsOk) { ICING_EXPECT_OK(document_store->Delete(test_document1_.namespace_(), test_document1_.uri())); - // At this point, there are no existing 
documents with the schema type, but we - still return OK because the SchemaStore is the ground truth on schemas and - knows about the type - ICING_EXPECT_OK(document_store->DeleteBySchemaType(test_document1_.schema())); + EXPECT_THAT(document_store->DeleteBySchemaType(test_document1_.schema(), + /*soft_delete=*/false), + StatusIs(libtextclassifier3::StatusCode::NOT_FOUND)); } TEST_F(DocumentStoreTest, DeleteBySchemaTypeRecoversOk) { @@ -1177,7 +1382,7 @@ TEST_F(DocumentStoreTest, NonexistentNamespaceNotFound) { StatusIs(libtextclassifier3::StatusCode::NOT_FOUND)); } -TEST_F(DocumentStoreTest, FilterCacheHoldsDeletedDocumentData) { +TEST_F(DocumentStoreTest, SoftDeletionDoesNotClearFilterCache) { ICING_ASSERT_OK_AND_ASSIGN( std::unique_ptr<DocumentStore> doc_store, DocumentStore::Create(&filesystem_, document_store_dir_, &fake_clock_, @@ -1193,14 +1398,71 @@ TEST_F(DocumentStoreTest, FilterCacheHoldsDeletedDocumentData) { /*schema_type_id=*/0, /*expiration_timestamp_ms=*/document1_expiration_timestamp_))); - // FilterCache doesn't care if the document has been deleted - ICING_ASSERT_OK(doc_store->Delete("icing", "email/1")); + ICING_ASSERT_OK(doc_store->Delete("icing", "email/1", /*soft_delete=*/true)); + // Associated entry of the soft-deleted document is NOT removed.
+ EXPECT_THAT(doc_store->GetDocumentFilterData(document_id).status(), IsOk()); +} + +TEST_F(DocumentStoreTest, HardDeleteClearsFilterCache) { + ICING_ASSERT_OK_AND_ASSIGN( + std::unique_ptr<DocumentStore> doc_store, + DocumentStore::Create(&filesystem_, document_store_dir_, &fake_clock_, + schema_store_.get())); + + ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id, + doc_store->Put(test_document1_)); + + EXPECT_THAT( doc_store->GetDocumentFilterData(document_id), IsOkAndHolds(DocumentFilterData( /*namespace_id=*/0, /*schema_type_id=*/0, /*expiration_timestamp_ms=*/document1_expiration_timestamp_))); + + ICING_ASSERT_OK(doc_store->Delete("icing", "email/1", /*soft_delete=*/false)); + // Associated entry of the deleted document is removed. + EXPECT_THAT(doc_store->GetDocumentFilterData(document_id), + StatusIs(libtextclassifier3::StatusCode::NOT_FOUND)); +} + +TEST_F(DocumentStoreTest, SoftDeletionDoesNotClearScoreCache) { + ICING_ASSERT_OK_AND_ASSIGN( + std::unique_ptr<DocumentStore> doc_store, + DocumentStore::Create(&filesystem_, document_store_dir_, &fake_clock_, + schema_store_.get())); + + ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id, + doc_store->Put(test_document1_)); + + EXPECT_THAT(doc_store->GetDocumentAssociatedScoreData(document_id), + IsOkAndHolds(DocumentAssociatedScoreData( + /*document_score=*/document1_score_, + /*creation_timestamp_ms=*/document1_creation_timestamp_))); + + ICING_ASSERT_OK(doc_store->Delete("icing", "email/1", /*soft_delete=*/true)); + // Associated entry of the soft-deleted document is NOT removed.
+ EXPECT_THAT(doc_store->GetDocumentAssociatedScoreData(document_id).status(), + IsOk()); +} + +TEST_F(DocumentStoreTest, HardDeleteClearsScoreCache) { + ICING_ASSERT_OK_AND_ASSIGN( + std::unique_ptr<DocumentStore> doc_store, + DocumentStore::Create(&filesystem_, document_store_dir_, &fake_clock_, + schema_store_.get())); + + ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id, + doc_store->Put(test_document1_)); + + EXPECT_THAT(doc_store->GetDocumentAssociatedScoreData(document_id), + IsOkAndHolds(DocumentAssociatedScoreData( + /*document_score=*/document1_score_, + /*creation_timestamp_ms=*/document1_creation_timestamp_))); + + ICING_ASSERT_OK(doc_store->Delete("icing", "email/1", /*soft_delete=*/false)); + // Associated entry of the deleted document is removed. + EXPECT_THAT(doc_store->GetDocumentAssociatedScoreData(document_id), + StatusIs(libtextclassifier3::StatusCode::NOT_FOUND)); } TEST_F(DocumentStoreTest, diff --git a/icing/store/key-mapper.h b/icing/store/key-mapper.h index 4571df2..23c7b69 100644 --- a/icing/store/key-mapper.h +++ b/icing/store/key-mapper.h @@ -84,6 +84,9 @@ class KeyMapper { // Returns any encountered IO errors. libtextclassifier3::StatusOr<T> Get(std::string_view key) const; + // Deletes data related to the given key. Returns true on success. + bool Delete(std::string_view key); + // Returns a map of values to keys. Empty map if the mapper is empty. 
std::unordered_map<T, std::string> GetValuesToKeys() const; @@ -255,6 +258,11 @@ libtextclassifier3::StatusOr<T> KeyMapper<T>::Get(std::string_view key) const { } template <typename T> +bool KeyMapper<T>::Delete(std::string_view key) { + return trie_.Delete(key); +} + +template <typename T> std::unordered_map<T, std::string> KeyMapper<T>::GetValuesToKeys() const { std::unordered_map<T, std::string> values_to_keys; for (IcingDynamicTrie::Iterator itr(trie_, /*prefix=*/""); itr.IsValid(); diff --git a/icing/store/namespace-id.h b/icing/store/namespace-id.h index 4225be3..374e7a8 100644 --- a/icing/store/namespace-id.h +++ b/icing/store/namespace-id.h @@ -22,6 +22,7 @@ namespace lib { // Id of unique namespace in DocumentProto. Generated in DocumentStore. using NamespaceId = int16_t; +inline constexpr NamespaceId kInvalidNamespaceId = -1; } // namespace lib } // namespace icing diff --git a/icing/store/usage-store.cc b/icing/store/usage-store.cc new file mode 100644 index 0000000..911c45a --- /dev/null +++ b/icing/store/usage-store.cc @@ -0,0 +1,193 @@ +// Copyright (C) 2019 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "icing/store/usage-store.h" + +#include "icing/file/file-backed-vector.h" +#include "icing/proto/usage.pb.h" +#include "icing/store/document-id.h" + +namespace icing { +namespace lib { + +namespace { +std::string MakeUsageScoreCacheFilename(const std::string& base_dir) { + return absl_ports::StrCat(base_dir, "/usage-scores"); +} +} // namespace + +libtextclassifier3::StatusOr<std::unique_ptr<UsageStore>> UsageStore::Create( + const Filesystem* filesystem, const std::string& base_dir) { + ICING_RETURN_ERROR_IF_NULL(filesystem); + + auto usage_score_cache_or = FileBackedVector<UsageScores>::Create( + *filesystem, MakeUsageScoreCacheFilename(base_dir), + MemoryMappedFile::READ_WRITE_AUTO_SYNC); + + if (!usage_score_cache_or.ok()) { + ICING_LOG(ERROR) << usage_score_cache_or.status().error_message() + << "Failed to initialize usage_score_cache"; + return usage_score_cache_or.status(); + } + + return std::unique_ptr<UsageStore>(new UsageStore( + std::move(usage_score_cache_or).ValueOrDie(), *filesystem, base_dir)); +} + +libtextclassifier3::Status UsageStore::AddUsageReport(const UsageReport& report, + DocumentId document_id) { + if (!IsDocumentIdValid(document_id)) { + return absl_ports::InvalidArgumentError(IcingStringUtil::StringPrintf( + "Document id %d is invalid.", document_id)); + } + + auto usage_scores_or = usage_score_cache_->Get(document_id); + + // OutOfRange means that the mapper hasn't seen this document id before, it's + // not an error here. + UsageScores usage_scores; + if (usage_scores_or.ok()) { + usage_scores = *std::move(usage_scores_or).ValueOrDie(); + } else if (!absl_ports::IsOutOfRange(usage_scores_or.status())) { + // Real error + return usage_scores_or.status(); + } + + // Update last used timestamps and type counts. The counts won't be + // incremented if they are already the maximum values. The timestamp from + // UsageReport is in milliseconds, we need to convert it to seconds. 
+ int64_t report_timestamp_s = report.usage_timestamp_ms() / 1000; + + switch (report.usage_type()) { + case UsageReport::USAGE_TYPE1: + if (report_timestamp_s > std::numeric_limits<uint32_t>::max()) { + usage_scores.usage_type1_last_used_timestamp_s = + std::numeric_limits<uint32_t>::max(); + } else if (report_timestamp_s > + usage_scores.usage_type1_last_used_timestamp_s) { + usage_scores.usage_type1_last_used_timestamp_s = report_timestamp_s; + } + + if (usage_scores.usage_type1_count < std::numeric_limits<int>::max()) { + ++usage_scores.usage_type1_count; + } + break; + case UsageReport::USAGE_TYPE2: + if (report_timestamp_s > std::numeric_limits<uint32_t>::max()) { + usage_scores.usage_type2_last_used_timestamp_s = + std::numeric_limits<uint32_t>::max(); + } else if (report_timestamp_s > + usage_scores.usage_type2_last_used_timestamp_s) { + usage_scores.usage_type2_last_used_timestamp_s = report_timestamp_s; + } + + if (usage_scores.usage_type2_count < std::numeric_limits<int>::max()) { + ++usage_scores.usage_type2_count; + } + break; + case UsageReport::USAGE_TYPE3: + if (report_timestamp_s > std::numeric_limits<uint32_t>::max()) { + usage_scores.usage_type3_last_used_timestamp_s = + std::numeric_limits<uint32_t>::max(); + } else if (report_timestamp_s > + usage_scores.usage_type3_last_used_timestamp_s) { + usage_scores.usage_type3_last_used_timestamp_s = report_timestamp_s; + } + + if (usage_scores.usage_type3_count < std::numeric_limits<int>::max()) { + ++usage_scores.usage_type3_count; + } + } + + // Write updated usage scores to file. + ICING_RETURN_IF_ERROR(usage_score_cache_->Set(document_id, usage_scores)); + + return libtextclassifier3::Status::OK; +} + +libtextclassifier3::Status UsageStore::DeleteUsageScores( + DocumentId document_id) { + if (!IsDocumentIdValid(document_id)) { + return absl_ports::InvalidArgumentError(IcingStringUtil::StringPrintf( + "Document id %d is invalid.", document_id)); + } + + // Clear all the scores of the document. 
+ ICING_RETURN_IF_ERROR(usage_score_cache_->Set(document_id, UsageScores())); + + return libtextclassifier3::Status::OK; +} + +libtextclassifier3::StatusOr<UsageStore::UsageScores> +UsageStore::GetUsageScores(DocumentId document_id) { + if (!IsDocumentIdValid(document_id)) { + return absl_ports::InvalidArgumentError(IcingStringUtil::StringPrintf( + "Document id %d is invalid.", document_id)); + } + + auto usage_scores_or = usage_score_cache_->Get(document_id); + if (absl_ports::IsOutOfRange(usage_scores_or.status())) { + // No usage scores found. Return the default scores. + return UsageScores(); + } else if (!usage_scores_or.ok()) { + // Pass up any other errors. + return usage_scores_or.status(); + } + + return *std::move(usage_scores_or).ValueOrDie(); +} + +libtextclassifier3::Status UsageStore::SetUsageScores( + DocumentId document_id, UsageScores usage_scores) { + if (!IsDocumentIdValid(document_id)) { + return absl_ports::InvalidArgumentError(IcingStringUtil::StringPrintf( + "Document id %d is invalid.", document_id)); + } + + ICING_RETURN_IF_ERROR(usage_score_cache_->Set(document_id, usage_scores)); + + return libtextclassifier3::Status::OK; +} + +libtextclassifier3::Status UsageStore::PersistToDisk() { + ICING_RETURN_IF_ERROR(usage_score_cache_->PersistToDisk()); + return libtextclassifier3::Status::OK; +} + +libtextclassifier3::Status UsageStore::Reset() { + // We delete all the scores by deleting the whole file. 
+ libtextclassifier3::Status status = FileBackedVector<int64_t>::Delete( + filesystem_, MakeUsageScoreCacheFilename(base_dir_)); + if (!status.ok()) { + ICING_LOG(ERROR) << status.error_message() + << "Failed to delete usage_score_cache"; + return status; + } + + // Create a new usage_score_cache + auto usage_score_cache_or = FileBackedVector<UsageScores>::Create( + filesystem_, MakeUsageScoreCacheFilename(base_dir_), + MemoryMappedFile::READ_WRITE_AUTO_SYNC); + if (!usage_score_cache_or.ok()) { + ICING_LOG(ERROR) << usage_score_cache_or.status().error_message() + << "Failed to re-create usage_score_cache"; + return usage_score_cache_or.status(); + } + usage_score_cache_ = std::move(usage_score_cache_or).ValueOrDie(); + + return libtextclassifier3::Status::OK; +} + +} // namespace lib +} // namespace icing diff --git a/icing/store/usage-store.h b/icing/store/usage-store.h new file mode 100644 index 0000000..9a8c286 --- /dev/null +++ b/icing/store/usage-store.h @@ -0,0 +1,160 @@ +// Copyright (C) 2019 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include <cstdint> + +#include "icing/file/file-backed-vector.h" +#include "icing/proto/usage.pb.h" +#include "icing/store/document-id.h" + +#ifndef ICING_STORE_USAGE_STORE_H_ +#define ICING_STORE_USAGE_STORE_H_ + +namespace icing { +namespace lib { + +// A storage class that maintains scores that are calculated based on usage +// reports. 
+class UsageStore { + public: + // Factory function to create a UsageStore instance. The base directory is + // used to persist usage scores. If a usage store was previously created with + // this directory, it will reload the files saved by the last instance. + // + // TODO(b/169594617): consider returning StatusOr<UsageStore> + // + // Returns: + // A UsageStore on success + // FAILED_PRECONDITION on any null pointer input + // INTERNAL_ERROR on I/O error + static libtextclassifier3::StatusOr<std::unique_ptr<UsageStore>> Create( + const Filesystem* filesystem, const std::string& base_dir); + + // The scores here reflect the timestamps and usage types defined in + // usage.proto. + struct UsageScores { + // The latest timestamp in seconds reported with custom usage type 1. + uint32_t usage_type1_last_used_timestamp_s = 0; + + // The latest timestamp in seconds reported with custom usage type 2. + uint32_t usage_type2_last_used_timestamp_s = 0; + + // The latest timestamp in seconds reported with custom usage type 3. + uint32_t usage_type3_last_used_timestamp_s = 0; + + // Count of reports with custom usage type 1 + int usage_type1_count = 0; + + // Count of reports with custom usage type 2 + int usage_type2_count = 0; + + // Count of reports with custom usage type 3 + int usage_type3_count = 0; + + bool operator==(const UsageScores& other) const { + return usage_type1_last_used_timestamp_s == + other.usage_type1_last_used_timestamp_s && + usage_type2_last_used_timestamp_s == + other.usage_type2_last_used_timestamp_s && + usage_type3_last_used_timestamp_s == + other.usage_type3_last_used_timestamp_s && + usage_type1_count == other.usage_type1_count && + usage_type2_count == other.usage_type2_count && + usage_type3_count == other.usage_type3_count; + } + }; + + // Adds one usage report. The corresponding usage scores of the specified + // document will be updated. 
+ // + // Note: changes are written to disk automatically, callers can also call + // PersistToDisk() to flush changes immediately. + // + // Returns: + // OK on success + // INVALID_ARGUMENT if document_id is invalid + // INTERNAL_ERROR on I/O errors. + libtextclassifier3::Status AddUsageReport(const UsageReport& report, + DocumentId document_id); + + // Deletes the usage scores of a document. + // + // Note: changes are written to disk automatically, callers can also call + // PersistToDisk() to flush changes immediately. + // + // Returns: + // OK on success + // INVALID_ARGUMENT if document_id is invalid + // INTERNAL_ERROR on I/O errors + libtextclassifier3::Status DeleteUsageScores(DocumentId document_id); + + // Gets the usage scores of a document. + // + // Returns: + // UsageScores on success + // INVALID_ARGUMENT if document_id is invalid + // NOT_FOUND if no scores are found for the document + // INTERNAL_ERROR on I/O errors + // + // TODO(b/169433395): return a pointer instead of an object. + libtextclassifier3::StatusOr<UsageScores> GetUsageScores( + DocumentId document_id); + + // Sets the usage scores of a document. + // + // Note: changes are written to disk automatically, callers can also call + // PersistToDisk() to flush changes immediately. + // + // Returns: + // OK on success + // INVALID_ARGUMENT if document_id is invalid + // INTERNAL_ERROR on I/O errors + libtextclassifier3::Status SetUsageScores(DocumentId document_id, + UsageScores usage_scores); + + // Syncs data to disk. + // + // Returns: + // OK on success + // INTERNAL on I/O error + libtextclassifier3::Status PersistToDisk(); + + // Deletes all usage data and re-initialize the storage. 
+ // + // Returns: + // OK on success + // INTERNAL_ERROR on I/O error + libtextclassifier3::Status Reset(); + + private: + explicit UsageStore(std::unique_ptr<FileBackedVector<UsageScores>> + document_id_to_scores_mapper, + const Filesystem& filesystem, std::string base_dir) + : filesystem_(filesystem), + base_dir_(std::move(base_dir)), + usage_score_cache_(std::move(document_id_to_scores_mapper)) {} + + const Filesystem& filesystem_; + + // Base directory where the files are located. + const std::string base_dir_; + + // Used to store the usage scores of documents. + std::unique_ptr<FileBackedVector<UsageScores>> usage_score_cache_; +}; + +} // namespace lib +} // namespace icing + +#endif // ICING_STORE_USAGE_STORE_H_ diff --git a/icing/store/usage-store_test.cc b/icing/store/usage-store_test.cc new file mode 100644 index 0000000..39985f0 --- /dev/null +++ b/icing/store/usage-store_test.cc @@ -0,0 +1,389 @@ +// Copyright (C) 2019 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "icing/store/usage-store.h" + +#include "gmock/gmock.h" +#include "gtest/gtest.h" +#include "icing/testing/common-matchers.h" +#include "icing/testing/tmp-directory.h" + +namespace icing { +namespace lib { + +namespace { +using ::testing::Eq; +using ::testing::Not; + +class UsageStoreTest : public testing::Test { + protected: + UsageStoreTest() : test_dir_(GetTestTempDir() + "/usage-store-test") {} + + void SetUp() override { + filesystem_.CreateDirectoryRecursively(test_dir_.c_str()); + } + + void TearDown() override { + filesystem_.DeleteDirectoryRecursively(test_dir_.c_str()); + } + + const Filesystem filesystem_; + const std::string test_dir_; +}; + +UsageReport CreateUsageReport(std::string name_space, std::string uri, + int64 timestamp_ms, + UsageReport::UsageType usage_type) { + UsageReport usage_report; + usage_report.set_document_namespace(name_space); + usage_report.set_document_uri(uri); + usage_report.set_usage_timestamp_ms(timestamp_ms); + usage_report.set_usage_type(usage_type); + return usage_report; +} + +TEST_F(UsageStoreTest, CreationShouldSucceed) { + EXPECT_THAT(UsageStore::Create(&filesystem_, test_dir_), IsOk()); +} + +TEST_F(UsageStoreTest, CreationShouldFailOnNullPointer) { + EXPECT_THAT(UsageStore::Create(nullptr, test_dir_), + StatusIs(libtextclassifier3::StatusCode::FAILED_PRECONDITION)); +} + +TEST_F(UsageStoreTest, UsageScoresShouldBeComparable) { + UsageStore::UsageScores scores1; + UsageStore::UsageScores scores2; + EXPECT_THAT(scores1, Eq(scores2)); + + // operator== should compare usage_type1_last_used_timestamp_s. + ++scores1.usage_type1_last_used_timestamp_s; + EXPECT_THAT(scores1, Not(Eq(scores2))); + ++scores2.usage_type1_last_used_timestamp_s; + EXPECT_THAT(scores1, Eq(scores2)); + + // operator== should compare usage_type2_last_used_timestamp_s. 
+ ++scores1.usage_type2_last_used_timestamp_s; + EXPECT_THAT(scores1, Not(Eq(scores2))); + ++scores2.usage_type2_last_used_timestamp_s; + EXPECT_THAT(scores1, Eq(scores2)); + + // operator== should compare usage_type3_last_used_timestamp_s. + ++scores1.usage_type3_last_used_timestamp_s; + EXPECT_THAT(scores1, Not(Eq(scores2))); + ++scores2.usage_type3_last_used_timestamp_s; + EXPECT_THAT(scores1, Eq(scores2)); + + // operator== should compare usage_type1_count. + ++scores1.usage_type1_count; + EXPECT_THAT(scores1, Not(Eq(scores2))); + ++scores2.usage_type1_count; + EXPECT_THAT(scores1, Eq(scores2)); + + // operator== should compare usage_type2_count. + ++scores1.usage_type2_count; + EXPECT_THAT(scores1, Not(Eq(scores2))); + ++scores2.usage_type2_count; + EXPECT_THAT(scores1, Eq(scores2)); + + // operator== should compare usage_type3_count. + ++scores1.usage_type3_count; + EXPECT_THAT(scores1, Not(Eq(scores2))); + ++scores2.usage_type3_count; + EXPECT_THAT(scores1, Eq(scores2)); +} + +TEST_F(UsageStoreTest, InvalidDocumentIdShouldReturnError) { + ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<UsageStore> usage_store, + UsageStore::Create(&filesystem_, test_dir_)); + + DocumentId invalid_document_id = -1; + + EXPECT_THAT(usage_store->AddUsageReport(UsageReport(), invalid_document_id), + StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT)); + + EXPECT_THAT(usage_store->DeleteUsageScores(invalid_document_id), + StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT)); + + EXPECT_THAT(usage_store->GetUsageScores(invalid_document_id), + StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT)); + + EXPECT_THAT(usage_store->SetUsageScores(invalid_document_id, + UsageStore::UsageScores()), + StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT)); +} + +TEST_F(UsageStoreTest, AddUsageReportShouldUpdateLastUsedTimestamp) { + // Create 3 reports with different timestamps. 
+ UsageReport usage_report_time1 = CreateUsageReport( + "namespace", "uri", /*timestamp_ms=*/1000, UsageReport::USAGE_TYPE1); + UsageReport usage_report_time5 = CreateUsageReport( + "namespace", "uri", /*timestamp_ms=*/5000, UsageReport::USAGE_TYPE1); + UsageReport usage_report_time10 = CreateUsageReport( + "namespace", "uri", /*timestamp_ms=*/10000, UsageReport::USAGE_TYPE1); + + ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<UsageStore> usage_store, + UsageStore::Create(&filesystem_, test_dir_)); + + // Report a usage with timestamp 5. + usage_store->AddUsageReport(usage_report_time5, /*document_id=*/1); + UsageStore::UsageScores expected_scores; + expected_scores.usage_type1_last_used_timestamp_s = 5; + expected_scores.usage_type1_count = 1; + expected_scores.usage_type2_count = 0; + expected_scores.usage_type3_count = 0; + EXPECT_THAT(usage_store->GetUsageScores(/*document_id=*/1), + IsOkAndHolds(expected_scores)); + + // Report a usage with timestamp 1. The timestamp won't be updated. + usage_store->AddUsageReport(usage_report_time1, /*document_id=*/1); + ++expected_scores.usage_type1_count; + EXPECT_THAT(usage_store->GetUsageScores(/*document_id=*/1), + IsOkAndHolds(expected_scores)); + + // Report a usage with timestamp 10. The timestamp should be updated. + usage_store->AddUsageReport(usage_report_time10, /*document_id=*/1); + expected_scores.usage_type1_last_used_timestamp_s = 10; + ++expected_scores.usage_type1_count; + EXPECT_THAT(usage_store->GetUsageScores(/*document_id=*/1), + IsOkAndHolds(expected_scores)); +} + +TEST_F(UsageStoreTest, AddUsageReportShouldUpdateCounts) { + // Create 3 reports with different usage types. 
+ UsageReport usage_report_type1 = CreateUsageReport( + "namespace", "uri", /*timestamp_ms=*/0, UsageReport::USAGE_TYPE1); + UsageReport usage_report_type2 = CreateUsageReport( + "namespace", "uri", /*timestamp_ms=*/0, UsageReport::USAGE_TYPE2); + UsageReport usage_report_type3 = CreateUsageReport( + "namespace", "uri", /*timestamp_ms=*/0, UsageReport::USAGE_TYPE3); + + ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<UsageStore> usage_store, + UsageStore::Create(&filesystem_, test_dir_)); + + // Report a usage with type 1. + usage_store->AddUsageReport(usage_report_type1, /*document_id=*/1); + UsageStore::UsageScores expected_scores; + expected_scores.usage_type1_count = 1; + expected_scores.usage_type2_count = 0; + expected_scores.usage_type3_count = 0; + EXPECT_THAT(usage_store->GetUsageScores(/*document_id=*/1), + IsOkAndHolds(expected_scores)); + // Report another usage with type 1. + usage_store->AddUsageReport(usage_report_type1, /*document_id=*/1); + ++expected_scores.usage_type1_count; + EXPECT_THAT(usage_store->GetUsageScores(/*document_id=*/1), + IsOkAndHolds(expected_scores)); + + // Report a usage with type 2. + usage_store->AddUsageReport(usage_report_type2, /*document_id=*/1); + ++expected_scores.usage_type2_count; + EXPECT_THAT(usage_store->GetUsageScores(/*document_id=*/1), + IsOkAndHolds(expected_scores)); + // Report another usage with type 2. + usage_store->AddUsageReport(usage_report_type2, /*document_id=*/1); + ++expected_scores.usage_type2_count; + EXPECT_THAT(usage_store->GetUsageScores(/*document_id=*/1), + IsOkAndHolds(expected_scores)); + + // Report a usage with type 3. + usage_store->AddUsageReport(usage_report_type3, /*document_id=*/1); + ++expected_scores.usage_type3_count; + EXPECT_THAT(usage_store->GetUsageScores(/*document_id=*/1), + IsOkAndHolds(expected_scores)); + // Report another usage with type 3. 
+ usage_store->AddUsageReport(usage_report_type3, /*document_id=*/1); + ++expected_scores.usage_type3_count; + EXPECT_THAT(usage_store->GetUsageScores(/*document_id=*/1), + IsOkAndHolds(expected_scores)); +} + +TEST_F(UsageStoreTest, GetNonExistingDocumentShouldReturnDefaultScores) { + ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<UsageStore> usage_store, + UsageStore::Create(&filesystem_, test_dir_)); + + EXPECT_THAT(usage_store->GetUsageScores(/*document_id=*/1), + IsOkAndHolds(UsageStore::UsageScores())); +} + +TEST_F(UsageStoreTest, SetAndGetUsageScores) { + ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<UsageStore> usage_store, + UsageStore::Create(&filesystem_, test_dir_)); + + // Create usage scores with some random numbers. + UsageStore::UsageScores scores; + scores.usage_type1_last_used_timestamp_s = 7; + scores.usage_type2_last_used_timestamp_s = 9; + scores.usage_type3_last_used_timestamp_s = 11; + scores.usage_type1_count = 3; + scores.usage_type2_count = 4; + scores.usage_type3_count = 9; + + // Verify that set and get results are consistent. + ICING_EXPECT_OK(usage_store->SetUsageScores(/*document_id=*/1, scores)); + EXPECT_THAT(usage_store->GetUsageScores(/*document_id=*/1), + IsOkAndHolds(scores)); +} + +TEST_F(UsageStoreTest, ImplicitlyInitializedScoresShouldBeZero) { + ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<UsageStore> usage_store, + UsageStore::Create(&filesystem_, test_dir_)); + + // Explicitly set scores for document 2. + ICING_ASSERT_OK(usage_store->SetUsageScores(/*document_id=*/2, + UsageStore::UsageScores())); + + // Now the scores of document 1 have been implicitly initialized. The scores + // should all be 0. 
+ UsageStore::UsageScores expected_scores; + expected_scores.usage_type1_last_used_timestamp_s = 0; + expected_scores.usage_type2_last_used_timestamp_s = 0; + expected_scores.usage_type3_last_used_timestamp_s = 0; + expected_scores.usage_type1_count = 0; + expected_scores.usage_type2_count = 0; + expected_scores.usage_type3_count = 0; + EXPECT_THAT(usage_store->GetUsageScores(/*document_id=*/1), + IsOkAndHolds(expected_scores)); +} + +TEST_F(UsageStoreTest, DeleteUsageScores) { + ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<UsageStore> usage_store, + UsageStore::Create(&filesystem_, test_dir_)); + + // Create usage scores with some random numbers. + UsageStore::UsageScores scores; + scores.usage_type1_last_used_timestamp_s = 7; + scores.usage_type2_last_used_timestamp_s = 9; + scores.usage_type3_last_used_timestamp_s = 11; + scores.usage_type1_count = 3; + scores.usage_type2_count = 4; + scores.usage_type3_count = 9; + ICING_EXPECT_OK(usage_store->SetUsageScores(/*document_id=*/1, scores)); + + // Delete the usage scores of document 1, all the scores of document 1 should + // be 0. + UsageStore::UsageScores expected_scores; + expected_scores.usage_type1_last_used_timestamp_s = 0; + expected_scores.usage_type2_last_used_timestamp_s = 0; + expected_scores.usage_type3_last_used_timestamp_s = 0; + expected_scores.usage_type1_count = 0; + expected_scores.usage_type2_count = 0; + expected_scores.usage_type3_count = 0; + ICING_EXPECT_OK(usage_store->DeleteUsageScores(/*document_id=*/1)); + EXPECT_THAT(usage_store->GetUsageScores(/*document_id=*/1), + IsOkAndHolds(expected_scores)); +} + +TEST_F(UsageStoreTest, PersistToDisk) { + ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<UsageStore> usage_store, + UsageStore::Create(&filesystem_, test_dir_)); + + // Create usage scores with some random numbers. 
+ UsageStore::UsageScores scores; + scores.usage_type1_last_used_timestamp_s = 7; + scores.usage_type2_last_used_timestamp_s = 9; + scores.usage_type3_last_used_timestamp_s = 11; + scores.usage_type1_count = 3; + scores.usage_type2_count = 4; + scores.usage_type3_count = 9; + ICING_EXPECT_OK(usage_store->SetUsageScores(/*document_id=*/1, scores)); + + EXPECT_THAT(usage_store->PersistToDisk(), IsOk()); +} + +TEST_F(UsageStoreTest, Reset) { + ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<UsageStore> usage_store, + UsageStore::Create(&filesystem_, test_dir_)); + + // Create usage scores with some random numbers. + UsageStore::UsageScores scores; + scores.usage_type1_last_used_timestamp_s = 7; + scores.usage_type2_last_used_timestamp_s = 9; + scores.usage_type3_last_used_timestamp_s = 11; + scores.usage_type1_count = 3; + scores.usage_type2_count = 4; + scores.usage_type3_count = 9; + + // Set scores for document 1 and document 2. + ICING_EXPECT_OK(usage_store->SetUsageScores(/*document_id=*/1, scores)); + ICING_EXPECT_OK(usage_store->SetUsageScores(/*document_id=*/2, scores)); + + EXPECT_THAT(usage_store->Reset(), IsOk()); + + // After resetting, all the scores are cleared. + UsageStore::UsageScores expected_scores; + expected_scores.usage_type1_last_used_timestamp_s = 0; + expected_scores.usage_type2_last_used_timestamp_s = 0; + expected_scores.usage_type3_last_used_timestamp_s = 0; + expected_scores.usage_type1_count = 0; + expected_scores.usage_type2_count = 0; + expected_scores.usage_type3_count = 0; + EXPECT_THAT(usage_store->GetUsageScores(/*document_id=*/1), + IsOkAndHolds(expected_scores)); + EXPECT_THAT(usage_store->GetUsageScores(/*document_id=*/2), + IsOkAndHolds(expected_scores)); +} + +TEST_F(UsageStoreTest, TimestampInSecondsShouldNotOverflow) { + // Create a report with the max value of timestamps. 
+ UsageReport usage_report = CreateUsageReport( + "namespace", "uri", /*timestamp_ms=*/std::numeric_limits<int64>::max(), + UsageReport::USAGE_TYPE1); + + ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<UsageStore> usage_store, + UsageStore::Create(&filesystem_, test_dir_)); + + // The stored timestamp in seconds should be the max value of uint32. + usage_store->AddUsageReport(usage_report, /*document_id=*/1); + UsageStore::UsageScores expected_scores; + expected_scores.usage_type1_last_used_timestamp_s = + std::numeric_limits<uint32_t>::max(); + expected_scores.usage_type1_count = 1; + expected_scores.usage_type2_count = 0; + expected_scores.usage_type3_count = 0; + EXPECT_THAT(usage_store->GetUsageScores(/*document_id=*/1), + IsOkAndHolds(expected_scores)); +} + +TEST_F(UsageStoreTest, CountsShouldNotOverflow) { + ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<UsageStore> usage_store, + UsageStore::Create(&filesystem_, test_dir_)); + + // Create usage scores with the max value of int. + UsageStore::UsageScores scores; + scores.usage_type1_last_used_timestamp_s = 0; + scores.usage_type2_last_used_timestamp_s = 0; + scores.usage_type3_last_used_timestamp_s = 0; + scores.usage_type1_count = std::numeric_limits<int>::max(); + scores.usage_type2_count = 0; + scores.usage_type3_count = 0; + + ICING_ASSERT_OK(usage_store->SetUsageScores(/*document_id=*/1, scores)); + ASSERT_THAT(usage_store->GetUsageScores(/*document_id=*/1), + IsOkAndHolds(scores)); + + // Report another usage with type 1. + UsageReport usage_report = CreateUsageReport( + "namespace", "uri", /*timestamp_ms=*/0, UsageReport::USAGE_TYPE1); + usage_store->AddUsageReport(usage_report, /*document_id=*/1); + + // usage_type1_count should not change because it's already the max value. 
+ EXPECT_THAT(usage_store->GetUsageScores(/*document_id=*/1), + IsOkAndHolds(scores)); +} + +} // namespace + +} // namespace lib +} // namespace icing diff --git a/icing/tokenization/icu/icu-language-segmenter-factory.cc b/icing/tokenization/icu/icu-language-segmenter-factory.cc index 0ef1824..9213fbe 100644 --- a/icing/tokenization/icu/icu-language-segmenter-factory.cc +++ b/icing/tokenization/icu/icu-language-segmenter-factory.cc @@ -15,6 +15,7 @@ #include "icing/tokenization/icu/icu-language-segmenter.h" #include "icing/tokenization/language-segmenter-factory.h" #include "icing/util/logging.h" +#include "unicode/uloc.h" namespace icing { namespace lib { diff --git a/icing/tokenization/icu/icu-language-segmenter_test.cc b/icing/tokenization/icu/icu-language-segmenter_test.cc index 31c2726..d0b90d1 100644 --- a/icing/tokenization/icu/icu-language-segmenter_test.cc +++ b/icing/tokenization/icu/icu-language-segmenter_test.cc @@ -409,6 +409,71 @@ TEST_P(IcuLanguageSegmenterAllLocalesTest, NotCopyStrings) { EXPECT_THAT(word2_address, Eq(word2_result_address)); } +TEST_P(IcuLanguageSegmenterAllLocalesTest, NewIteratorResetToStart) { + ICING_ASSERT_OK_AND_ASSIGN(auto segmenter, + language_segmenter_factory::Create(GetOptions())); + constexpr std::string_view kText = "How are you你好吗お元気ですか"; + ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<LanguageSegmenter::Iterator> itr, + segmenter->Segment(kText)); + + // String: "How are you你好吗お元気ですか" + // ^ ^^ ^^ ^ ^ ^ ^ ^ ^ + // Bytes: 0 3 4 7 8 11 172023 29 35 + EXPECT_THAT(itr->ResetToStart(), IsOkAndHolds(Eq(0))); + EXPECT_THAT(itr->GetTerm(), Eq("How")); +} + +TEST_P(IcuLanguageSegmenterAllLocalesTest, IteratorOneAdvanceResetToStart) { + ICING_ASSERT_OK_AND_ASSIGN(auto segmenter, + language_segmenter_factory::Create(GetOptions())); + constexpr std::string_view kText = "How are you你好吗お元気ですか"; + ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<LanguageSegmenter::Iterator> itr, + segmenter->Segment(kText)); + + // String: "How are 
you你好吗お元気ですか" + // ^ ^^ ^^ ^ ^ ^ ^ ^ ^ + // Bytes: 0 3 4 7 8 11 172023 29 35 + ASSERT_TRUE(itr->Advance()); // itr points to 'How' + EXPECT_THAT(itr->ResetToStart(), IsOkAndHolds(Eq(0))); + EXPECT_THAT(itr->GetTerm(), Eq("How")); +} + +TEST_P(IcuLanguageSegmenterAllLocalesTest, + IteratorMultipleAdvancesResetToStart) { + ICING_ASSERT_OK_AND_ASSIGN(auto segmenter, + language_segmenter_factory::Create(GetOptions())); + constexpr std::string_view kText = "How are you你好吗お元気ですか"; + ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<LanguageSegmenter::Iterator> itr, + segmenter->Segment(kText)); + + // String: "How are you你好吗お元気ですか" + // ^ ^^ ^^ ^ ^ ^ ^ ^ ^ + // Bytes: 0 3 4 7 8 11 172023 29 35 + ASSERT_TRUE(itr->Advance()); + ASSERT_TRUE(itr->Advance()); + ASSERT_TRUE(itr->Advance()); + ASSERT_TRUE(itr->Advance()); // itr points to ' ' + EXPECT_THAT(itr->ResetToStart(), IsOkAndHolds(Eq(0))); + EXPECT_THAT(itr->GetTerm(), Eq("How")); +} + +TEST_P(IcuLanguageSegmenterAllLocalesTest, IteratorDoneResetToStart) { + ICING_ASSERT_OK_AND_ASSIGN(auto segmenter, + language_segmenter_factory::Create(GetOptions())); + constexpr std::string_view kText = "How are you你好吗お元気ですか"; + ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<LanguageSegmenter::Iterator> itr, + segmenter->Segment(kText)); + + // String: "How are you你好吗お元気ですか" + // ^ ^^ ^^ ^ ^ ^ ^ ^ ^ + // Bytes: 0 3 4 7 8 11 172023 29 35 + while (itr->Advance()) { + // Do nothing. 
+ } + EXPECT_THAT(itr->ResetToStart(), IsOkAndHolds(Eq(0))); + EXPECT_THAT(itr->GetTerm(), Eq("How")); +} + TEST_P(IcuLanguageSegmenterAllLocalesTest, ResetToTermAfterOutOfBounds) { ICING_ASSERT_OK_AND_ASSIGN(auto segmenter, language_segmenter_factory::Create(GetOptions())); @@ -992,6 +1057,19 @@ TEST_P(IcuLanguageSegmenterAllLocalesTest, ThaiResetToTermBefore) { EXPECT_THAT(itr->GetTerm(), Eq("ไป")); } +TEST_P(IcuLanguageSegmenterAllLocalesTest, QuerySyntax) { + ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter, + language_segmenter_factory::Create(GetOptions())); + // Validates that the input strings are not copied + ICING_ASSERT_OK_AND_ASSIGN( + std::vector<std::string_view> terms, + language_segmenter->GetAllTerms( + "(-term1 OR term2) AND property1.subproperty2:term3")); + EXPECT_THAT(terms, ElementsAre("(", "-", "term1", " ", "OR", " ", "term2", + ")", " ", "AND", " ", "property1", ".", + "subproperty2", ":", "term3")); +} + INSTANTIATE_TEST_SUITE_P( LocaleName, IcuLanguageSegmenterAllLocalesTest, testing::Values(ULOC_US, ULOC_UK, ULOC_CANADA, ULOC_CANADA_FRENCH, diff --git a/icing/tokenization/language-segmenter-factory.h b/icing/tokenization/language-segmenter-factory.h index ce50d0b..e60c168 100644 --- a/icing/tokenization/language-segmenter-factory.h +++ b/icing/tokenization/language-segmenter-factory.h @@ -18,11 +18,14 @@ #include <memory> #include <string_view> +#ifdef __ANDROID__ #include "icing/jni/jni-cache.h" +#else // __ANDROID__ +class JniCache; // forward declaration to let non-Android builds work. 
+#endif // __ANDROID__ + #include "icing/text_classifier/lib3/utils/base/statusor.h" #include "icing/tokenization/language-segmenter.h" -#include "icing/util/i18n-utils.h" -#include "unicode/uloc.h" namespace icing { namespace lib { @@ -30,7 +33,7 @@ namespace lib { namespace language_segmenter_factory { struct SegmenterOptions { - explicit SegmenterOptions(std::string locale = ULOC_US, + explicit SegmenterOptions(std::string locale, const JniCache* jni_cache = nullptr) : locale(std::move(locale)), jni_cache(jni_cache) {} @@ -46,7 +49,7 @@ struct SegmenterOptions { // A LanguageSegmenter on success // INVALID_ARGUMENT if locale string is invalid libtextclassifier3::StatusOr<std::unique_ptr<LanguageSegmenter>> Create( - SegmenterOptions options = SegmenterOptions()); + SegmenterOptions options); } // namespace language_segmenter_factory diff --git a/icing/tokenization/language-segmenter-iterator_test.cc b/icing/tokenization/language-segmenter-iterator_test.cc index c7b068d..a1b031a 100644 --- a/icing/tokenization/language-segmenter-iterator_test.cc +++ b/icing/tokenization/language-segmenter-iterator_test.cc @@ -43,8 +43,10 @@ class LanguageSegmenterIteratorTest : public testing::Test { }; TEST_F(LanguageSegmenterIteratorTest, AdvanceAndGetTerm) { - ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter, - language_segmenter_factory::Create()); + language_segmenter_factory::SegmenterOptions options(ULOC_US); + ICING_ASSERT_OK_AND_ASSIGN( + auto language_segmenter, + language_segmenter_factory::Create(std::move(options))); ICING_ASSERT_OK_AND_ASSIGN(auto iterator, language_segmenter->Segment("foo bar")); @@ -62,8 +64,10 @@ TEST_F(LanguageSegmenterIteratorTest, AdvanceAndGetTerm) { TEST_F(LanguageSegmenterIteratorTest, ResetToTermStartingAfterWithOffsetInText) { - ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter, - language_segmenter_factory::Create()); + language_segmenter_factory::SegmenterOptions options(ULOC_US); + ICING_ASSERT_OK_AND_ASSIGN( + auto 
language_segmenter, + language_segmenter_factory::Create(std::move(options))); ICING_ASSERT_OK_AND_ASSIGN(auto iterator, language_segmenter->Segment("foo bar")); @@ -77,8 +81,10 @@ TEST_F(LanguageSegmenterIteratorTest, TEST_F(LanguageSegmenterIteratorTest, ResetToTermStartingAfterWithNegativeOffsetNotOk) { - ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter, - language_segmenter_factory::Create()); + language_segmenter_factory::SegmenterOptions options(ULOC_US); + ICING_ASSERT_OK_AND_ASSIGN( + auto language_segmenter, + language_segmenter_factory::Create(std::move(options))); ICING_ASSERT_OK_AND_ASSIGN(auto iterator, language_segmenter->Segment("foo bar")); @@ -95,8 +101,10 @@ TEST_F(LanguageSegmenterIteratorTest, TEST_F(LanguageSegmenterIteratorTest, ResetToTermStartingAfterWithTextLengthOffsetInvalidArgument) { std::string text = "foo bar"; - ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter, - language_segmenter_factory::Create()); + language_segmenter_factory::SegmenterOptions options(ULOC_US); + ICING_ASSERT_OK_AND_ASSIGN( + auto language_segmenter, + language_segmenter_factory::Create(std::move(options))); ICING_ASSERT_OK_AND_ASSIGN(auto iterator, language_segmenter->Segment(text)); EXPECT_THAT(iterator->ResetToTermStartingAfter(/*offset=*/text.size()), @@ -106,8 +114,10 @@ TEST_F(LanguageSegmenterIteratorTest, TEST_F(LanguageSegmenterIteratorTest, ResetToTermStartingAfterWithOffsetPastTextLengthInvalidArgument) { std::string text = "foo bar"; - ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter, - language_segmenter_factory::Create()); + language_segmenter_factory::SegmenterOptions options(ULOC_US); + ICING_ASSERT_OK_AND_ASSIGN( + auto language_segmenter, + language_segmenter_factory::Create(std::move(options))); ICING_ASSERT_OK_AND_ASSIGN(auto iterator, language_segmenter->Segment(text)); EXPECT_THAT(iterator->ResetToTermStartingAfter(/*offset=*/100), @@ -115,8 +125,10 @@ TEST_F(LanguageSegmenterIteratorTest, } TEST_F(LanguageSegmenterIteratorTest, 
ResetToTermEndingBeforeWithOffsetInText) { - ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter, - language_segmenter_factory::Create()); + language_segmenter_factory::SegmenterOptions options(ULOC_US); + ICING_ASSERT_OK_AND_ASSIGN( + auto language_segmenter, + language_segmenter_factory::Create(std::move(options))); ICING_ASSERT_OK_AND_ASSIGN(auto iterator, language_segmenter->Segment("foo bar")); @@ -130,8 +142,10 @@ TEST_F(LanguageSegmenterIteratorTest, ResetToTermEndingBeforeWithOffsetInText) { TEST_F(LanguageSegmenterIteratorTest, ResetToTermEndingBeforeWithZeroNotFound) { - ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter, - language_segmenter_factory::Create()); + language_segmenter_factory::SegmenterOptions options(ULOC_US); + ICING_ASSERT_OK_AND_ASSIGN( + auto language_segmenter, + language_segmenter_factory::Create(std::move(options))); ICING_ASSERT_OK_AND_ASSIGN(auto iterator, language_segmenter->Segment("foo bar")); @@ -142,8 +156,10 @@ TEST_F(LanguageSegmenterIteratorTest, TEST_F(LanguageSegmenterIteratorTest, ResetToTermEndingBeforeWithNegativeOffsetInvalidArgument) { - ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter, - language_segmenter_factory::Create()); + language_segmenter_factory::SegmenterOptions options(ULOC_US); + ICING_ASSERT_OK_AND_ASSIGN( + auto language_segmenter, + language_segmenter_factory::Create(std::move(options))); ICING_ASSERT_OK_AND_ASSIGN(auto iterator, language_segmenter->Segment("foo bar")); @@ -157,8 +173,10 @@ TEST_F(LanguageSegmenterIteratorTest, TEST_F(LanguageSegmenterIteratorTest, ResetToTermEndingBeforeWithOffsetPastTextEndInvalidArgument) { std::string text = "foo bar"; - ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter, - language_segmenter_factory::Create()); + language_segmenter_factory::SegmenterOptions options(ULOC_US); + ICING_ASSERT_OK_AND_ASSIGN( + auto language_segmenter, + language_segmenter_factory::Create(std::move(options))); ICING_ASSERT_OK_AND_ASSIGN(auto iterator, 
language_segmenter->Segment(text)); EXPECT_THAT(iterator->ResetToTermEndingBefore(/*offset=*/text.length()), diff --git a/icing/tokenization/language-segmenter_benchmark.cc b/icing/tokenization/language-segmenter_benchmark.cc index 49ddfca..bd86169 100644 --- a/icing/tokenization/language-segmenter_benchmark.cc +++ b/icing/tokenization/language-segmenter_benchmark.cc @@ -20,6 +20,7 @@ #include "icing/tokenization/language-segmenter-factory.h" #include "icing/tokenization/language-segmenter.h" #include "icing/transform/normalizer.h" +#include "unicode/uloc.h" // Run on a Linux workstation: // $ blaze build -c opt --dynamic_mode=off --copt=-gmlt @@ -59,8 +60,9 @@ void BM_SegmentNoSpace(benchmark::State& state) { GetTestFilePath("icing/icu.dat"))); } + language_segmenter_factory::SegmenterOptions options(ULOC_US); std::unique_ptr<LanguageSegmenter> language_segmenter = - language_segmenter_factory::Create().ValueOrDie(); + language_segmenter_factory::Create(std::move(options)).ValueOrDie(); std::string input_string(state.range(0), 'A'); @@ -95,8 +97,9 @@ void BM_SegmentWithSpaces(benchmark::State& state) { GetTestFilePath("icing/icu.dat"))); } + language_segmenter_factory::SegmenterOptions options(ULOC_US); std::unique_ptr<LanguageSegmenter> language_segmenter = - language_segmenter_factory::Create().ValueOrDie(); + language_segmenter_factory::Create(std::move(options)).ValueOrDie(); std::string input_string(state.range(0), 'A'); for (int i = 1; i < input_string.length(); i += 2) { @@ -134,8 +137,9 @@ void BM_SegmentCJK(benchmark::State& state) { GetTestFilePath("icing/icu.dat"))); } + language_segmenter_factory::SegmenterOptions options(ULOC_US); std::unique_ptr<LanguageSegmenter> language_segmenter = - language_segmenter_factory::Create().ValueOrDie(); + language_segmenter_factory::Create(std::move(options)).ValueOrDie(); std::string input_string; while (input_string.length() < state.range(0)) { diff --git a/icing/tokenization/plain-tokenizer_test.cc 
b/icing/tokenization/plain-tokenizer_test.cc index f2fc678..d9db75a 100644 --- a/icing/tokenization/plain-tokenizer_test.cc +++ b/icing/tokenization/plain-tokenizer_test.cc @@ -24,6 +24,7 @@ #include "icing/testing/test-data.h" #include "icing/tokenization/language-segmenter-factory.h" #include "icing/tokenization/tokenizer-factory.h" +#include "unicode/uloc.h" namespace icing { namespace lib { @@ -49,8 +50,10 @@ TEST_F(PlainTokenizerTest, CreationWithNullPointerShouldFail) { } TEST_F(PlainTokenizerTest, Simple) { - ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter, - language_segmenter_factory::Create()); + language_segmenter_factory::SegmenterOptions options(ULOC_US); + ICING_ASSERT_OK_AND_ASSIGN( + auto language_segmenter, + language_segmenter_factory::Create(std::move(options))); ICING_ASSERT_OK_AND_ASSIGN( std::unique_ptr<Tokenizer> plain_tokenizer, tokenizer_factory::CreateIndexingTokenizer( @@ -81,8 +84,10 @@ TEST_F(PlainTokenizerTest, Simple) { } TEST_F(PlainTokenizerTest, Whitespace) { - ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter, - language_segmenter_factory::Create()); + language_segmenter_factory::SegmenterOptions options(ULOC_US); + ICING_ASSERT_OK_AND_ASSIGN( + auto language_segmenter, + language_segmenter_factory::Create(std::move(options))); ICING_ASSERT_OK_AND_ASSIGN( std::unique_ptr<Tokenizer> plain_tokenizer, tokenizer_factory::CreateIndexingTokenizer( @@ -107,8 +112,10 @@ TEST_F(PlainTokenizerTest, Whitespace) { } TEST_F(PlainTokenizerTest, Punctuation) { - ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter, - language_segmenter_factory::Create()); + language_segmenter_factory::SegmenterOptions options(ULOC_US); + ICING_ASSERT_OK_AND_ASSIGN( + auto language_segmenter, + language_segmenter_factory::Create(std::move(options))); ICING_ASSERT_OK_AND_ASSIGN( std::unique_ptr<Tokenizer> plain_tokenizer, tokenizer_factory::CreateIndexingTokenizer( @@ -136,8 +143,10 @@ TEST_F(PlainTokenizerTest, Punctuation) { } TEST_F(PlainTokenizerTest, 
SpecialCharacters) { - ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter, - language_segmenter_factory::Create()); + language_segmenter_factory::SegmenterOptions options(ULOC_US); + ICING_ASSERT_OK_AND_ASSIGN( + auto language_segmenter, + language_segmenter_factory::Create(std::move(options))); ICING_ASSERT_OK_AND_ASSIGN( std::unique_ptr<Tokenizer> plain_tokenizer, tokenizer_factory::CreateIndexingTokenizer( @@ -157,8 +166,10 @@ TEST_F(PlainTokenizerTest, SpecialCharacters) { } TEST_F(PlainTokenizerTest, CJKT) { - ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter, - language_segmenter_factory::Create()); + language_segmenter_factory::SegmenterOptions options(ULOC_US); + ICING_ASSERT_OK_AND_ASSIGN( + auto language_segmenter, + language_segmenter_factory::Create(std::move(options))); ICING_ASSERT_OK_AND_ASSIGN( std::unique_ptr<Tokenizer> plain_tokenizer, tokenizer_factory::CreateIndexingTokenizer( @@ -209,8 +220,10 @@ TEST_F(PlainTokenizerTest, CJKT) { } TEST_F(PlainTokenizerTest, ResetToTokenAfterSimple) { - ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter, - language_segmenter_factory::Create()); + language_segmenter_factory::SegmenterOptions options(ULOC_US); + ICING_ASSERT_OK_AND_ASSIGN( + auto language_segmenter, + language_segmenter_factory::Create(std::move(options))); ICING_ASSERT_OK_AND_ASSIGN( std::unique_ptr<Tokenizer> plain_tokenizer, tokenizer_factory::CreateIndexingTokenizer( @@ -226,8 +239,10 @@ TEST_F(PlainTokenizerTest, ResetToTokenAfterSimple) { } TEST_F(PlainTokenizerTest, ResetToTokenBeforeSimple) { - ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter, - language_segmenter_factory::Create()); + language_segmenter_factory::SegmenterOptions options(ULOC_US); + ICING_ASSERT_OK_AND_ASSIGN( + auto language_segmenter, + language_segmenter_factory::Create(std::move(options))); ICING_ASSERT_OK_AND_ASSIGN( std::unique_ptr<Tokenizer> plain_tokenizer, tokenizer_factory::CreateIndexingTokenizer( @@ -243,8 +258,10 @@ TEST_F(PlainTokenizerTest, 
ResetToTokenBeforeSimple) { } TEST_F(PlainTokenizerTest, ResetToTokenAfter) { - ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter, - language_segmenter_factory::Create()); + language_segmenter_factory::SegmenterOptions options(ULOC_US); + ICING_ASSERT_OK_AND_ASSIGN( + auto language_segmenter, + language_segmenter_factory::Create(std::move(options))); ICING_ASSERT_OK_AND_ASSIGN( std::unique_ptr<Tokenizer> plain_tokenizer, tokenizer_factory::CreateIndexingTokenizer( @@ -291,8 +308,10 @@ TEST_F(PlainTokenizerTest, ResetToTokenAfter) { } TEST_F(PlainTokenizerTest, ResetToTokenBefore) { - ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter, - language_segmenter_factory::Create()); + language_segmenter_factory::SegmenterOptions options(ULOC_US); + ICING_ASSERT_OK_AND_ASSIGN( + auto language_segmenter, + language_segmenter_factory::Create(std::move(options))); ICING_ASSERT_OK_AND_ASSIGN( std::unique_ptr<Tokenizer> plain_tokenizer, tokenizer_factory::CreateIndexingTokenizer( diff --git a/icing/tokenization/raw-query-tokenizer_test.cc b/icing/tokenization/raw-query-tokenizer_test.cc index 351f7c1..9b71e8a 100644 --- a/icing/tokenization/raw-query-tokenizer_test.cc +++ b/icing/tokenization/raw-query-tokenizer_test.cc @@ -22,6 +22,7 @@ #include "icing/tokenization/language-segmenter-factory.h" #include "icing/tokenization/tokenizer-factory.h" #include "icing/tokenization/tokenizer.h" +#include "unicode/uloc.h" namespace icing { namespace lib { @@ -46,8 +47,10 @@ TEST_F(RawQueryTokenizerTest, CreationWithNullPointerShouldFail) { } TEST_F(RawQueryTokenizerTest, Simple) { - ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter, - language_segmenter_factory::Create()); + language_segmenter_factory::SegmenterOptions options(ULOC_US); + ICING_ASSERT_OK_AND_ASSIGN( + auto language_segmenter, + language_segmenter_factory::Create(std::move(options))); ICING_ASSERT_OK_AND_ASSIGN( std::unique_ptr<Tokenizer> raw_query_tokenizer, 
tokenizer_factory::CreateQueryTokenizer(tokenizer_factory::RAW_QUERY, @@ -59,8 +62,10 @@ TEST_F(RawQueryTokenizerTest, Simple) { } TEST_F(RawQueryTokenizerTest, Parentheses) { - ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter, - language_segmenter_factory::Create()); + language_segmenter_factory::SegmenterOptions options(ULOC_US); + ICING_ASSERT_OK_AND_ASSIGN( + auto language_segmenter, + language_segmenter_factory::Create(std::move(options))); ICING_ASSERT_OK_AND_ASSIGN( std::unique_ptr<Tokenizer> raw_query_tokenizer, tokenizer_factory::CreateQueryTokenizer(tokenizer_factory::RAW_QUERY, @@ -159,8 +164,10 @@ TEST_F(RawQueryTokenizerTest, Parentheses) { } TEST_F(RawQueryTokenizerTest, Exclustion) { - ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter, - language_segmenter_factory::Create()); + language_segmenter_factory::SegmenterOptions options(ULOC_US); + ICING_ASSERT_OK_AND_ASSIGN( + auto language_segmenter, + language_segmenter_factory::Create(std::move(options))); ICING_ASSERT_OK_AND_ASSIGN( std::unique_ptr<Tokenizer> raw_query_tokenizer, tokenizer_factory::CreateQueryTokenizer(tokenizer_factory::RAW_QUERY, @@ -226,8 +233,10 @@ TEST_F(RawQueryTokenizerTest, Exclustion) { } TEST_F(RawQueryTokenizerTest, PropertyRestriction) { - ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter, - language_segmenter_factory::Create()); + language_segmenter_factory::SegmenterOptions options(ULOC_US); + ICING_ASSERT_OK_AND_ASSIGN( + auto language_segmenter, + language_segmenter_factory::Create(std::move(options))); ICING_ASSERT_OK_AND_ASSIGN( std::unique_ptr<Tokenizer> raw_query_tokenizer, tokenizer_factory::CreateQueryTokenizer(tokenizer_factory::RAW_QUERY, @@ -314,8 +323,10 @@ TEST_F(RawQueryTokenizerTest, PropertyRestriction) { } TEST_F(RawQueryTokenizerTest, OR) { - ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter, - language_segmenter_factory::Create()); + language_segmenter_factory::SegmenterOptions options(ULOC_US); + ICING_ASSERT_OK_AND_ASSIGN( + auto 
language_segmenter, + language_segmenter_factory::Create(std::move(options))); ICING_ASSERT_OK_AND_ASSIGN( std::unique_ptr<Tokenizer> raw_query_tokenizer, tokenizer_factory::CreateQueryTokenizer(tokenizer_factory::RAW_QUERY, @@ -435,8 +446,10 @@ TEST_F(RawQueryTokenizerTest, OR) { // CJKT are treated the same way by language segmenter and raw tokenizer, so // here we test Chinese and Japanese to represent CJKT. TEST_F(RawQueryTokenizerTest, CJKT) { - ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter, - language_segmenter_factory::Create()); + language_segmenter_factory::SegmenterOptions options(ULOC_US); + ICING_ASSERT_OK_AND_ASSIGN( + auto language_segmenter, + language_segmenter_factory::Create(std::move(options))); ICING_ASSERT_OK_AND_ASSIGN( std::unique_ptr<Tokenizer> raw_query_tokenizer, tokenizer_factory::CreateQueryTokenizer(tokenizer_factory::RAW_QUERY, @@ -488,8 +501,10 @@ TEST_F(RawQueryTokenizerTest, CJKT) { // Raw tokenizer identifies all characters that it doesn't know as OTHER type, // so we can choose comma "," to represent all OTHER characters. 
TEST_F(RawQueryTokenizerTest, OtherChars) { - ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter, - language_segmenter_factory::Create()); + language_segmenter_factory::SegmenterOptions options(ULOC_US); + ICING_ASSERT_OK_AND_ASSIGN( + auto language_segmenter, + language_segmenter_factory::Create(std::move(options))); ICING_ASSERT_OK_AND_ASSIGN( std::unique_ptr<Tokenizer> raw_query_tokenizer, tokenizer_factory::CreateQueryTokenizer(tokenizer_factory::RAW_QUERY, @@ -533,8 +548,10 @@ TEST_F(RawQueryTokenizerTest, OtherChars) { } TEST_F(RawQueryTokenizerTest, Mix) { - ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter, - language_segmenter_factory::Create()); + language_segmenter_factory::SegmenterOptions options(ULOC_US); + ICING_ASSERT_OK_AND_ASSIGN( + auto language_segmenter, + language_segmenter_factory::Create(std::move(options))); ICING_ASSERT_OK_AND_ASSIGN( std::unique_ptr<Tokenizer> raw_query_tokenizer, tokenizer_factory::CreateQueryTokenizer(tokenizer_factory::RAW_QUERY, diff --git a/icing/tokenization/reverse_jni/reverse-jni-language-segmenter-factory.cc b/icing/tokenization/reverse_jni/reverse-jni-language-segmenter-factory.cc index f79bc68..db973f3 100644 --- a/icing/tokenization/reverse_jni/reverse-jni-language-segmenter-factory.cc +++ b/icing/tokenization/reverse_jni/reverse-jni-language-segmenter-factory.cc @@ -12,10 +12,12 @@ // See the License for the specific language governing permissions and // limitations under the License. 
+#include "icing/jni/jni-cache.h" #include "icing/absl_ports/canonical_errors.h" #include "icing/tokenization/language-segmenter-factory.h" #include "icing/tokenization/reverse_jni/reverse-jni-language-segmenter.h" #include "icing/util/logging.h" +#include "unicode/uloc.h" namespace icing { namespace lib { diff --git a/icing/tokenization/reverse_jni/reverse-jni-language-segmenter-test.cc b/icing/tokenization/reverse_jni/reverse-jni-language-segmenter-test.cc index a01d944..4b50231 100644 --- a/icing/tokenization/reverse_jni/reverse-jni-language-segmenter-test.cc +++ b/icing/tokenization/reverse_jni/reverse-jni-language-segmenter-test.cc @@ -443,6 +443,74 @@ TEST_P(ReverseJniLanguageSegmenterTest, NotCopyStrings) { EXPECT_THAT(word2_address, Eq(word2_result_address)); } +TEST_P(ReverseJniLanguageSegmenterTest, NewIteratorResetToStart) { + ICING_ASSERT_OK_AND_ASSIGN( + auto segmenter, language_segmenter_factory::Create( + GetSegmenterOptions(GetLocale(), jni_cache_.get()))); + constexpr std::string_view kText = "How are you你好吗お元気ですか"; + ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<LanguageSegmenter::Iterator> itr, + segmenter->Segment(kText)); + + // String: "How are you你好吗お元気ですか" + // ^ ^^ ^^ ^ ^ ^ ^ ^ ^ + // Bytes: 0 3 4 7 8 11 172023 29 35 + EXPECT_THAT(itr->ResetToStart(), IsOkAndHolds(Eq(0))); + EXPECT_THAT(itr->GetTerm(), Eq("How")); +} + +TEST_P(ReverseJniLanguageSegmenterTest, IteratorOneAdvanceResetToStart) { + ICING_ASSERT_OK_AND_ASSIGN( + auto segmenter, language_segmenter_factory::Create( + GetSegmenterOptions(GetLocale(), jni_cache_.get()))); + constexpr std::string_view kText = "How are you你好吗お元気ですか"; + ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<LanguageSegmenter::Iterator> itr, + segmenter->Segment(kText)); + + // String: "How are you你好吗お元気ですか" + // ^ ^^ ^^ ^ ^ ^ ^ ^ ^ + // Bytes: 0 3 4 7 8 11 172023 29 35 + ASSERT_TRUE(itr->Advance()); // itr points to 'How' + EXPECT_THAT(itr->ResetToStart(), IsOkAndHolds(Eq(0))); + EXPECT_THAT(itr->GetTerm(), 
Eq("How")); +} + +TEST_P(ReverseJniLanguageSegmenterTest, IteratorMultipleAdvancesResetToStart) { + ICING_ASSERT_OK_AND_ASSIGN( + auto segmenter, language_segmenter_factory::Create( + GetSegmenterOptions(GetLocale(), jni_cache_.get()))); + constexpr std::string_view kText = "How are you你好吗お元気ですか"; + ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<LanguageSegmenter::Iterator> itr, + segmenter->Segment(kText)); + + // String: "How are you你好吗お元気ですか" + // ^ ^^ ^^ ^ ^ ^ ^ ^ ^ + // Bytes: 0 3 4 7 8 11 172023 29 35 + ASSERT_TRUE(itr->Advance()); + ASSERT_TRUE(itr->Advance()); + ASSERT_TRUE(itr->Advance()); + ASSERT_TRUE(itr->Advance()); // itr points to ' ' + EXPECT_THAT(itr->ResetToStart(), IsOkAndHolds(Eq(0))); + EXPECT_THAT(itr->GetTerm(), Eq("How")); +} + +TEST_P(ReverseJniLanguageSegmenterTest, IteratorDoneResetToStart) { + ICING_ASSERT_OK_AND_ASSIGN( + auto segmenter, language_segmenter_factory::Create( + GetSegmenterOptions(GetLocale(), jni_cache_.get()))); + constexpr std::string_view kText = "How are you你好吗お元気ですか"; + ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<LanguageSegmenter::Iterator> itr, + segmenter->Segment(kText)); + + // String: "How are you你好吗お元気ですか" + // ^ ^^ ^^ ^ ^ ^ ^ ^ ^ + // Bytes: 0 3 4 7 8 11 172023 29 35 + while (itr->Advance()) { + // Do nothing. 
+ } + EXPECT_THAT(itr->ResetToStart(), IsOkAndHolds(Eq(0))); + EXPECT_THAT(itr->GetTerm(), Eq("How")); +} + TEST_P(ReverseJniLanguageSegmenterTest, ResetToTermAfterOutOfBounds) { ICING_ASSERT_OK_AND_ASSIGN( auto segmenter, language_segmenter_factory::Create( @@ -1060,6 +1128,21 @@ TEST_P(ReverseJniLanguageSegmenterTest, ThaiResetToTermBefore) { EXPECT_THAT(itr->GetTerm(), Eq("ไป")); } +TEST_P(ReverseJniLanguageSegmenterTest, QuerySyntax) { + ICING_ASSERT_OK_AND_ASSIGN( + auto language_segmenter, + language_segmenter_factory::Create( + GetSegmenterOptions(GetLocale(), jni_cache_.get()))); + // Validates that the input strings are not copied + ICING_ASSERT_OK_AND_ASSIGN( + std::vector<std::string_view> terms, + language_segmenter->GetAllTerms( + "(-term1 OR term2) AND property1.subproperty2:term3")); + EXPECT_THAT(terms, ElementsAre("(", "-", "term1", " ", "OR", " ", "term2", + ")", " ", "AND", " ", "property1", ".", + "subproperty2", ":", "term3")); +} + INSTANTIATE_TEST_SUITE_P( LocaleName, ReverseJniLanguageSegmenterTest, testing::Values(ULOC_US, ULOC_UK, ULOC_CANADA, ULOC_CANADA_FRENCH, diff --git a/icing/tokenization/reverse_jni/reverse-jni-language-segmenter.cc b/icing/tokenization/reverse_jni/reverse-jni-language-segmenter.cc index 2256022..bb26364 100644 --- a/icing/tokenization/reverse_jni/reverse-jni-language-segmenter.cc +++ b/icing/tokenization/reverse_jni/reverse-jni-language-segmenter.cc @@ -24,164 +24,13 @@ #include "icing/absl_ports/canonical_errors.h" #include "icing/legacy/core/icing-string-util.h" #include "icing/tokenization/language-segmenter.h" +#include "icing/util/character-iterator.h" #include "icing/util/i18n-utils.h" #include "icing/util/status-macros.h" namespace icing { namespace lib { -namespace { - -// Returns the lead byte of the UTF-8 character that includes the byte at -// current_byte_index within it. 
-int GetUTF8StartPosition(std::string_view text, int current_byte_index) { - while (!i18n_utils::IsLeadUtf8Byte(text[current_byte_index])) { - --current_byte_index; - } - return current_byte_index; -} - -class CharacterIterator { - public: - explicit CharacterIterator(std::string_view text) - : CharacterIterator(text, 0, 0) {} - CharacterIterator(std::string_view text, int utf8_index, int utf16_index) - : text_(text), utf8_index_(utf8_index), utf16_index_(utf16_index) {} - - // Moves from current position to the character that includes the specified - // UTF-8 index. - // REQUIRES: desired_utf8_index <= text_.length() - // desired_utf8_index is allowed to point one index past the end, but no - // further. - bool AdvanceToUtf8(int desired_utf8_index) { - if (desired_utf8_index > text_.length()) { - // Enforce the requirement. - return false; - } - // Need to work forwards. - while (utf8_index_ < desired_utf8_index) { - UChar32 uchar32 = - i18n_utils::GetUChar32At(text_.data(), text_.length(), utf8_index_); - if (uchar32 == i18n_utils::kInvalidUChar32) { - // Unable to retrieve a valid UTF-32 character at the previous position. - return false; - } - int utf8_length = i18n_utils::GetUtf8Length(uchar32); - if (utf8_index_ + utf8_length > desired_utf8_index) { - // Ah! Don't go too far! - break; - } - utf8_index_ += utf8_length; - utf16_index_ += i18n_utils::GetUtf16Length(uchar32); - } - return true; - } - - // Moves from current position to the character that includes the specified - // UTF-8 index. - // REQUIRES: 0 <= desired_utf8_index - bool RewindToUtf8(int desired_utf8_index) { - if (desired_utf8_index < 0) { - // Enforce the requirement. - return false; - } - // Need to work backwards. - while (utf8_index_ > desired_utf8_index) { - --utf8_index_; - utf8_index_ = GetUTF8StartPosition(text_, utf8_index_); - if (utf8_index_ < 0) { - // Somehow, there wasn't a single UTF-8 lead byte at - // requested_byte_index or an earlier byte. 
- return false; - } - // We've found the start of a unicode char! - UChar32 uchar32 = - i18n_utils::GetUChar32At(text_.data(), text_.length(), utf8_index_); - if (uchar32 == i18n_utils::kInvalidUChar32) { - // Unable to retrieve a valid UTF-32 character at the previous position. - return false; - } - utf16_index_ -= i18n_utils::GetUtf16Length(uchar32); - } - return true; - } - - // Advances current position to desired_utf16_index. - // REQUIRES: desired_utf16_index <= text_.utf16_length() - // desired_utf16_index is allowed to point one index past the end, but no - // further. - bool AdvanceToUtf16(int desired_utf16_index) { - while (utf16_index_ < desired_utf16_index) { - UChar32 uchar32 = - i18n_utils::GetUChar32At(text_.data(), text_.length(), utf8_index_); - if (uchar32 == i18n_utils::kInvalidUChar32) { - // Unable to retrieve a valid UTF-32 character at the previous position. - return false; - } - int utf16_length = i18n_utils::GetUtf16Length(uchar32); - if (utf16_index_ + utf16_length > desired_utf16_index) { - // Ah! Don't go too far! - break; - } - int utf8_length = i18n_utils::GetUtf8Length(uchar32); - if (utf8_index_ + utf8_length > text_.length()) { - // Enforce the requirement. - return false; - } - utf8_index_ += utf8_length; - utf16_index_ += utf16_length; - } - return true; - } - - // Rewinds current position to desired_utf16_index. - // REQUIRES: 0 <= desired_utf16_index - bool RewindToUtf16(int desired_utf16_index) { - if (desired_utf16_index < 0) { - return false; - } - while (utf16_index_ > desired_utf16_index) { - --utf8_index_; - utf8_index_ = GetUTF8StartPosition(text_, utf8_index_); - // We've found the start of a unicode char! - UChar32 uchar32 = - i18n_utils::GetUChar32At(text_.data(), text_.length(), utf8_index_); - if (uchar32 == i18n_utils::kInvalidUChar32) { - // Unable to retrieve a valid UTF-32 character at the previous position. 
- return false; - } - utf16_index_ -= i18n_utils::GetUtf16Length(uchar32); - } - return true; - } - - bool IsValidCharacter() const { - // Rule 1: all ASCII terms will be returned. - // We know it's a ASCII term by checking the first char. - if (i18n_utils::IsAscii(text_[utf8_index_])) { - return true; - } - - // Rule 2: for non-ASCII terms, only the alphabetic terms are returned. - // We know it's an alphabetic term by checking the first unicode character. - if (i18n_utils::IsAlphabeticAt(text_, utf8_index_)) { - return true; - } - - return false; - } - - int utf8_index() const { return utf8_index_; } - int utf16_index() const { return utf16_index_; } - - private: - std::string_view text_; - int utf8_index_; - int utf16_index_; -}; - -} // namespace - class ReverseJniLanguageSegmenterIterator : public LanguageSegmenter::Iterator { public: explicit ReverseJniLanguageSegmenterIterator( @@ -229,7 +78,7 @@ class ReverseJniLanguageSegmenterIterator : public LanguageSegmenter::Iterator { // Check if the current term is valid. We consider any term valid if its // first character is valid. If it's not valid, then we need to advance to // the next term. - if (term_start_.IsValidCharacter()) { + if (IsValidTerm()) { return true; } return Advance(); @@ -382,8 +231,7 @@ class ReverseJniLanguageSegmenterIterator : public LanguageSegmenter::Iterator { // 4. The start and end indices point to a segment, but we need to ensure // that this segment is 1) valid and 2) ends before offset. Otherwise, we'll // need a segment prior to this one. 
- if (term_end_exclusive_.utf8_index() > offset || - !term_start_.IsValidCharacter()) { + if (term_end_exclusive_.utf8_index() > offset || !IsValidTerm()) { return ResetToTermEndingBefore(term_start_.utf8_index()); } return term_start_.utf8_index(); @@ -414,6 +262,21 @@ class ReverseJniLanguageSegmenterIterator : public LanguageSegmenter::Iterator { /*utf16_index=*/ReverseJniBreakIterator::kDone); } + bool IsValidTerm() const { + // Rule 1: all ASCII terms will be returned. + // We know it's a ASCII term by checking the first char. + if (i18n_utils::IsAscii(text_[term_start_.utf8_index()])) { + return true; + } + + // Rule 2: for non-ASCII terms, only the alphabetic terms are returned. + // We know it's an alphabetic term by checking the first unicode character. + if (i18n_utils::IsAlphabeticAt(text_, term_start_.utf8_index())) { + return true; + } + return false; + } + // All of ReverseJniBreakIterator's functions return UTF-16 boundaries. So // this class needs to maintain state to convert between UTF-16 and UTF-8. 
std::unique_ptr<ReverseJniBreakIterator> break_iterator_; diff --git a/icing/tokenization/simple/space-language-segmenter_test.cc b/icing/tokenization/simple/space-language-segmenter_test.cc index 8ed38b2..6c5e3f6 100644 --- a/icing/tokenization/simple/space-language-segmenter_test.cc +++ b/icing/tokenization/simple/space-language-segmenter_test.cc @@ -18,6 +18,7 @@ #include "icing/testing/common-matchers.h" #include "icing/tokenization/language-segmenter-factory.h" #include "icing/tokenization/language-segmenter.h" +#include "unicode/uloc.h" namespace icing { namespace lib { @@ -28,21 +29,27 @@ using ::testing::Eq; using ::testing::IsEmpty; TEST(SpaceLanguageSegmenterTest, EmptyText) { - ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter, - language_segmenter_factory::Create()); + language_segmenter_factory::SegmenterOptions options(ULOC_US); + ICING_ASSERT_OK_AND_ASSIGN( + auto language_segmenter, + language_segmenter_factory::Create(std::move(options))); EXPECT_THAT(language_segmenter->GetAllTerms(""), IsOkAndHolds(IsEmpty())); } TEST(SpaceLanguageSegmenterTest, SimpleText) { - ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter, - language_segmenter_factory::Create()); + language_segmenter_factory::SegmenterOptions options(ULOC_US); + ICING_ASSERT_OK_AND_ASSIGN( + auto language_segmenter, + language_segmenter_factory::Create(std::move(options))); EXPECT_THAT(language_segmenter->GetAllTerms("Hello World"), IsOkAndHolds(ElementsAre("Hello", " ", "World"))); } TEST(SpaceLanguageSegmenterTest, Punctuation) { - ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter, - language_segmenter_factory::Create()); + language_segmenter_factory::SegmenterOptions options(ULOC_US); + ICING_ASSERT_OK_AND_ASSIGN( + auto language_segmenter, + language_segmenter_factory::Create(std::move(options))); EXPECT_THAT(language_segmenter->GetAllTerms("Hello, World!!!"), IsOkAndHolds(ElementsAre("Hello,", " ", "World!!!"))); @@ -55,8 +62,10 @@ TEST(SpaceLanguageSegmenterTest, Punctuation) { } 
TEST(SpaceLanguageSegmenterTest, Alphanumeric) { - ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter, - language_segmenter_factory::Create()); + language_segmenter_factory::SegmenterOptions options(ULOC_US); + ICING_ASSERT_OK_AND_ASSIGN( + auto language_segmenter, + language_segmenter_factory::Create(std::move(options))); // Alphanumeric terms are allowed EXPECT_THAT(language_segmenter->GetAllTerms("Se7en A4 3a"), @@ -64,8 +73,10 @@ TEST(SpaceLanguageSegmenterTest, Alphanumeric) { } TEST(SpaceLanguageSegmenterTest, Number) { - ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter, - language_segmenter_factory::Create()); + language_segmenter_factory::SegmenterOptions options(ULOC_US); + ICING_ASSERT_OK_AND_ASSIGN( + auto language_segmenter, + language_segmenter_factory::Create(std::move(options))); // Alphanumeric terms are allowed EXPECT_THAT( @@ -80,8 +91,10 @@ TEST(SpaceLanguageSegmenterTest, Number) { } TEST(SpaceLanguageSegmenterTest, ContinuousWhitespaces) { - ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter, - language_segmenter_factory::Create()); + language_segmenter_factory::SegmenterOptions options(ULOC_US); + ICING_ASSERT_OK_AND_ASSIGN( + auto language_segmenter, + language_segmenter_factory::Create(std::move(options))); // Multiple continuous whitespaces are treated as one. 
const int kNumSeparators = 256; @@ -92,8 +105,10 @@ TEST(SpaceLanguageSegmenterTest, ContinuousWhitespaces) { } TEST(SpaceLanguageSegmenterTest, NotCopyStrings) { - ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter, - language_segmenter_factory::Create()); + language_segmenter_factory::SegmenterOptions options(ULOC_US); + ICING_ASSERT_OK_AND_ASSIGN( + auto language_segmenter, + language_segmenter_factory::Create(std::move(options))); // Validates that the input strings are not copied const std::string text = "Hello World"; const char* word1_address = text.c_str(); diff --git a/icing/util/character-iterator.cc b/icing/util/character-iterator.cc new file mode 100644 index 0000000..3707f95 --- /dev/null +++ b/icing/util/character-iterator.cc @@ -0,0 +1,127 @@ +// Copyright (C) 2019 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "icing/util/character-iterator.h" + +namespace icing { +namespace lib { + +namespace { + +// Returns the lead byte of the UTF-8 character that includes the byte at +// current_byte_index within it. +int GetUTF8StartPosition(std::string_view text, int current_byte_index) { + while (!i18n_utils::IsLeadUtf8Byte(text[current_byte_index])) { + --current_byte_index; + } + return current_byte_index; +} + +} // namespace + +bool CharacterIterator::AdvanceToUtf8(int desired_utf8_index) { + if (desired_utf8_index > text_.length()) { + // Enforce the requirement. + return false; + } + // Need to work forwards. 
+ while (utf8_index_ < desired_utf8_index) { + UChar32 uchar32 = + i18n_utils::GetUChar32At(text_.data(), text_.length(), utf8_index_); + if (uchar32 == i18n_utils::kInvalidUChar32) { + // Unable to retrieve a valid UTF-32 character at the previous position. + return false; + } + int utf8_length = i18n_utils::GetUtf8Length(uchar32); + if (utf8_index_ + utf8_length > desired_utf8_index) { + // Ah! Don't go too far! + break; + } + utf8_index_ += utf8_length; + utf16_index_ += i18n_utils::GetUtf16Length(uchar32); + } + return true; +} + +bool CharacterIterator::RewindToUtf8(int desired_utf8_index) { + if (desired_utf8_index < 0) { + // Enforce the requirement. + return false; + } + // Need to work backwards. + while (utf8_index_ > desired_utf8_index) { + --utf8_index_; + utf8_index_ = GetUTF8StartPosition(text_, utf8_index_); + if (utf8_index_ < 0) { + // Somehow, there wasn't a single UTF-8 lead byte at + // requested_byte_index or an earlier byte. + return false; + } + // We've found the start of a unicode char! + UChar32 uchar32 = + i18n_utils::GetUChar32At(text_.data(), text_.length(), utf8_index_); + if (uchar32 == i18n_utils::kInvalidUChar32) { + // Unable to retrieve a valid UTF-32 character at the previous position. + return false; + } + utf16_index_ -= i18n_utils::GetUtf16Length(uchar32); + } + return true; +} + +bool CharacterIterator::AdvanceToUtf16(int desired_utf16_index) { + while (utf16_index_ < desired_utf16_index) { + UChar32 uchar32 = + i18n_utils::GetUChar32At(text_.data(), text_.length(), utf8_index_); + if (uchar32 == i18n_utils::kInvalidUChar32) { + // Unable to retrieve a valid UTF-32 character at the previous position. + return false; + } + int utf16_length = i18n_utils::GetUtf16Length(uchar32); + if (utf16_index_ + utf16_length > desired_utf16_index) { + // Ah! Don't go too far! + break; + } + int utf8_length = i18n_utils::GetUtf8Length(uchar32); + if (utf8_index_ + utf8_length > text_.length()) { + // Enforce the requirement. 
+ return false; + } + utf8_index_ += utf8_length; + utf16_index_ += utf16_length; + } + return true; +} + +bool CharacterIterator::RewindToUtf16(int desired_utf16_index) { + if (desired_utf16_index < 0) { + return false; + } + while (utf16_index_ > desired_utf16_index) { + --utf8_index_; + utf8_index_ = GetUTF8StartPosition(text_, utf8_index_); + // We've found the start of a unicode char! + UChar32 uchar32 = + i18n_utils::GetUChar32At(text_.data(), text_.length(), utf8_index_); + if (uchar32 == i18n_utils::kInvalidUChar32) { + // Unable to retrieve a valid UTF-32 character at the previous position. + return false; + } + utf16_index_ -= i18n_utils::GetUtf16Length(uchar32); + } + return true; +} + +} // namespace lib +} // namespace icing diff --git a/icing/util/character-iterator.h b/icing/util/character-iterator.h new file mode 100644 index 0000000..22de6c5 --- /dev/null +++ b/icing/util/character-iterator.h @@ -0,0 +1,70 @@ +// Copyright (C) 2019 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#ifndef ICING_UTIL_CHARACTER_ITERATOR_H_ +#define ICING_UTIL_CHARACTER_ITERATOR_H_ + +#include "icing/util/i18n-utils.h" + +namespace icing { +namespace lib { + +class CharacterIterator { + public: + explicit CharacterIterator(std::string_view text) + : CharacterIterator(text, 0, 0) {} + + CharacterIterator(std::string_view text, int utf8_index, int utf16_index) + : text_(text), utf8_index_(utf8_index), utf16_index_(utf16_index) {} + + // Moves from current position to the character that includes the specified + // UTF-8 index. + // REQUIRES: desired_utf8_index <= text_.length() + // desired_utf8_index is allowed to point one index past the end, but no + // further. + bool AdvanceToUtf8(int desired_utf8_index); + + // Moves from current position to the character that includes the specified + // UTF-8 index. + // REQUIRES: 0 <= desired_utf8_index + bool RewindToUtf8(int desired_utf8_index); + + // Advances current position to desired_utf16_index. + // REQUIRES: desired_utf16_index <= text_.utf16_length() + // desired_utf16_index is allowed to point one index past the end, but no + // further. + bool AdvanceToUtf16(int desired_utf16_index); + + // Rewinds current position to desired_utf16_index. 
+ // REQUIRES: 0 <= desired_utf16_index + bool RewindToUtf16(int desired_utf16_index); + + int utf8_index() const { return utf8_index_; } + int utf16_index() const { return utf16_index_; } + + bool operator==(const CharacterIterator& rhs) const { + return text_ == rhs.text_ && utf8_index_ == rhs.utf8_index_ && + utf16_index_ == rhs.utf16_index_; + } + + private: + std::string_view text_; + int utf8_index_; + int utf16_index_; +}; + +} // namespace lib +} // namespace icing + +#endif // ICING_UTIL_CHARACTER_ITERATOR_H_ diff --git a/icing/util/i18n-utils.cc b/icing/util/i18n-utils.cc index 9cf992f..d6754d5 100644 --- a/icing/util/i18n-utils.cc +++ b/icing/util/i18n-utils.cc @@ -99,16 +99,17 @@ void SafeTruncateUtf8(std::string* str, int truncate_to_length) { return; } - while (truncate_to_length > 0) { - if (IsLeadUtf8Byte(str->at(truncate_to_length))) { - str->resize(truncate_to_length); - return; + str->resize(SafeTruncateUtf8Length(str->c_str(), truncate_to_length)); +} + +int SafeTruncateUtf8Length(const char* str, int desired_length) { + while (desired_length > 0) { + if (IsLeadUtf8Byte(str[desired_length])) { + break; } - truncate_to_length--; + --desired_length; } - - // Truncates to an empty string - str->resize(0); + return desired_length; } bool IsAscii(char c) { return U8_IS_SINGLE((uint8_t)c); } diff --git a/icing/util/i18n-utils.h b/icing/util/i18n-utils.h index e103bab..82ae828 100644 --- a/icing/util/i18n-utils.h +++ b/icing/util/i18n-utils.h @@ -50,6 +50,13 @@ libtextclassifier3::StatusOr<std::u16string> Utf8ToUtf16( // Returns the char at the given position. UChar32 GetUChar32At(const char* data, int length, int position); +// Returns the safe position to truncate a UTF8 string at so that multi-byte +// UTF8 characters are not cut in the middle. The returned value will always be +// 0 <= val <= desired_length. 
+// +// REQUIRES: 0 <= desired_length < strlen(str) +int SafeTruncateUtf8Length(const char* str, int desired_length); + // Safely truncates a UTF8 string so that multi-byte UTF8 characters are not cut // in the middle. The string will be truncated in place. void SafeTruncateUtf8(std::string* str, int truncate_to_length); diff --git a/java/src/com/google/android/icing/IcingSearchEngine.java b/java/src/com/google/android/icing/IcingSearchEngine.java index 7be631c..76fa33d 100644 --- a/java/src/com/google/android/icing/IcingSearchEngine.java +++ b/java/src/com/google/android/icing/IcingSearchEngine.java @@ -328,6 +328,27 @@ public final class IcingSearchEngine { } @NonNull + public DeleteResultProto deleteByQuery(@NonNull SearchSpecProto searchSpec) { + byte[] deleteResultBytes = nativeDeleteByQuery(nativePointer, searchSpec.toByteArray()); + if (deleteResultBytes == null) { + Log.e(TAG, "Received null DeleteResultProto from native."); + return DeleteResultProto.newBuilder() + .setStatus(StatusProto.newBuilder().setCode(StatusProto.Code.INTERNAL)) + .build(); + } + + try { + return DeleteResultProto.parseFrom( + deleteResultBytes, EXTENSION_REGISTRY_LITE); + } catch (InvalidProtocolBufferException e) { + Log.e(TAG, "Error parsing DeleteResultProto.", e); + return DeleteResultProto.newBuilder() + .setStatus(StatusProto.newBuilder().setCode(StatusProto.Code.INTERNAL)) + .build(); + } + } + + @NonNull public PersistToDiskResultProto persistToDisk() { byte[] persistToDiskResultBytes = nativePersistToDisk(nativePointer); if (persistToDiskResultBytes == null) { @@ -438,6 +459,8 @@ public final class IcingSearchEngine { private static native byte[] nativeDeleteBySchemaType(long nativePointer, String schemaType); + private static native byte[] nativeDeleteByQuery(long nativePointer, byte[] searchSpecBytes); + private static native byte[] nativePersistToDisk(long nativePointer); private static native byte[] nativeOptimize(long nativePointer); diff --git 
a/java/tests/instrumentation/src/com/google/android/icing/IcingSearchEngineTest.java b/java/tests/instrumentation/src/com/google/android/icing/IcingSearchEngineTest.java index d907d4e..fb77d6e 100644 --- a/java/tests/instrumentation/src/com/google/android/icing/IcingSearchEngineTest.java +++ b/java/tests/instrumentation/src/com/google/android/icing/IcingSearchEngineTest.java @@ -335,6 +335,58 @@ public final class IcingSearchEngineTest { assertThat(getResultProto.getStatus().getCode()).isEqualTo(StatusProto.Code.NOT_FOUND); } + + @Test + public void testDeleteByQuery() throws Exception { + IcingSearchEngineOptions options = + IcingSearchEngineOptions.newBuilder().setBaseDir(filesDir).build(); + IcingSearchEngine icing = new IcingSearchEngine(options); + assertThat(icing.initialize().getStatus().getCode()).isEqualTo(StatusProto.Code.OK); + + SchemaTypeConfigProto emailTypeConfig = createEmailTypeConfig(); + SchemaProto schema = SchemaProto.newBuilder().addTypes(emailTypeConfig).build(); + assertThat( + icing + .setSchema(schema, /*ignoreErrorsAndDeleteDocuments=*/ false) + .getStatus() + .getCode()) + .isEqualTo(StatusProto.Code.OK); + + DocumentProto emailDocument1 = + createEmailDocument("namespace", "uri1").toBuilder() + .addProperties(PropertyProto.newBuilder().setName("subject").addStringValues("foo")) + .build();; + assertThat(icing.put(emailDocument1).getStatus().getCode()).isEqualTo(StatusProto.Code.OK); + DocumentProto emailDocument2 = + createEmailDocument("namespace", "uri2").toBuilder() + .addProperties(PropertyProto.newBuilder().setName("subject").addStringValues("bar")) + .build();; + assertThat(icing.put(emailDocument2).getStatus().getCode()).isEqualTo(StatusProto.Code.OK); + + SearchSpecProto searchSpec = + SearchSpecProto.newBuilder() + .setQuery("foo") + .setTermMatchType(TermMatchType.Code.PREFIX) + .build(); + + SearchResultProto searchResultProto = + icing.search( + searchSpec, + ScoringSpecProto.getDefaultInstance(), + 
ResultSpecProto.getDefaultInstance()); + assertThat(searchResultProto.getStatus().getCode()).isEqualTo(StatusProto.Code.OK); + assertThat(searchResultProto.getResultsCount()).isEqualTo(1); + assertThat(searchResultProto.getResults(0).getDocument()).isEqualTo(emailDocument1); + + DeleteResultProto deleteResultProto = icing.deleteByQuery(searchSpec); + assertThat(deleteResultProto.getStatus().getCode()).isEqualTo(StatusProto.Code.OK); + + GetResultProto getResultProto = icing.get("namespace", "uri1"); + assertThat(getResultProto.getStatus().getCode()).isEqualTo(StatusProto.Code.NOT_FOUND); + getResultProto = icing.get("namespace", "uri2"); + assertThat(getResultProto.getStatus().getCode()).isEqualTo(StatusProto.Code.OK); + } + @Test public void testPersistToDisk() throws Exception { IcingSearchEngineOptions options = diff --git a/proto/icing/proto/status.proto b/proto/icing/proto/status.proto index 2733a15..08677b0 100644 --- a/proto/icing/proto/status.proto +++ b/proto/icing/proto/status.proto @@ -24,7 +24,7 @@ option objc_class_prefix = "ICNG"; // Canonical status to indicate the results of API calls. // Next tag: 3 message StatusProto { - // Next tag: 9 + // Next tag: 10 enum Code { // A default for all other use-cases. Should never be used in practice. This // may happen if there are backwards-compatibility issues. @@ -62,6 +62,12 @@ message StatusProto { // make some space on the underlying filesystem. OUT_OF_SPACE = 8; + // An operation is invalid because the resource already exists and can't be + // replaced. For example, this status is used when a SchemaProto contains + // multiple definitions of the same type or multiple properties with the + // same name within a type. + ALREADY_EXISTS = 9; + // Any future status codes. 
} optional Code code = 1; diff --git a/proto/icing/proto/usage.proto b/proto/icing/proto/usage.proto new file mode 100644 index 0000000..81243f0 --- /dev/null +++ b/proto/icing/proto/usage.proto @@ -0,0 +1,53 @@ +// Copyright 2019 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +syntax = "proto2"; + +package icing.lib; + +option java_package = "com.google.android.icing.proto"; +option java_multiple_files = true; + +option objc_class_prefix = "ICNG"; + +// Representation of a usage report that is generated from the client and sent +// to Icing. +// Next tag: 5 +// LINT.IfChange +message UsageReport { + // Namespace of the document. + optional string document_namespace = 1; + + // Uri of the document. + optional string document_uri = 2; + + // Timestamp in milliseconds of when the usage happens. + optional int64 usage_timestamp_ms = 3; + + // Next tag: 3 + enum UsageType { + // A custom usage type that clients can assign a meaning to. UsageReports of + // the same type are combined to provide usage counts that clients may use + // in scoring. + USAGE_TYPE1 = 0; + + // Same as above. + USAGE_TYPE2 = 1; + + // Same as above. + USAGE_TYPE3 = 2; + } + optional UsageType usage_type = 4; +} +// LINT.ThenChange(//depot/google3/icing/store/usage-store.h:UsageScores) |