diff options
author | Alexander Dorokhine <adorokhine@google.com> | 2022-03-22 22:55:15 -0700 |
---|---|---|
committer | Tim Barron <tjbarron@google.com> | 2022-03-23 17:04:14 +0000 |
commit | 9ab600c39d0b5c87fc7dc4d8155d1efb535f1608 (patch) | |
tree | 93ed846d985900e348c166b14818348705d46ea9 | |
parent | 19600c2c36c5add7e7a792b7e4f742d45b3f871f (diff) | |
parent | c4f46ed536752b4c07f7696e65ff79c2d5086f3f (diff) | |
download | icing-9ab600c39d0b5c87fc7dc4d8155d1efb535f1608.tar.gz |
Merge remote-tracking branch 'goog/androidx-platform-dev' into tm-dev
* goog/androidx-platform-dev:
Sync from upstream.
Sync from upstream.
Sync from upstream.
Descriptions:
======================================================================
Add some additional logging that will help diagnose b/218413237
======================================================================
Mark VerbatimTokenizer::ResetToTokenStartingAfter as 'override'.
======================================================================
Support dump function for SchemaStore
======================================================================
Refactor DocumentStore::Initialize to improve readability of document store recovery.
======================================================================
Remove non-NDK API usages of ICU4C in libicing.
======================================================================
Move IcuDataFileHelper to the testing directory since it is a test-only util.
======================================================================
Support dump function for DocumentStore
======================================================================
Switch to use PRead rather than MMap in the proto log.
======================================================================
Support dump function for main/lite index and lexicon
======================================================================
Fix LiteIndex::AppendHits
======================================================================
Enable and fix DocumentStoreTest.LoadScoreCacheAndInitializeSuccessfully
======================================================================
Fix MainIndex::GetStorageInfo.
======================================================================
Fix icing-search-engine_fuzz_test by making IcuLanguageSegmenterIterator::Advance non-recursive.
======================================================================
Allow to return additional information for deleted documents in DeleteByQuery
======================================================================
Using enum class in Token::Type for better type safety.
======================================================================
Normalize Tokens by Token type when retrieving snippets
================
Rename max_window_bytes to max_window_utf32_length,
Delete the max_tokens_per_doc field in IcingSearchEngineOptions.
================
Handle suggestion namespace ownership.
================
Fix OkStatus() is not a valid argument to StatusOr in
Main_index.RetrieveMoreHits.
================
Allow advancing when current indices are negative in CharacterIterator
================
Adds support for verbatim tokenization and indexing in IcingLib
================
Renames TokenizerIterator Reset functions
================
Add term_match_type to SuggestionSpec proto
================
Unify the C++ proto enum style
================
Allow zero property weights in IcingLib
Bug: 152934343
Bug: 158089703
Bug: 185845269
Bug: 203700301
Bug: 204333391
Bug: 205209589
Bug: 206147728
Bug: 209071710
Bug: 209993976
Bug: 218413237
Bug: 218413237
Bug: 223549255
Test: Presubmit
Change-Id: I96665ba718f89e69ca99cd833ad80fa555edf436
90 files changed, 3432 insertions, 1355 deletions
diff --git a/icing/file/file-backed-bitmap.cc b/icing/file/file-backed-bitmap.cc index f1e568c..eec7668 100644 --- a/icing/file/file-backed-bitmap.cc +++ b/icing/file/file-backed-bitmap.cc @@ -50,7 +50,7 @@ FileBackedBitmap::Create(const Filesystem* filesystem, auto bitmap = std::unique_ptr<FileBackedBitmap>( new FileBackedBitmap(filesystem, file_path, mmap_strategy)); - // TODO(b/144458732): Implement a more robust version of TC_RETURN_IF_ERROR + // TODO(b/216487496): Implement a more robust version of TC_RETURN_IF_ERROR // that can support error logging. libtextclassifier3::Status status = bitmap->Initialize(); if (!status.ok()) { @@ -122,7 +122,7 @@ libtextclassifier3::Status FileBackedBitmap::FileBackedBitmap::Initialize() { << " of size: " << file_size; } - // TODO(b/144458732): Implement a more robust version of TC_RETURN_IF_ERROR + // TODO(b/216487496): Implement a more robust version of TC_RETURN_IF_ERROR // that can support error logging. libtextclassifier3::Status status = mmapper_->Remap(0, file_size); if (!status.ok()) { @@ -198,7 +198,7 @@ int FileBackedBitmap::NumBits() const { libtextclassifier3::Status FileBackedBitmap::Set(int bit_index, bool bit_value) { if (bit_index >= NumBits()) { - // TODO(b/144458732): Implement a more robust version of TC_RETURN_IF_ERROR + // TODO(b/216487496): Implement a more robust version of TC_RETURN_IF_ERROR // that can support error logging. libtextclassifier3::Status status = GrowTo(bit_index); if (!status.ok()) { @@ -261,7 +261,7 @@ libtextclassifier3::Status FileBackedBitmap::GrowTo(int new_num_bits) { file_path_.c_str(), new_file_size)); } - // TODO(b/144458732): Implement a more robust version of TC_RETURN_IF_ERROR + // TODO(b/216487496): Implement a more robust version of TC_RETURN_IF_ERROR // that can support error logging. 
libtextclassifier3::Status status = mmapper_->Remap(0, new_file_size); if (!status.ok()) { @@ -281,7 +281,7 @@ libtextclassifier3::Status FileBackedBitmap::TruncateTo(int new_num_bits) { } const size_t new_file_size = FileSizeForBits(new_num_bits); - // TODO(b/144458732): Implement a more robust version of TC_RETURN_IF_ERROR + // TODO(b/216487496): Implement a more robust version of TC_RETURN_IF_ERROR // that can support error logging. libtextclassifier3::Status status = mmapper_->Remap(0, new_file_size); if (!status.ok()) { diff --git a/icing/file/filesystem.h b/icing/file/filesystem.h index ca8c4a8..dd2c5d1 100644 --- a/icing/file/filesystem.h +++ b/icing/file/filesystem.h @@ -233,6 +233,11 @@ class Filesystem { // Increments to_increment by size if size is valid, or sets to_increment // to kBadFileSize if either size or to_increment is kBadFileSize. static void IncrementByOrSetInvalid(int64_t size, int64_t* to_increment); + + // Return -1 if file_size is invalid. Otherwise, return file_size. + static int64_t SanitizeFileSize(int64_t file_size) { + return (file_size != kBadFileSize) ? file_size : -1; + } }; // LINT.ThenChange(//depot/google3/icing/file/mock-filesystem.h) diff --git a/icing/file/portable-file-backed-proto-log.h b/icing/file/portable-file-backed-proto-log.h index f676dc5..409ab96 100644 --- a/icing/file/portable-file-backed-proto-log.h +++ b/icing/file/portable-file-backed-proto-log.h @@ -124,6 +124,8 @@ class PortableFileBackedProtoLog { public: static constexpr int32_t kMagic = 0xf4c6f67a; + // We should go directly from 0 to 2 the next time we have to change the + // format. static constexpr int32_t kFileFormatVersion = 0; uint32_t CalculateHeaderChecksum() const { @@ -282,7 +284,7 @@ class PortableFileBackedProtoLog { // before updating our checksum. 
bool recalculated_checksum = false; - bool has_data_loss() { + bool has_data_loss() const { return data_loss == DataLoss::PARTIAL || data_loss == DataLoss::COMPLETE; } }; @@ -376,8 +378,7 @@ class PortableFileBackedProtoLog { // } class Iterator { public: - Iterator(const Filesystem& filesystem, const std::string& file_path, - int64_t initial_offset); + Iterator(const Filesystem& filesystem, int fd, int64_t initial_offset); // Advances to the position of next proto whether it has been erased or not. // @@ -393,11 +394,12 @@ class PortableFileBackedProtoLog { private: static constexpr int64_t kInvalidOffset = -1; // Used to read proto metadata - MemoryMappedFile mmapped_file_; // Offset of first proto + const Filesystem* const filesystem_; int64_t initial_offset_; int64_t current_offset_; int64_t file_size_; + int fd_; }; // Returns an iterator of current proto log. The caller needs to keep the @@ -513,7 +515,7 @@ class PortableFileBackedProtoLog { const Filesystem* filesystem, const std::string& file_path, Crc32 initial_crc, int64_t start, int64_t end); - // Reads out the metadata of a proto located at file_offset from the file. + // Reads out the metadata of a proto located at file_offset from the fd. // Metadata will be returned in host byte order endianness. // // Returns: @@ -521,7 +523,8 @@ class PortableFileBackedProtoLog { // OUT_OF_RANGE_ERROR if file_offset exceeds file_size // INTERNAL_ERROR if the metadata is invalid or any IO errors happen static libtextclassifier3::StatusOr<int32_t> ReadProtoMetadata( - MemoryMappedFile* mmapped_file, int64_t file_offset, int64_t file_size); + const Filesystem* const filesystem, int fd, int64_t file_offset, + int64_t file_size); // Writes metadata of a proto to the fd. Takes in a host byte order endianness // metadata and converts it into a portable metadata before writing. 
@@ -937,35 +940,37 @@ template <typename ProtoT> libtextclassifier3::StatusOr<ProtoT> PortableFileBackedProtoLog<ProtoT>::ReadProto(int64_t file_offset) const { int64_t file_size = filesystem_->GetFileSize(fd_.get()); - MemoryMappedFile mmapped_file(*filesystem_, file_path_, - MemoryMappedFile::Strategy::READ_ONLY); - if (file_offset >= file_size) { - // file_size points to the next byte to write at, so subtract one to get - // the inclusive, actual size of file. - return absl_ports::OutOfRangeError( - IcingStringUtil::StringPrintf("Trying to read from a location, %lld, " - "out of range of the file size, %lld", - static_cast<long long>(file_offset), - static_cast<long long>(file_size - 1))); - } - // Read out the metadata + if (file_size == Filesystem::kBadFileSize) { + return absl_ports::OutOfRangeError("Unable to correctly read size."); + } ICING_ASSIGN_OR_RETURN( int32_t metadata, - ReadProtoMetadata(&mmapped_file, file_offset, file_size)); + ReadProtoMetadata(filesystem_, fd_.get(), file_offset, file_size)); // Copy out however many bytes it says the proto is int stored_size = GetProtoSize(metadata); + file_offset += sizeof(metadata); - ICING_RETURN_IF_ERROR( - mmapped_file.Remap(file_offset + sizeof(metadata), stored_size)); + // Read the compressed proto out. 
+ if (file_offset + stored_size > file_size) { + return absl_ports::OutOfRangeError( + IcingStringUtil::StringPrintf("Trying to read from a location, %lld, " + "out of range of the file size, %lld", + static_cast<long long>(file_offset), + static_cast<long long>(file_size - 1))); + } + auto buf = std::make_unique<char[]>(stored_size); + if (!filesystem_->PRead(fd_.get(), buf.get(), stored_size, file_offset)) { + return absl_ports::InternalError(""); + } - if (IsEmptyBuffer(mmapped_file.region(), mmapped_file.region_size())) { + if (IsEmptyBuffer(buf.get(), stored_size)) { return absl_ports::NotFoundError("The proto data has been erased."); } - google::protobuf::io::ArrayInputStream proto_stream( - mmapped_file.mutable_region(), stored_size); + google::protobuf::io::ArrayInputStream proto_stream(buf.get(), + stored_size); // Deserialize proto ProtoT proto; @@ -983,33 +988,29 @@ template <typename ProtoT> libtextclassifier3::Status PortableFileBackedProtoLog<ProtoT>::EraseProto( int64_t file_offset) { int64_t file_size = filesystem_->GetFileSize(fd_.get()); - if (file_offset >= file_size) { - // file_size points to the next byte to write at, so subtract one to get - // the inclusive, actual size of file. 
- return absl_ports::OutOfRangeError(IcingStringUtil::StringPrintf( - "Trying to erase data at a location, %lld, " - "out of range of the file size, %lld", - static_cast<long long>(file_offset), - static_cast<long long>(file_size - 1))); + if (file_size == Filesystem::kBadFileSize) { + return absl_ports::OutOfRangeError("Unable to correctly read size."); } - MemoryMappedFile mmapped_file( - *filesystem_, file_path_, - MemoryMappedFile::Strategy::READ_WRITE_AUTO_SYNC); - - // Read out the metadata ICING_ASSIGN_OR_RETURN( int32_t metadata, - ReadProtoMetadata(&mmapped_file, file_offset, file_size)); - - ICING_RETURN_IF_ERROR(mmapped_file.Remap(file_offset + sizeof(metadata), - GetProtoSize(metadata))); + ReadProtoMetadata(filesystem_, fd_.get(), file_offset, file_size)); + // Copy out however many bytes it says the proto is + int stored_size = GetProtoSize(metadata); + file_offset += sizeof(metadata); + if (file_offset + stored_size > file_size) { + return absl_ports::OutOfRangeError( + IcingStringUtil::StringPrintf("Trying to read from a location, %lld, " + "out of range of the file size, %lld", + static_cast<long long>(file_offset), + static_cast<long long>(file_size - 1))); + } + auto buf = std::make_unique<char[]>(stored_size); // We need to update the crc checksum if the erased area is before the // rewind position. int32_t new_crc; - int64_t erased_proto_offset = file_offset + sizeof(metadata); - if (erased_proto_offset < header_->GetRewindOffset()) { + if (file_offset < header_->GetRewindOffset()) { // Set to "dirty" before we start writing anything. header_->SetDirtyFlag(true); header_->SetHeaderChecksum(header_->CalculateHeaderChecksum()); @@ -1022,24 +1023,30 @@ libtextclassifier3::Status PortableFileBackedProtoLog<ProtoT>::EraseProto( // We need to calculate [original string xor 0s]. // The xored string is the same as the original string because 0 xor 0 = // 0, 1 xor 0 = 1. 
- const std::string_view xored_str(mmapped_file.region(), - mmapped_file.region_size()); + // Read the compressed proto out. + if (!filesystem_->PRead(fd_.get(), buf.get(), stored_size, file_offset)) { + return absl_ports::InternalError(""); + } + const std::string_view xored_str(buf.get(), stored_size); Crc32 crc(header_->GetLogChecksum()); ICING_ASSIGN_OR_RETURN( - new_crc, crc.UpdateWithXor( - xored_str, - /*full_data_size=*/header_->GetRewindOffset() - - kHeaderReservedBytes, - /*position=*/erased_proto_offset - kHeaderReservedBytes)); + new_crc, + crc.UpdateWithXor(xored_str, + /*full_data_size=*/header_->GetRewindOffset() - + kHeaderReservedBytes, + /*position=*/file_offset - kHeaderReservedBytes)); } // Clear the region. - memset(mmapped_file.mutable_region(), '\0', mmapped_file.region_size()); + memset(buf.get(), '\0', stored_size); + if (!filesystem_->PWrite(fd_.get(), file_offset, buf.get(), stored_size)) { + return absl_ports::InternalError(""); + } // If we cleared something in our checksummed area, we should update our // checksum and reset our dirty bit. 
- if (erased_proto_offset < header_->GetRewindOffset()) { + if (file_offset < header_->GetRewindOffset()) { header_->SetDirtyFlag(false); header_->SetLogChecksum(new_crc); header_->SetHeaderChecksum(header_->CalculateHeaderChecksum()); @@ -1077,13 +1084,12 @@ PortableFileBackedProtoLog<ProtoT>::GetElementsFileSize() const { template <typename ProtoT> PortableFileBackedProtoLog<ProtoT>::Iterator::Iterator( - const Filesystem& filesystem, const std::string& file_path, - int64_t initial_offset) - : mmapped_file_(filesystem, file_path, - MemoryMappedFile::Strategy::READ_ONLY), + const Filesystem& filesystem, int fd, int64_t initial_offset) + : filesystem_(&filesystem), initial_offset_(initial_offset), current_offset_(kInvalidOffset), - file_size_(filesystem.GetFileSize(file_path.c_str())) { + fd_(fd) { + file_size_ = filesystem_->GetFileSize(fd_); if (file_size_ == Filesystem::kBadFileSize) { // Fails all Advance() calls file_size_ = 0; @@ -1100,7 +1106,7 @@ PortableFileBackedProtoLog<ProtoT>::Iterator::Advance() { // Jumps to the next proto position ICING_ASSIGN_OR_RETURN( int32_t metadata, - ReadProtoMetadata(&mmapped_file_, current_offset_, file_size_)); + ReadProtoMetadata(filesystem_, fd_, current_offset_, file_size_)); current_offset_ += sizeof(metadata) + GetProtoSize(metadata); } @@ -1122,14 +1128,15 @@ int64_t PortableFileBackedProtoLog<ProtoT>::Iterator::GetOffset() { template <typename ProtoT> typename PortableFileBackedProtoLog<ProtoT>::Iterator PortableFileBackedProtoLog<ProtoT>::GetIterator() { - return Iterator(*filesystem_, file_path_, + return Iterator(*filesystem_, fd_.get(), /*initial_offset=*/kHeaderReservedBytes); } template <typename ProtoT> libtextclassifier3::StatusOr<int32_t> PortableFileBackedProtoLog<ProtoT>::ReadProtoMetadata( - MemoryMappedFile* mmapped_file, int64_t file_offset, int64_t file_size) { + const Filesystem* const filesystem, int fd, int64_t file_offset, + int64_t file_size) { // Checks file_offset if (file_offset >= file_size) 
{ return absl_ports::OutOfRangeError(IcingStringUtil::StringPrintf( @@ -1147,9 +1154,9 @@ PortableFileBackedProtoLog<ProtoT>::ReadProtoMetadata( static_cast<long long>(file_size))); } - // Reads metadata - ICING_RETURN_IF_ERROR(mmapped_file->Remap(file_offset, metadata_size)); - memcpy(&portable_metadata, mmapped_file->region(), metadata_size); + if (!filesystem->PRead(fd, &portable_metadata, metadata_size, file_offset)) { + return absl_ports::InternalError(""); + } // Need to switch it back to host order endianness after reading from disk. int32_t host_order_metadata = GNetworkToHostL(portable_metadata); diff --git a/icing/file/portable-file-backed-proto-log_benchmark.cc b/icing/file/portable-file-backed-proto-log_benchmark.cc index f83ccd6..80a8011 100644 --- a/icing/file/portable-file-backed-proto-log_benchmark.cc +++ b/icing/file/portable-file-backed-proto-log_benchmark.cc @@ -55,7 +55,7 @@ namespace lib { namespace { -static void BM_Write(benchmark::State& state) { +void BM_Write(benchmark::State& state) { const Filesystem filesystem; int string_length = state.range(0); const std::string file_path = IcingStringUtil::StringPrintf( @@ -108,7 +108,7 @@ BENCHMARK(BM_Write) // 16MiB, and we need some extra space for the // rest of the document properties -static void BM_Read(benchmark::State& state) { +void BM_Read(benchmark::State& state) { const Filesystem filesystem; int string_length = state.range(0); const std::string file_path = IcingStringUtil::StringPrintf( @@ -164,7 +164,7 @@ BENCHMARK(BM_Read) // 16MiB, and we need some extra space for the // rest of the document properties // -static void BM_Erase(benchmark::State& state) { +void BM_Erase(benchmark::State& state) { const Filesystem filesystem; const std::string file_path = IcingStringUtil::StringPrintf( "%s%s", GetTestTempDir().c_str(), "/proto.log"); @@ -204,7 +204,7 @@ static void BM_Erase(benchmark::State& state) { } BENCHMARK(BM_Erase); -static void BM_ComputeChecksum(benchmark::State& state) { +void 
BM_ComputeChecksum(benchmark::State& state) { const Filesystem filesystem; const std::string file_path = GetTestTempDir() + "/proto.log"; int max_proto_size = (1 << 24) - 1; // 16 MiB @@ -246,7 +246,7 @@ static void BM_ComputeChecksum(benchmark::State& state) { } BENCHMARK(BM_ComputeChecksum)->Range(1024, 1 << 20); -static void BM_ComputeChecksumWithCachedChecksum(benchmark::State& state) { +void BM_ComputeChecksumWithCachedChecksum(benchmark::State& state) { const Filesystem filesystem; const std::string file_path = GetTestTempDir() + "/proto.log"; int max_proto_size = (1 << 24) - 1; // 16 MiB @@ -290,7 +290,7 @@ static void BM_ComputeChecksumWithCachedChecksum(benchmark::State& state) { } BENCHMARK(BM_ComputeChecksumWithCachedChecksum); -static void BM_ComputeChecksumOnlyForTail(benchmark::State& state) { +void BM_ComputeChecksumOnlyForTail(benchmark::State& state) { const Filesystem filesystem; const std::string file_path = GetTestTempDir() + "/proto.log"; int max_proto_size = (1 << 24) - 1; // 16 MiB diff --git a/icing/file/portable-file-backed-proto-log_test.cc b/icing/file/portable-file-backed-proto-log_test.cc index b5fee4b..795271a 100644 --- a/icing/file/portable-file-backed-proto-log_test.cc +++ b/icing/file/portable-file-backed-proto-log_test.cc @@ -851,11 +851,12 @@ TEST_F(PortableFileBackedProtoLogTest, Iterator) { { // Iterator with bad filesystem + ScopedFd sfd(filesystem_.OpenForRead(file_path_.c_str())); MockFilesystem mock_filesystem; - ON_CALL(mock_filesystem, GetFileSize(A<const char*>())) + ON_CALL(mock_filesystem, GetFileSize(A<int>())) .WillByDefault(Return(Filesystem::kBadFileSize)); PortableFileBackedProtoLog<DocumentProto>::Iterator bad_iterator( - mock_filesystem, file_path_, /*initial_offset=*/0); + mock_filesystem, sfd.get(), /*initial_offset=*/0); ASSERT_THAT(bad_iterator.Advance(), StatusIs(libtextclassifier3::StatusCode::OUT_OF_RANGE)); } diff --git a/icing/icing-search-engine-with-icu-file_test.cc 
b/icing/icing-search-engine-with-icu-file_test.cc index 48e81e5..1012b47 100644 --- a/icing/icing-search-engine-with-icu-file_test.cc +++ b/icing/icing-search-engine-with-icu-file_test.cc @@ -37,13 +37,13 @@ namespace { using ::icing::lib::portable_equals_proto::EqualsProto; using ::testing::Eq; -constexpr PropertyConfigProto_Cardinality_Code CARDINALITY_REQUIRED = - PropertyConfigProto_Cardinality_Code_REQUIRED; +constexpr PropertyConfigProto::Cardinality::Code CARDINALITY_REQUIRED = + PropertyConfigProto::Cardinality::REQUIRED; -constexpr StringIndexingConfig_TokenizerType_Code TOKENIZER_PLAIN = - StringIndexingConfig_TokenizerType_Code_PLAIN; +constexpr StringIndexingConfig::TokenizerType::Code TOKENIZER_PLAIN = + StringIndexingConfig::TokenizerType::PLAIN; -constexpr TermMatchType_Code MATCH_PREFIX = TermMatchType_Code_PREFIX; +constexpr TermMatchType::Code MATCH_PREFIX = TermMatchType::PREFIX; std::string GetTestBaseDir() { return GetTestTempDir() + "/icing_with_icu_files"; diff --git a/icing/icing-search-engine.cc b/icing/icing-search-engine.cc index 9aa833b..952ba21 100644 --- a/icing/icing-search-engine.cc +++ b/icing/icing-search-engine.cc @@ -18,6 +18,7 @@ #include <memory> #include <string> #include <string_view> +#include <unordered_map> #include <utility> #include <vector> @@ -59,6 +60,7 @@ #include "icing/scoring/scoring-processor.h" #include "icing/store/document-id.h" #include "icing/store/document-store.h" +#include "icing/store/namespace-checker-impl.h" #include "icing/tokenization/language-segmenter-factory.h" #include "icing/tokenization/language-segmenter.h" #include "icing/transform/normalizer-factory.h" @@ -87,17 +89,22 @@ constexpr std::string_view kOptimizeStatusFilename = "optimize_status"; // fresh state. 
constexpr int kMaxUnsuccessfulInitAttempts = 5; -libtextclassifier3::Status ValidateOptions( - const IcingSearchEngineOptions& options) { - // These options are only used in IndexProcessor, which won't be created - // until the first Put call. So they must be checked here, so that any - // errors can be surfaced in Initialize. - if (options.max_tokens_per_doc() <= 0) { - return absl_ports::InvalidArgumentError( - "Options::max_tokens_per_doc must be greater than zero."); +// A pair that holds namespace and type. +struct NamespaceTypePair { + std::string namespace_; + std::string type; + + bool operator==(const NamespaceTypePair& other) const { + return namespace_ == other.namespace_ && type == other.type; } - return libtextclassifier3::Status::OK; -} +}; + +struct NamespaceTypePairHasher { + std::size_t operator()(const NamespaceTypePair& pair) const { + return std::hash<std::string>()(pair.namespace_) ^ + std::hash<std::string>()(pair.type); + } +}; libtextclassifier3::Status ValidateResultSpec( const ResultSpecProto& result_spec) { @@ -142,6 +149,11 @@ libtextclassifier3::Status ValidateSuggestionSpec( return absl_ports::InvalidArgumentError( absl_ports::StrCat("SuggestionSpecProto.prefix is empty!")); } + if (suggestion_spec.scoring_spec().scoring_match_type() == + TermMatchType::UNKNOWN) { + return absl_ports::InvalidArgumentError( + absl_ports::StrCat("SuggestionSpecProto.term_match_type is unknown!")); + } if (suggestion_spec.num_to_return() <= 0) { return absl_ports::InvalidArgumentError(absl_ports::StrCat( "SuggestionSpecProto.num_to_return must be positive.")); @@ -261,6 +273,28 @@ void TransformStatus(const libtextclassifier3::Status& internal_status, status_proto->set_message(internal_status.error_message()); } +libtextclassifier3::Status RetrieveAndAddDocumentInfo( + const DocumentStore* document_store, DeleteByQueryResultProto& result_proto, + std::unordered_map<NamespaceTypePair, + DeleteByQueryResultProto::DocumentGroupInfo*, + 
NamespaceTypePairHasher>& info_map, + DocumentId document_id) { + ICING_ASSIGN_OR_RETURN(DocumentProto document, + document_store->Get(document_id)); + NamespaceTypePair key = {document.namespace_(), document.schema()}; + auto iter = info_map.find(key); + if (iter == info_map.end()) { + auto entry = result_proto.add_deleted_documents(); + entry->set_namespace_(std::move(document.namespace_())); + entry->set_schema(std::move(document.schema())); + entry->add_uris(std::move(document.uri())); + info_map[key] = entry; + } else { + iter->second->add_uris(std::move(document.uri())); + } + return libtextclassifier3::Status::OK; +} + } // namespace IcingSearchEngine::IcingSearchEngine(const IcingSearchEngineOptions& options, @@ -399,7 +433,6 @@ InitializeResultProto IcingSearchEngine::InternalInitialize() { libtextclassifier3::Status IcingSearchEngine::InitializeMembers( InitializeStatsProto* initialize_stats) { ICING_RETURN_ERROR_IF_NULL(initialize_stats); - ICING_RETURN_IF_ERROR(ValidateOptions(options_)); // Make sure the base directory exists if (!filesystem_->CreateDirectoryRecursively(options_.base_dir().c_str())) { @@ -450,8 +483,6 @@ libtextclassifier3::Status IcingSearchEngine::InitializeMembers( // last tried to set the schema. ICING_RETURN_IF_ERROR(InitializeDocumentStore( /*force_recovery_and_revalidate_documents=*/true, initialize_stats)); - initialize_stats->set_document_store_recovery_cause( - InitializeStatsProto::SCHEMA_CHANGES_OUT_OF_SYNC); // We're going to need to build the index from scratch. So just delete its // files now. @@ -941,7 +972,7 @@ DeleteResultProto IcingSearchEngine::Delete(const std::string_view name_space, delete_stats->set_delete_type(DeleteStatsProto::DeleteType::SINGLE); std::unique_ptr<Timer> delete_timer = clock_->GetNewTimer(); - // TODO(b/144458732): Implement a more robust version of TC_RETURN_IF_ERROR + // TODO(b/216487496): Implement a more robust version of TC_RETURN_IF_ERROR // that can support error logging. 
libtextclassifier3::Status status = document_store_->Delete(name_space, uri); if (!status.ok()) { @@ -975,7 +1006,7 @@ DeleteByNamespaceResultProto IcingSearchEngine::DeleteByNamespace( delete_stats->set_delete_type(DeleteStatsProto::DeleteType::NAMESPACE); std::unique_ptr<Timer> delete_timer = clock_->GetNewTimer(); - // TODO(b/144458732): Implement a more robust version of TC_RETURN_IF_ERROR + // TODO(b/216487496): Implement a more robust version of TC_RETURN_IF_ERROR // that can support error logging. DocumentStore::DeleteByGroupResult doc_store_result = document_store_->DeleteByNamespace(name_space); @@ -1009,7 +1040,7 @@ DeleteBySchemaTypeResultProto IcingSearchEngine::DeleteBySchemaType( delete_stats->set_delete_type(DeleteStatsProto::DeleteType::SCHEMA_TYPE); std::unique_ptr<Timer> delete_timer = clock_->GetNewTimer(); - // TODO(b/144458732): Implement a more robust version of TC_RETURN_IF_ERROR + // TODO(b/216487496): Implement a more robust version of TC_RETURN_IF_ERROR // that can support error logging. DocumentStore::DeleteByGroupResult doc_store_result = document_store_->DeleteBySchemaType(schema_type); @@ -1027,7 +1058,7 @@ DeleteBySchemaTypeResultProto IcingSearchEngine::DeleteBySchemaType( } DeleteByQueryResultProto IcingSearchEngine::DeleteByQuery( - const SearchSpecProto& search_spec) { + const SearchSpecProto& search_spec, bool return_deleted_document_info) { ICING_VLOG(1) << "Deleting documents for query " << search_spec.query() << " from doc store"; @@ -1081,12 +1112,27 @@ DeleteByQueryResultProto IcingSearchEngine::DeleteByQuery( ICING_VLOG(2) << "Deleting the docs that matched the query."; int num_deleted = 0; + // A map used to group deleted documents. + // From the (namespace, type) pair to a list of uris. 
+ std::unordered_map<NamespaceTypePair, + DeleteByQueryResultProto::DocumentGroupInfo*, + NamespaceTypePairHasher> + deleted_info_map; component_timer = clock_->GetNewTimer(); while (query_results.root_iterator->Advance().ok()) { ICING_VLOG(3) << "Deleting doc " << query_results.root_iterator->doc_hit_info().document_id(); ++num_deleted; + if (return_deleted_document_info) { + status = RetrieveAndAddDocumentInfo( + document_store_.get(), result_proto, deleted_info_map, + query_results.root_iterator->doc_hit_info().document_id()); + if (!status.ok()) { + TransformStatus(status, result_status); + return result_proto; + } + } status = document_store_->Delete( query_results.root_iterator->doc_hit_info().document_id()); if (!status.ok()) { @@ -1155,12 +1201,8 @@ OptimizeResultProto IcingSearchEngine::Optimize() { std::unique_ptr<Timer> optimize_timer = clock_->GetNewTimer(); OptimizeStatsProto* optimize_stats = result_proto.mutable_optimize_stats(); int64_t before_size = filesystem_->GetDiskUsage(options_.base_dir().c_str()); - if (before_size != Filesystem::kBadFileSize) { - optimize_stats->set_storage_size_before(before_size); - } else { - // Set -1 as a sentinel value when failures occur. - optimize_stats->set_storage_size_before(-1); - } + optimize_stats->set_storage_size_before( + Filesystem::SanitizeFileSize(before_size)); // Flushes data to disk before doing optimization auto status = InternalPersistToDisk(PersistType::FULL); @@ -1237,12 +1279,8 @@ OptimizeResultProto IcingSearchEngine::Optimize() { optimize_status_file.Write(std::move(optimize_status)); int64_t after_size = filesystem_->GetDiskUsage(options_.base_dir().c_str()); - if (after_size != Filesystem::kBadFileSize) { - optimize_stats->set_storage_size_after(after_size); - } else { - // Set -1 as a sentinel value when failures occur. 
- optimize_stats->set_storage_size_after(-1); - } + optimize_stats->set_storage_size_after( + Filesystem::SanitizeFileSize(after_size)); optimize_stats->set_latency_ms(optimize_timer->GetElapsedMilliseconds()); TransformStatus(optimization_status, result_status); @@ -1324,11 +1362,8 @@ StorageInfoResultProto IcingSearchEngine::GetStorageInfo() { } int64_t index_size = filesystem_->GetDiskUsage(options_.base_dir().c_str()); - if (index_size != Filesystem::kBadFileSize) { - result.mutable_storage_info()->set_total_storage_size(index_size); - } else { - result.mutable_storage_info()->set_total_storage_size(-1); - } + result.mutable_storage_info()->set_total_storage_size( + Filesystem::SanitizeFileSize(index_size)); *result.mutable_storage_info()->mutable_document_storage_info() = document_store_->GetStorageInfo(); *result.mutable_storage_info()->mutable_schema_store_storage_info() = @@ -1875,19 +1910,22 @@ SuggestionResponse IcingSearchEngine::SearchSuggestions( std::unique_ptr<SuggestionProcessor> suggestion_processor = std::move(suggestion_processor_or).ValueOrDie(); - std::vector<NamespaceId> namespace_ids; + std::unordered_set<NamespaceId> namespace_ids; namespace_ids.reserve(suggestion_spec.namespace_filters_size()); for (std::string_view name_space : suggestion_spec.namespace_filters()) { auto namespace_id_or = document_store_->GetNamespaceId(name_space); if (!namespace_id_or.ok()) { continue; } - namespace_ids.push_back(namespace_id_or.ValueOrDie()); + namespace_ids.insert(namespace_id_or.ValueOrDie()); } // Run suggestion based on given SuggestionSpec. 
+ NamespaceCheckerImpl namespace_checker_impl(document_store_.get(), + std::move(namespace_ids)); libtextclassifier3::StatusOr<std::vector<TermMetadata>> terms_or = - suggestion_processor->QuerySuggestions(suggestion_spec, namespace_ids); + suggestion_processor->QuerySuggestions(suggestion_spec, + &namespace_checker_impl); if (!terms_or.ok()) { TransformStatus(terms_or.status(), response_status); return response; diff --git a/icing/icing-search-engine.h b/icing/icing-search-engine.h index 0a79714..ff9c7fb 100644 --- a/icing/icing-search-engine.h +++ b/icing/icing-search-engine.h @@ -280,8 +280,9 @@ class IcingSearchEngine { // NOT_FOUND if the query doesn't match any documents // FAILED_PRECONDITION IcingSearchEngine has not been initialized yet // INTERNAL_ERROR on IO error - DeleteByQueryResultProto DeleteByQuery(const SearchSpecProto& search_spec) - ICING_LOCKS_EXCLUDED(mutex_); + DeleteByQueryResultProto DeleteByQuery( + const SearchSpecProto& search_spec, + bool return_deleted_document_info = false) ICING_LOCKS_EXCLUDED(mutex_); // Retrieves, scores, ranks, and returns the results according to the specs. // Results can be empty. 
If there're multiple pages of results, diff --git a/icing/icing-search-engine_fuzz_test.cc b/icing/icing-search-engine_fuzz_test.cc index 2d07e37..bf486da 100644 --- a/icing/icing-search-engine_fuzz_test.cc +++ b/icing/icing-search-engine_fuzz_test.cc @@ -18,12 +18,12 @@ #include "icing/text_classifier/lib3/utils/base/status.h" #include "icing/text_classifier/lib3/utils/base/statusor.h" #include "icing/document-builder.h" -#include "icing/helpers/icu/icu-data-file-helper.h" #include "icing/icing-search-engine.h" #include "icing/proto/document.pb.h" #include "icing/proto/initialize.pb.h" #include "icing/proto/scoring.pb.h" #include "icing/schema-builder.h" +#include "icing/testing/icu-data-file-helper.h" #include "icing/testing/test-data.h" #include "icing/testing/tmp-directory.h" @@ -31,13 +31,13 @@ namespace icing { namespace lib { namespace { -constexpr PropertyConfigProto_Cardinality_Code CARDINALITY_REQUIRED = - PropertyConfigProto_Cardinality_Code_REQUIRED; +constexpr PropertyConfigProto::Cardinality::Code CARDINALITY_REQUIRED = + PropertyConfigProto::Cardinality::REQUIRED; -constexpr StringIndexingConfig_TokenizerType_Code TOKENIZER_PLAIN = - StringIndexingConfig_TokenizerType_Code_PLAIN; +constexpr StringIndexingConfig::TokenizerType::Code TOKENIZER_PLAIN = + StringIndexingConfig::TokenizerType::PLAIN; -constexpr TermMatchType_Code MATCH_PREFIX = TermMatchType_Code_PREFIX; +constexpr TermMatchType::Code MATCH_PREFIX = TermMatchType::PREFIX; IcingSearchEngineOptions Setup() { IcingSearchEngineOptions icing_options; diff --git a/icing/icing-search-engine_test.cc b/icing/icing-search-engine_test.cc index b5206cd..7ed8885 100644 --- a/icing/icing-search-engine_test.cc +++ b/icing/icing-search-engine_test.cc @@ -27,7 +27,6 @@ #include "icing/document-builder.h" #include "icing/file/filesystem.h" #include "icing/file/mock-filesystem.h" -#include "icing/helpers/icu/icu-data-file-helper.h" #include "icing/legacy/index/icing-mock-filesystem.h" #include 
"icing/portable/endian.h" #include "icing/portable/equals-proto.h" @@ -46,6 +45,7 @@ #include "icing/store/document-log-creator.h" #include "icing/testing/common-matchers.h" #include "icing/testing/fake-clock.h" +#include "icing/testing/icu-data-file-helper.h" #include "icing/testing/jni-test-helpers.h" #include "icing/testing/random-string.h" #include "icing/testing/snippet-helpers.h" @@ -90,24 +90,24 @@ constexpr std::string_view kIpsumText = "vehicula posuere vitae, convallis eu lorem. Donec semper augue eu nibh " "placerat semper."; -constexpr PropertyConfigProto_Cardinality_Code CARDINALITY_OPTIONAL = - PropertyConfigProto_Cardinality_Code_OPTIONAL; -constexpr PropertyConfigProto_Cardinality_Code CARDINALITY_REQUIRED = - PropertyConfigProto_Cardinality_Code_REQUIRED; -constexpr PropertyConfigProto_Cardinality_Code CARDINALITY_REPEATED = - PropertyConfigProto_Cardinality_Code_REPEATED; +constexpr PropertyConfigProto::Cardinality::Code CARDINALITY_OPTIONAL = + PropertyConfigProto::Cardinality::OPTIONAL; +constexpr PropertyConfigProto::Cardinality::Code CARDINALITY_REQUIRED = + PropertyConfigProto::Cardinality::REQUIRED; +constexpr PropertyConfigProto::Cardinality::Code CARDINALITY_REPEATED = + PropertyConfigProto::Cardinality::REPEATED; -constexpr StringIndexingConfig_TokenizerType_Code TOKENIZER_PLAIN = - StringIndexingConfig_TokenizerType_Code_PLAIN; -constexpr StringIndexingConfig_TokenizerType_Code TOKENIZER_NONE = - StringIndexingConfig_TokenizerType_Code_NONE; +constexpr StringIndexingConfig::TokenizerType::Code TOKENIZER_PLAIN = + StringIndexingConfig::TokenizerType::PLAIN; +constexpr StringIndexingConfig::TokenizerType::Code TOKENIZER_NONE = + StringIndexingConfig::TokenizerType::NONE; #ifndef ICING_JNI_TEST -constexpr TermMatchType_Code MATCH_EXACT = TermMatchType_Code_EXACT_ONLY; +constexpr TermMatchType::Code MATCH_EXACT = TermMatchType::EXACT_ONLY; #endif // !ICING_JNI_TEST -constexpr TermMatchType_Code MATCH_PREFIX = TermMatchType_Code_PREFIX; 
-constexpr TermMatchType_Code MATCH_NONE = TermMatchType_Code_UNKNOWN; +constexpr TermMatchType::Code MATCH_PREFIX = TermMatchType::PREFIX; +constexpr TermMatchType::Code MATCH_NONE = TermMatchType::UNKNOWN; PortableFileBackedProtoLog<DocumentWrapper>::Header ReadDocumentLogHeader( Filesystem filesystem, const std::string& file_path) { @@ -362,36 +362,6 @@ TEST_F(IcingSearchEngineTest, GoodIndexMergeSizeReturnsOk) { EXPECT_THAT(icing.Initialize().status(), ProtoIsOk()); } -TEST_F(IcingSearchEngineTest, - NegativeMaxTokensPerDocSizeReturnsInvalidArgument) { - IcingSearchEngineOptions options = GetDefaultIcingOptions(); - options.set_max_tokens_per_doc(-1); - IcingSearchEngine icing(options, GetTestJniCache()); - EXPECT_THAT(icing.Initialize().status(), - ProtoStatusIs(StatusProto::INVALID_ARGUMENT)); -} - -TEST_F(IcingSearchEngineTest, ZeroMaxTokensPerDocSizeReturnsInvalidArgument) { - IcingSearchEngineOptions options = GetDefaultIcingOptions(); - options.set_max_tokens_per_doc(0); - IcingSearchEngine icing(options, GetTestJniCache()); - EXPECT_THAT(icing.Initialize().status(), - ProtoStatusIs(StatusProto::INVALID_ARGUMENT)); -} - -TEST_F(IcingSearchEngineTest, GoodMaxTokensPerDocSizeReturnsOk) { - IcingSearchEngineOptions options = GetDefaultIcingOptions(); - // INT_MAX is valid - it just means that we shouldn't limit the number of - // tokens per document. It would be pretty inconceivable that anyone would - // produce such a document - the text being indexed alone would take up at - // least ~4.3 GiB! - and the document would be rejected before indexing - // for exceeding max_document_size, but there's no reason to explicitly - // bar it. 
- options.set_max_tokens_per_doc(std::numeric_limits<int32_t>::max()); - IcingSearchEngine icing(options, GetTestJniCache()); - EXPECT_THAT(icing.Initialize().status(), ProtoIsOk()); -} - TEST_F(IcingSearchEngineTest, NegativeMaxTokenLenReturnsInvalidArgument) { IcingSearchEngineOptions options = GetDefaultIcingOptions(); options.set_max_token_length(-1); @@ -2198,7 +2168,7 @@ TEST_F(IcingSearchEngineTest, SearchReturnsValidResults) { search_spec.set_query("message"); ResultSpecProto result_spec; - result_spec.mutable_snippet_spec()->set_max_window_bytes(64); + result_spec.mutable_snippet_spec()->set_max_window_utf32_length(64); result_spec.mutable_snippet_spec()->set_num_matches_per_property(1); result_spec.mutable_snippet_spec()->set_num_to_snippet(1); @@ -2616,7 +2586,7 @@ TEST_F(IcingSearchEngineTest, ShouldReturnMultiplePagesWithSnippets) { ResultSpecProto result_spec; result_spec.set_num_per_page(2); - result_spec.mutable_snippet_spec()->set_max_window_bytes(64); + result_spec.mutable_snippet_spec()->set_max_window_utf32_length(64); result_spec.mutable_snippet_spec()->set_num_matches_per_property(1); result_spec.mutable_snippet_spec()->set_num_to_snippet(3); @@ -3523,6 +3493,105 @@ TEST_F(IcingSearchEngineTest, DeleteByQuery) { expected_search_result_proto)); } +TEST_F(IcingSearchEngineTest, DeleteByQueryReturnInfo) { + DocumentProto document1 = + DocumentBuilder() + .SetKey("namespace1", "uri1") + .SetSchema("Message") + .AddStringProperty("body", "message body1") + .SetCreationTimestampMs(kDefaultCreationTimestampMs) + .Build(); + DocumentProto document2 = + DocumentBuilder() + .SetKey("namespace2", "uri2") + .SetSchema("Message") + .AddStringProperty("body", "message body2") + .SetCreationTimestampMs(kDefaultCreationTimestampMs) + .Build(); + DocumentProto document3 = + DocumentBuilder() + .SetKey("namespace2", "uri3") + .SetSchema("Message") + .AddStringProperty("body", "message body3") + .SetCreationTimestampMs(kDefaultCreationTimestampMs) + .Build(); + 
+ auto fake_clock = std::make_unique<FakeClock>(); + fake_clock->SetTimerElapsedMilliseconds(7); + TestIcingSearchEngine icing(GetDefaultIcingOptions(), + std::make_unique<Filesystem>(), + std::make_unique<IcingFilesystem>(), + std::move(fake_clock), GetTestJniCache()); + ASSERT_THAT(icing.Initialize().status(), ProtoIsOk()); + ASSERT_THAT(icing.SetSchema(CreateMessageSchema()).status(), ProtoIsOk()); + ASSERT_THAT(icing.Put(document1).status(), ProtoIsOk()); + ASSERT_THAT(icing.Put(document2).status(), ProtoIsOk()); + ASSERT_THAT(icing.Put(document3).status(), ProtoIsOk()); + + GetResultProto expected_get_result_proto; + expected_get_result_proto.mutable_status()->set_code(StatusProto::OK); + *expected_get_result_proto.mutable_document() = document1; + EXPECT_THAT( + icing.Get("namespace1", "uri1", GetResultSpecProto::default_instance()), + EqualsProto(expected_get_result_proto)); + + *expected_get_result_proto.mutable_document() = document2; + EXPECT_THAT( + icing.Get("namespace2", "uri2", GetResultSpecProto::default_instance()), + EqualsProto(expected_get_result_proto)); + + *expected_get_result_proto.mutable_document() = document3; + EXPECT_THAT( + icing.Get("namespace2", "uri3", GetResultSpecProto::default_instance()), + EqualsProto(expected_get_result_proto)); + + // Delete all docs to test the information is correctly grouped. 
+ SearchSpecProto search_spec; + search_spec.set_query("message"); + search_spec.set_term_match_type(TermMatchType::EXACT_ONLY); + DeleteByQueryResultProto result_proto = + icing.DeleteByQuery(search_spec, true); + EXPECT_THAT(result_proto.status(), ProtoIsOk()); + DeleteByQueryStatsProto exp_stats; + exp_stats.set_latency_ms(7); + exp_stats.set_num_documents_deleted(3); + exp_stats.set_query_length(search_spec.query().length()); + exp_stats.set_num_terms(1); + exp_stats.set_num_namespaces_filtered(0); + exp_stats.set_num_schema_types_filtered(0); + exp_stats.set_parse_query_latency_ms(7); + exp_stats.set_document_removal_latency_ms(7); + EXPECT_THAT(result_proto.delete_by_query_stats(), EqualsProto(exp_stats)); + + // Check that DeleteByQuery can return information for deleted documents. + DeleteByQueryResultProto::DocumentGroupInfo info1, info2; + info1.set_namespace_("namespace1"); + info1.set_schema("Message"); + info1.add_uris("uri1"); + info2.set_namespace_("namespace2"); + info2.set_schema("Message"); + info2.add_uris("uri3"); + info2.add_uris("uri2"); + EXPECT_THAT(result_proto.deleted_documents(), + UnorderedElementsAre(EqualsProto(info1), EqualsProto(info2))); + + EXPECT_THAT( + icing.Get("namespace1", "uri1", GetResultSpecProto::default_instance()) + .status() + .code(), + Eq(StatusProto::NOT_FOUND)); + EXPECT_THAT( + icing.Get("namespace2", "uri2", GetResultSpecProto::default_instance()) + .status() + .code(), + Eq(StatusProto::NOT_FOUND)); + EXPECT_THAT( + icing.Get("namespace2", "uri3", GetResultSpecProto::default_instance()) + .status() + .code(), + Eq(StatusProto::NOT_FOUND)); +} + TEST_F(IcingSearchEngineTest, DeleteByQueryNotFound) { DocumentProto document1 = DocumentBuilder() @@ -6048,7 +6117,7 @@ TEST_F(IcingSearchEngineTest, SnippetNormalization) { search_spec.set_query("mdi Zürich"); ResultSpecProto result_spec; - result_spec.mutable_snippet_spec()->set_max_window_bytes(64); + 
result_spec.mutable_snippet_spec()->set_max_window_utf32_length(64); result_spec.mutable_snippet_spec()->set_num_matches_per_property(2); result_spec.mutable_snippet_spec()->set_num_to_snippet(2); @@ -6111,7 +6180,7 @@ TEST_F(IcingSearchEngineTest, SnippetNormalizationPrefix) { search_spec.set_query("md Zür"); ResultSpecProto result_spec; - result_spec.mutable_snippet_spec()->set_max_window_bytes(64); + result_spec.mutable_snippet_spec()->set_max_window_utf32_length(64); result_spec.mutable_snippet_spec()->set_num_matches_per_property(2); result_spec.mutable_snippet_spec()->set_num_to_snippet(2); @@ -6166,7 +6235,7 @@ TEST_F(IcingSearchEngineTest, SnippetSectionRestrict) { search_spec.set_query("body:Zür"); ResultSpecProto result_spec; - result_spec.mutable_snippet_spec()->set_max_window_bytes(64); + result_spec.mutable_snippet_spec()->set_max_window_utf32_length(64); result_spec.mutable_snippet_spec()->set_num_matches_per_property(10); result_spec.mutable_snippet_spec()->set_num_to_snippet(10); @@ -7694,7 +7763,7 @@ TEST_F(IcingSearchEngineTest, QueryStatsProtoTest) { ResultSpecProto result_spec; result_spec.set_num_per_page(2); - result_spec.mutable_snippet_spec()->set_max_window_bytes(64); + result_spec.mutable_snippet_spec()->set_max_window_utf32_length(64); result_spec.mutable_snippet_spec()->set_num_matches_per_property(1); result_spec.mutable_snippet_spec()->set_num_to_snippet(3); @@ -7905,7 +7974,7 @@ TEST_F(IcingSearchEngineTest, SnippetErrorTest) { ResultSpecProto result_spec; result_spec.mutable_snippet_spec()->set_num_to_snippet(2); result_spec.mutable_snippet_spec()->set_num_matches_per_property(3); - result_spec.mutable_snippet_spec()->set_max_window_bytes(4); + result_spec.mutable_snippet_spec()->set_max_window_utf32_length(4); SearchResultProto search_results = icing.Search(search_spec, scoring_spec, result_spec); @@ -8110,6 +8179,8 @@ TEST_F(IcingSearchEngineTest, SearchSuggestionsTest) { SuggestionSpecProto suggestion_spec; 
suggestion_spec.set_prefix("t"); suggestion_spec.set_num_to_return(10); + suggestion_spec.mutable_scoring_spec()->set_scoring_match_type( + TermMatchType::PREFIX); // Query all suggestions, and they will be ranked. SuggestionResponse response = icing.SearchSuggestions(suggestion_spec); @@ -8130,6 +8201,316 @@ TEST_F(IcingSearchEngineTest, SearchSuggestionsTest) { ASSERT_THAT(response.suggestions().at(2).query(), "termfour"); } +TEST_F(IcingSearchEngineTest, + SearchSuggestionsTest_ShouldReturnInOneNamespace) { + IcingSearchEngine icing(GetDefaultIcingOptions(), GetTestJniCache()); + ASSERT_THAT(icing.Initialize().status(), ProtoIsOk()); + ASSERT_THAT(icing.SetSchema(CreatePersonAndEmailSchema()).status(), + ProtoIsOk()); + + DocumentProto document1 = DocumentBuilder() + .SetKey("namespace1", "uri1") + .SetSchema("Email") + .SetCreationTimestampMs(10) + .AddStringProperty("subject", "foo fool") + .Build(); + DocumentProto document2 = DocumentBuilder() + .SetKey("namespace2", "uri2") + .SetSchema("Email") + .SetCreationTimestampMs(10) + .AddStringProperty("subject", "fool") + .Build(); + ASSERT_THAT(icing.Put(document1).status(), ProtoIsOk()); + ASSERT_THAT(icing.Put(document2).status(), ProtoIsOk()); + + SuggestionResponse::Suggestion suggestionFoo; + suggestionFoo.set_query("foo"); + SuggestionResponse::Suggestion suggestionFool; + suggestionFool.set_query("fool"); + + // namespace1 has 2 results. 
+ SuggestionSpecProto suggestion_spec; + suggestion_spec.set_prefix("f"); + suggestion_spec.add_namespace_filters("namespace1"); + suggestion_spec.set_num_to_return(10); + suggestion_spec.mutable_scoring_spec()->set_scoring_match_type( + TermMatchType::PREFIX); + + SuggestionResponse response = icing.SearchSuggestions(suggestion_spec); + ASSERT_THAT(response.status(), ProtoIsOk()); + ASSERT_THAT(response.suggestions(), + UnorderedElementsAre(EqualsProto(suggestionFoo), + EqualsProto(suggestionFool))); +} + +TEST_F(IcingSearchEngineTest, + SearchSuggestionsTest_ShouldReturnInMultipleNamespace) { + IcingSearchEngine icing(GetDefaultIcingOptions(), GetTestJniCache()); + ASSERT_THAT(icing.Initialize().status(), ProtoIsOk()); + ASSERT_THAT(icing.SetSchema(CreatePersonAndEmailSchema()).status(), + ProtoIsOk()); + + DocumentProto document1 = DocumentBuilder() + .SetKey("namespace1", "uri1") + .SetSchema("Email") + .SetCreationTimestampMs(10) + .AddStringProperty("subject", "fo") + .Build(); + DocumentProto document2 = DocumentBuilder() + .SetKey("namespace2", "uri2") + .SetSchema("Email") + .SetCreationTimestampMs(10) + .AddStringProperty("subject", "foo") + .Build(); + DocumentProto document3 = DocumentBuilder() + .SetKey("namespace3", "uri3") + .SetSchema("Email") + .SetCreationTimestampMs(10) + .AddStringProperty("subject", "fool") + .Build(); + ASSERT_THAT(icing.Put(document1).status(), ProtoIsOk()); + ASSERT_THAT(icing.Put(document2).status(), ProtoIsOk()); + ASSERT_THAT(icing.Put(document3).status(), ProtoIsOk()); + + SuggestionResponse::Suggestion suggestionFoo; + suggestionFoo.set_query("foo"); + SuggestionResponse::Suggestion suggestionFool; + suggestionFool.set_query("fool"); + + // namespace2 and namespace3 has 2 results. 
+ SuggestionSpecProto suggestion_spec; + suggestion_spec.set_prefix("f"); + suggestion_spec.add_namespace_filters("namespace2"); + suggestion_spec.add_namespace_filters("namespace3"); + suggestion_spec.set_num_to_return(10); + suggestion_spec.mutable_scoring_spec()->set_scoring_match_type( + TermMatchType::PREFIX); + + SuggestionResponse response = icing.SearchSuggestions(suggestion_spec); + ASSERT_THAT(response.status(), ProtoIsOk()); + ASSERT_THAT(response.suggestions(), + UnorderedElementsAre(EqualsProto(suggestionFoo), + EqualsProto(suggestionFool))); +} + +TEST_F(IcingSearchEngineTest, + SearchSuggestionsTest_OtherNamespaceDontContributeToHitCount) { + IcingSearchEngine icing(GetDefaultIcingOptions(), GetTestJniCache()); + ASSERT_THAT(icing.Initialize().status(), ProtoIsOk()); + ASSERT_THAT(icing.SetSchema(CreatePersonAndEmailSchema()).status(), + ProtoIsOk()); + + // Index 4 documents, + // namespace1 has 2 hit2 for term one + // namespace2 has 2 hit2 for term two and 1 hit for term one. 
+ DocumentProto document1 = DocumentBuilder() + .SetKey("namespace1", "uri1") + .SetSchema("Email") + .SetCreationTimestampMs(10) + .AddStringProperty("subject", "termone") + .Build(); + DocumentProto document2 = DocumentBuilder() + .SetKey("namespace1", "uri2") + .SetSchema("Email") + .SetCreationTimestampMs(10) + .AddStringProperty("subject", "termone") + .Build(); + DocumentProto document3 = DocumentBuilder() + .SetKey("namespace2", "uri2") + .SetSchema("Email") + .SetCreationTimestampMs(10) + .AddStringProperty("subject", "termone termtwo") + .Build(); + DocumentProto document4 = DocumentBuilder() + .SetKey("namespace2", "uri3") + .SetSchema("Email") + .SetCreationTimestampMs(10) + .AddStringProperty("subject", "termtwo") + .Build(); + ASSERT_THAT(icing.Put(document1).status(), ProtoIsOk()); + ASSERT_THAT(icing.Put(document2).status(), ProtoIsOk()); + ASSERT_THAT(icing.Put(document3).status(), ProtoIsOk()); + ASSERT_THAT(icing.Put(document4).status(), ProtoIsOk()); + + SuggestionResponse::Suggestion suggestionTermOne; + suggestionTermOne.set_query("termone"); + SuggestionResponse::Suggestion suggestionTermTwo; + suggestionTermTwo.set_query("termtwo"); + + // only search suggestion for namespace2. The correctly order should be + // {"termtwo", "termone"}. If we're not filtering out namespace1 when + // calculating our score, then it will be {"termone", "termtwo"}. 
+ SuggestionSpecProto suggestion_spec; + suggestion_spec.set_prefix("t"); + suggestion_spec.add_namespace_filters("namespace2"); + suggestion_spec.set_num_to_return(10); + suggestion_spec.mutable_scoring_spec()->set_scoring_match_type( + TermMatchType::PREFIX); + + SuggestionResponse response = icing.SearchSuggestions(suggestion_spec); + ASSERT_THAT(response.status(), ProtoIsOk()); + ASSERT_THAT(response.suggestions(), + ElementsAre(EqualsProto(suggestionTermTwo), + EqualsProto(suggestionTermOne))); +} + +TEST_F(IcingSearchEngineTest, SearchSuggestionsTest_DeletionTest) { + IcingSearchEngine icing(GetDefaultIcingOptions(), GetTestJniCache()); + ASSERT_THAT(icing.Initialize().status(), ProtoIsOk()); + ASSERT_THAT(icing.SetSchema(CreatePersonAndEmailSchema()).status(), + ProtoIsOk()); + + DocumentProto document1 = DocumentBuilder() + .SetKey("namespace1", "uri1") + .SetSchema("Email") + .SetCreationTimestampMs(10) + .AddStringProperty("subject", "fool") + .Build(); + DocumentProto document2 = DocumentBuilder() + .SetKey("namespace2", "uri2") + .SetSchema("Email") + .SetCreationTimestampMs(10) + .AddStringProperty("subject", "fool") + .Build(); + ASSERT_THAT(icing.Put(document1).status(), ProtoIsOk()); + ASSERT_THAT(icing.Put(document2).status(), ProtoIsOk()); + + SuggestionResponse::Suggestion suggestionFool; + suggestionFool.set_query("fool"); + + // namespace1 has this suggestion + SuggestionSpecProto suggestion_spec; + suggestion_spec.set_prefix("f"); + suggestion_spec.add_namespace_filters("namespace1"); + suggestion_spec.set_num_to_return(10); + suggestion_spec.mutable_scoring_spec()->set_scoring_match_type( + TermMatchType::PREFIX); + + SuggestionResponse response = icing.SearchSuggestions(suggestion_spec); + ASSERT_THAT(response.status(), ProtoIsOk()); + ASSERT_THAT(response.suggestions(), + UnorderedElementsAre(EqualsProto(suggestionFool))); + + // namespace2 has this suggestion + suggestion_spec.clear_namespace_filters(); + 
suggestion_spec.add_namespace_filters("namespace2"); + response = icing.SearchSuggestions(suggestion_spec); + ASSERT_THAT(response.status(), ProtoIsOk()); + ASSERT_THAT(response.suggestions(), + UnorderedElementsAre(EqualsProto(suggestionFool))); + + // delete document from namespace 1 + EXPECT_THAT(icing.Delete("namespace1", "uri1").status(), ProtoIsOk()); + + // Now namespace1 will return empty + suggestion_spec.clear_namespace_filters(); + suggestion_spec.add_namespace_filters("namespace1"); + response = icing.SearchSuggestions(suggestion_spec); + ASSERT_THAT(response.status(), ProtoIsOk()); + ASSERT_THAT(response.suggestions(), IsEmpty()); + + // namespace2 still has this suggestion, so we can prove the reason of + // namespace 1 cannot find it is we filter it out, not it doesn't exist. + suggestion_spec.add_namespace_filters("namespace2"); + response = icing.SearchSuggestions(suggestion_spec); + ASSERT_THAT(response.status(), ProtoIsOk()); + ASSERT_THAT(response.suggestions(), + UnorderedElementsAre(EqualsProto(suggestionFool))); +} + +TEST_F(IcingSearchEngineTest, SearchSuggestionsTest_ExpiredTest) { + DocumentProto document1 = DocumentBuilder() + .SetKey("namespace1", "uri1") + .SetSchema("Email") + .SetCreationTimestampMs(100) + .SetTtlMs(500) + .AddStringProperty("subject", "fool") + .Build(); + DocumentProto document2 = DocumentBuilder() + .SetKey("namespace2", "uri2") + .SetSchema("Email") + .SetCreationTimestampMs(100) + .SetTtlMs(1000) + .AddStringProperty("subject", "fool") + .Build(); + { + auto fake_clock = std::make_unique<FakeClock>(); + fake_clock->SetSystemTimeMilliseconds(400); + + TestIcingSearchEngine icing(GetDefaultIcingOptions(), + std::make_unique<Filesystem>(), + std::make_unique<IcingFilesystem>(), + std::move(fake_clock), GetTestJniCache()); + EXPECT_THAT(icing.Initialize().status(), ProtoIsOk()); + ASSERT_THAT(icing.SetSchema(CreatePersonAndEmailSchema()).status(), + ProtoIsOk()); + + ASSERT_THAT(icing.Put(document1).status(), 
ProtoIsOk()); + ASSERT_THAT(icing.Put(document2).status(), ProtoIsOk()); + + SuggestionResponse::Suggestion suggestionFool; + suggestionFool.set_query("fool"); + + // namespace1 has this suggestion + SuggestionSpecProto suggestion_spec; + suggestion_spec.set_prefix("f"); + suggestion_spec.add_namespace_filters("namespace1"); + suggestion_spec.set_num_to_return(10); + suggestion_spec.mutable_scoring_spec()->set_scoring_match_type( + TermMatchType::PREFIX); + + SuggestionResponse response = icing.SearchSuggestions(suggestion_spec); + ASSERT_THAT(response.status(), ProtoIsOk()); + ASSERT_THAT(response.suggestions(), + UnorderedElementsAre(EqualsProto(suggestionFool))); + + // namespace2 has this suggestion + suggestion_spec.clear_namespace_filters(); + suggestion_spec.add_namespace_filters("namespace2"); + response = icing.SearchSuggestions(suggestion_spec); + ASSERT_THAT(response.status(), ProtoIsOk()); + ASSERT_THAT(response.suggestions(), + UnorderedElementsAre(EqualsProto(suggestionFool))); + } + // We reinitialize here so we can feed in a fake clock this time + { + // Time needs to be past document1 creation time (100) + ttl (500) for it + // to count as "expired". document2 is not expired since its ttl is 1000. 
+ auto fake_clock = std::make_unique<FakeClock>(); + fake_clock->SetSystemTimeMilliseconds(800); + + TestIcingSearchEngine icing(GetDefaultIcingOptions(), + std::make_unique<Filesystem>(), + std::make_unique<IcingFilesystem>(), + std::move(fake_clock), GetTestJniCache()); + ASSERT_THAT(icing.Initialize().status(), ProtoIsOk()); + + SuggestionSpecProto suggestion_spec; + suggestion_spec.set_prefix("f"); + suggestion_spec.add_namespace_filters("namespace1"); + suggestion_spec.set_num_to_return(10); + suggestion_spec.mutable_scoring_spec()->set_scoring_match_type( + TermMatchType::PREFIX); + + // Now namespace1 will return empty + suggestion_spec.clear_namespace_filters(); + suggestion_spec.add_namespace_filters("namespace1"); + SuggestionResponse response = icing.SearchSuggestions(suggestion_spec); + ASSERT_THAT(response.status(), ProtoIsOk()); + ASSERT_THAT(response.suggestions(), IsEmpty()); + + // namespace2 still has this suggestion + SuggestionResponse::Suggestion suggestionFool; + suggestionFool.set_query("fool"); + + suggestion_spec.add_namespace_filters("namespace2"); + response = icing.SearchSuggestions(suggestion_spec); + ASSERT_THAT(response.status(), ProtoIsOk()); + ASSERT_THAT(response.suggestions(), + UnorderedElementsAre(EqualsProto(suggestionFool))); + } +} + TEST_F(IcingSearchEngineTest, SearchSuggestionsTest_emptyPrefix) { IcingSearchEngine icing(GetDefaultIcingOptions(), GetTestJniCache()); ASSERT_THAT(icing.Initialize().status(), ProtoIsOk()); @@ -8137,6 +8518,8 @@ TEST_F(IcingSearchEngineTest, SearchSuggestionsTest_emptyPrefix) { SuggestionSpecProto suggestion_spec; suggestion_spec.set_prefix(""); suggestion_spec.set_num_to_return(10); + suggestion_spec.mutable_scoring_spec()->set_scoring_match_type( + TermMatchType::PREFIX); ASSERT_THAT(icing.SearchSuggestions(suggestion_spec).status(), ProtoStatusIs(StatusProto::INVALID_ARGUMENT)); @@ -8149,6 +8532,8 @@ TEST_F(IcingSearchEngineTest, SearchSuggestionsTest_NonPositiveNumToReturn) { 
SuggestionSpecProto suggestion_spec; suggestion_spec.set_prefix("prefix"); suggestion_spec.set_num_to_return(0); + suggestion_spec.mutable_scoring_spec()->set_scoring_match_type( + TermMatchType::PREFIX); ASSERT_THAT(icing.SearchSuggestions(suggestion_spec).status(), ProtoStatusIs(StatusProto::INVALID_ARGUMENT)); @@ -8203,7 +8588,7 @@ TEST_F(IcingSearchEngineTest, MigrateToPortableFileBackedProtoLog) { EXPECT_THAT(init_result.initialize_stats().document_store_data_status(), Eq(InitializeStatsProto::NO_DATA_LOSS)); EXPECT_THAT(init_result.initialize_stats().document_store_recovery_cause(), - Eq(InitializeStatsProto::NONE)); + Eq(InitializeStatsProto::LEGACY_DOCUMENT_LOG_FORMAT)); EXPECT_THAT(init_result.initialize_stats().schema_store_recovery_cause(), Eq(InitializeStatsProto::NONE)); EXPECT_THAT(init_result.initialize_stats().index_restoration_cause(), diff --git a/icing/index/index-processor.cc b/icing/index/index-processor.cc index 1aae732..207c033 100644 --- a/icing/index/index-processor.cc +++ b/icing/index/index-processor.cc @@ -73,9 +73,23 @@ libtextclassifier3::Status IndexProcessor::IndexDocument( section.metadata.term_match_type, /*namespace_id=*/0); for (std::string_view token : section.token_sequence) { ++num_tokens; - std::string term = normalizer_.NormalizeTerm(token); - // Add this term to Hit buffer. - status = editor.BufferTerm(term.c_str()); + + switch (section.metadata.tokenizer) { + case StringIndexingConfig::TokenizerType::VERBATIM: + // data() is safe to use here because a token created from the + // VERBATIM tokenizer is the entire string value. The character at + // data() + token.length() is guaranteed to be a null char. 
+ status = editor.BufferTerm(token.data()); + break; + case StringIndexingConfig::TokenizerType::NONE: + ICING_LOG(WARNING) + << "Unexpected TokenizerType::NONE found when indexing document."; + [[fallthrough]]; + case StringIndexingConfig::TokenizerType::PLAIN: + std::string normalized_term = normalizer_.NormalizeTerm(token); + status = editor.BufferTerm(normalized_term.c_str()); + } + if (!status.ok()) { // We've encountered a failure. Bail out. We'll mark this doc as deleted // and signal a failure to the client. diff --git a/icing/index/index-processor.h b/icing/index/index-processor.h index c4b77b5..269e41c 100644 --- a/icing/index/index-processor.h +++ b/icing/index/index-processor.h @@ -69,8 +69,6 @@ class IndexProcessor { IndexProcessor(const Normalizer* normalizer, Index* index, const Clock* clock) : normalizer_(*normalizer), index_(index), clock_(*clock) {} - std::string NormalizeToken(const Token& token); - const Normalizer& normalizer_; Index* const index_; const Clock& clock_; diff --git a/icing/index/index-processor_benchmark.cc b/icing/index/index-processor_benchmark.cc index 6e072c7..1aad7d0 100644 --- a/icing/index/index-processor_benchmark.cc +++ b/icing/index/index-processor_benchmark.cc @@ -16,7 +16,6 @@ #include "gmock/gmock.h" #include "icing/document-builder.h" #include "icing/file/filesystem.h" -#include "icing/helpers/icu/icu-data-file-helper.h" #include "icing/index/index-processor.h" #include "icing/index/index.h" #include "icing/legacy/core/icing-string-util.h" @@ -24,6 +23,7 @@ #include "icing/schema/schema-util.h" #include "icing/schema/section-manager.h" #include "icing/testing/common-matchers.h" +#include "icing/testing/icu-data-file-helper.h" #include "icing/testing/test-data.h" #include "icing/testing/tmp-directory.h" #include "icing/tokenization/language-segmenter-factory.h" diff --git a/icing/index/index-processor_test.cc b/icing/index/index-processor_test.cc index 449bc3e..bd310de 100644 --- a/icing/index/index-processor_test.cc 
+++ b/icing/index/index-processor_test.cc @@ -30,7 +30,6 @@ #include "icing/absl_ports/str_join.h" #include "icing/document-builder.h" #include "icing/file/filesystem.h" -#include "icing/helpers/icu/icu-data-file-helper.h" #include "icing/index/hit/doc-hit-info.h" #include "icing/index/index.h" #include "icing/index/iterator/doc-hit-info-iterator.h" @@ -49,6 +48,7 @@ #include "icing/store/document-id.h" #include "icing/testing/common-matchers.h" #include "icing/testing/fake-clock.h" +#include "icing/testing/icu-data-file-helper.h" #include "icing/testing/random-string.h" #include "icing/testing/test-data.h" #include "icing/testing/tmp-directory.h" @@ -90,6 +90,8 @@ constexpr std::string_view kRepeatedProperty = "repeated"; constexpr std::string_view kSubProperty = "submessage"; constexpr std::string_view kNestedType = "NestedType"; constexpr std::string_view kNestedProperty = "nested"; +constexpr std::string_view kExactVerbatimProperty = "verbatimExact"; +constexpr std::string_view kPrefixedVerbatimProperty = "verbatimPrefixed"; constexpr DocumentId kDocumentId0 = 0; constexpr DocumentId kDocumentId1 = 1; @@ -98,6 +100,8 @@ constexpr SectionId kExactSectionId = 0; constexpr SectionId kPrefixedSectionId = 1; constexpr SectionId kRepeatedSectionId = 2; constexpr SectionId kNestedSectionId = 3; +constexpr SectionId kExactVerbatimSectionId = 4; +constexpr SectionId kPrefixedVerbatimSectionId = 5; using Cardinality = PropertyConfigProto::Cardinality; using DataType = PropertyConfigProto::DataType; @@ -106,21 +110,23 @@ using ::testing::Eq; using ::testing::IsEmpty; using ::testing::Test; -constexpr PropertyConfigProto_DataType_Code TYPE_STRING = - PropertyConfigProto_DataType_Code_STRING; -constexpr PropertyConfigProto_DataType_Code TYPE_BYTES = - PropertyConfigProto_DataType_Code_BYTES; +constexpr PropertyConfigProto::DataType::Code TYPE_STRING = + PropertyConfigProto::DataType::STRING; +constexpr PropertyConfigProto::DataType::Code TYPE_BYTES = + 
PropertyConfigProto::DataType::BYTES; -constexpr PropertyConfigProto_Cardinality_Code CARDINALITY_OPTIONAL = - PropertyConfigProto_Cardinality_Code_OPTIONAL; -constexpr PropertyConfigProto_Cardinality_Code CARDINALITY_REPEATED = - PropertyConfigProto_Cardinality_Code_REPEATED; +constexpr PropertyConfigProto::Cardinality::Code CARDINALITY_OPTIONAL = + PropertyConfigProto::Cardinality::OPTIONAL; +constexpr PropertyConfigProto::Cardinality::Code CARDINALITY_REPEATED = + PropertyConfigProto::Cardinality::REPEATED; -constexpr StringIndexingConfig_TokenizerType_Code TOKENIZER_PLAIN = - StringIndexingConfig_TokenizerType_Code_PLAIN; +constexpr StringIndexingConfig::TokenizerType::Code TOKENIZER_PLAIN = + StringIndexingConfig::TokenizerType::PLAIN; +constexpr StringIndexingConfig::TokenizerType::Code TOKENIZER_VERBATIM = + StringIndexingConfig::TokenizerType::VERBATIM; -constexpr TermMatchType_Code MATCH_EXACT = TermMatchType_Code_EXACT_ONLY; -constexpr TermMatchType_Code MATCH_PREFIX = TermMatchType_Code_PREFIX; +constexpr TermMatchType::Code MATCH_EXACT = TermMatchType::EXACT_ONLY; +constexpr TermMatchType::Code MATCH_PREFIX = TermMatchType::PREFIX; class IndexProcessorTest : public Test { protected: @@ -180,6 +186,16 @@ class IndexProcessorTest : public Test { .SetCardinality(CARDINALITY_REPEATED)) .AddProperty( PropertyConfigBuilder() + .SetName(kExactVerbatimProperty) + .SetDataTypeString(MATCH_EXACT, TOKENIZER_VERBATIM) + .SetCardinality(CARDINALITY_REPEATED)) + .AddProperty( + PropertyConfigBuilder() + .SetName(kPrefixedVerbatimProperty) + .SetDataTypeString(MATCH_PREFIX, TOKENIZER_VERBATIM) + .SetCardinality(CARDINALITY_REPEATED)) + .AddProperty( + PropertyConfigBuilder() .SetName(kSubProperty) .SetDataTypeDocument( kNestedType, /*index_nested_properties=*/true) @@ -797,6 +813,95 @@ TEST_F(IndexProcessorTest, IndexingDocMergeFailureResets) { EXPECT_THAT(index_->last_added_document_id(), Eq(doc_id)); } +TEST_F(IndexProcessorTest, ExactVerbatimProperty) { + 
DocumentProto document = + DocumentBuilder() + .SetKey("icing", "fake_type/1") + .SetSchema(std::string(kFakeType)) + .AddStringProperty(std::string(kExactVerbatimProperty), + "Hello, world!") + .Build(); + ICING_ASSERT_OK_AND_ASSIGN( + TokenizedDocument tokenized_document, + TokenizedDocument::Create(schema_store_.get(), lang_segmenter_.get(), + document)); + EXPECT_THAT(tokenized_document.num_tokens(), 1); + + EXPECT_THAT(index_processor_->IndexDocument(tokenized_document, kDocumentId0), + IsOk()); + EXPECT_THAT(index_->last_added_document_id(), Eq(kDocumentId0)); + + ICING_ASSERT_OK_AND_ASSIGN( + std::unique_ptr<DocHitInfoIterator> itr, + index_->GetIterator("Hello, world!", kSectionIdMaskAll, + TermMatchType::EXACT_ONLY)); + std::vector<DocHitInfo> hits = GetHits(std::move(itr)); + std::unordered_map<SectionId, Hit::TermFrequency> expectedMap{ + {kExactVerbatimSectionId, 1}}; + + EXPECT_THAT(hits, ElementsAre(EqualsDocHitInfoWithTermFrequency( + kDocumentId0, expectedMap))); +} + +TEST_F(IndexProcessorTest, PrefixVerbatimProperty) { + DocumentProto document = + DocumentBuilder() + .SetKey("icing", "fake_type/1") + .SetSchema(std::string(kFakeType)) + .AddStringProperty(std::string(kPrefixedVerbatimProperty), + "Hello, world!") + .Build(); + ICING_ASSERT_OK_AND_ASSIGN( + TokenizedDocument tokenized_document, + TokenizedDocument::Create(schema_store_.get(), lang_segmenter_.get(), + document)); + EXPECT_THAT(tokenized_document.num_tokens(), 1); + + EXPECT_THAT(index_processor_->IndexDocument(tokenized_document, kDocumentId0), + IsOk()); + EXPECT_THAT(index_->last_added_document_id(), Eq(kDocumentId0)); + + // We expect to match the document we indexed as "Hello, w" is a prefix + // of "Hello, world!" 
+ ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<DocHitInfoIterator> itr, + index_->GetIterator("Hello, w", kSectionIdMaskAll, + TermMatchType::PREFIX)); + std::vector<DocHitInfo> hits = GetHits(std::move(itr)); + std::unordered_map<SectionId, Hit::TermFrequency> expectedMap{ + {kPrefixedVerbatimSectionId, 1}}; + + EXPECT_THAT(hits, ElementsAre(EqualsDocHitInfoWithTermFrequency( + kDocumentId0, expectedMap))); +} + +TEST_F(IndexProcessorTest, VerbatimPropertyDoesntMatchSubToken) { + DocumentProto document = + DocumentBuilder() + .SetKey("icing", "fake_type/1") + .SetSchema(std::string(kFakeType)) + .AddStringProperty(std::string(kPrefixedVerbatimProperty), + "Hello, world!") + .Build(); + ICING_ASSERT_OK_AND_ASSIGN( + TokenizedDocument tokenized_document, + TokenizedDocument::Create(schema_store_.get(), lang_segmenter_.get(), + document)); + EXPECT_THAT(tokenized_document.num_tokens(), 1); + + EXPECT_THAT(index_processor_->IndexDocument(tokenized_document, kDocumentId0), + IsOk()); + EXPECT_THAT(index_->last_added_document_id(), Eq(kDocumentId0)); + + ICING_ASSERT_OK_AND_ASSIGN( + std::unique_ptr<DocHitInfoIterator> itr, + index_->GetIterator("world", kSectionIdMaskAll, TermMatchType::PREFIX)); + std::vector<DocHitInfo> hits = GetHits(std::move(itr)); + + // We should not have hits for term "world" as the index processor should + // create a sole token "Hello, world! for the document. + EXPECT_THAT(hits, IsEmpty()); +} + } // namespace } // namespace lib diff --git a/icing/index/index.cc b/icing/index/index.cc index 1bdab21..02ba699 100644 --- a/icing/index/index.cc +++ b/icing/index/index.cc @@ -71,24 +71,6 @@ IcingDynamicTrie::Options GetMainLexiconOptions() { return IcingDynamicTrie::Options(); } -// Helper function to check if a term is in the given namespaces. -// TODO(tjbarron): Implement a method PropertyReadersAll.HasAnyProperty(). 
-bool IsTermInNamespaces( - const IcingDynamicTrie::PropertyReadersAll& property_reader, - uint32_t value_index, const std::vector<NamespaceId>& namespace_ids) { - if (namespace_ids.empty()) { - return true; - } - for (NamespaceId namespace_id : namespace_ids) { - if (property_reader.HasProperty(GetNamespacePropertyId(namespace_id), - value_index)) { - return true; - } - } - - return false; -} - enum class MergeAction { kTakeLiteTerm, kTakeMainTerm, kMergeTerms }; // Merge the TermMetadata from lite index and main index. If the term exists in @@ -137,7 +119,7 @@ std::vector<TermMetadata> MergeAndRankTermMetadatas( int total_est_hit_count = lite_term_itr->hit_count + main_term_itr->hit_count; PushToTermHeap(TermMetadata(std::move(lite_term_itr->content), - total_est_hit_count), + total_est_hit_count), num_to_return, merged_term_metadata_heap); ++lite_term_itr; ++main_term_itr; @@ -228,32 +210,26 @@ Index::GetIterator(const std::string& term, SectionIdMask section_id_mask, libtextclassifier3::StatusOr<std::vector<TermMetadata>> Index::FindLiteTermsByPrefix(const std::string& prefix, - const std::vector<NamespaceId>& namespace_ids) { + const NamespaceChecker* namespace_checker) { // Finds all the terms that start with the given prefix in the lexicon. IcingDynamicTrie::Iterator term_iterator(lite_index_->lexicon(), prefix.c_str()); - // A property reader to help check if a term has some property. - IcingDynamicTrie::PropertyReadersAll property_reader(lite_index_->lexicon()); - std::vector<TermMetadata> term_metadata_list; while (term_iterator.IsValid()) { uint32_t term_value_index = term_iterator.GetValueIndex(); - // Skips the terms that don't exist in the given namespaces. We won't skip - // any terms if namespace_ids is empty. 
- if (!IsTermInNamespaces(property_reader, term_value_index, namespace_ids)) { - term_iterator.Advance(); - continue; - } - ICING_ASSIGN_OR_RETURN( uint32_t term_id, term_id_codec_->EncodeTvi(term_value_index, TviType::LITE), absl_ports::InternalError("Failed to access terms in lexicon.")); - - term_metadata_list.emplace_back(term_iterator.GetKey(), - lite_index_->CountHits(term_id)); + ICING_ASSIGN_OR_RETURN(int hit_count, + lite_index_->CountHits(term_id, namespace_checker)); + if (hit_count > 0) { + // There is at least one document in the given namespace has this term. + term_metadata_list.push_back( + TermMetadata(term_iterator.GetKey(), hit_count)); + } term_iterator.Advance(); } @@ -261,21 +237,20 @@ Index::FindLiteTermsByPrefix(const std::string& prefix, } libtextclassifier3::StatusOr<std::vector<TermMetadata>> -Index::FindTermsByPrefix(const std::string& prefix, - const std::vector<NamespaceId>& namespace_ids, - int num_to_return) { +Index::FindTermsByPrefix(const std::string& prefix, int num_to_return, + TermMatchType::Code term_match_type, + const NamespaceChecker* namespace_checker) { std::vector<TermMetadata> term_metadata_list; if (num_to_return <= 0) { return term_metadata_list; } - // Get results from the LiteIndex. ICING_ASSIGN_OR_RETURN(std::vector<TermMetadata> lite_term_metadata_list, - FindLiteTermsByPrefix(prefix, namespace_ids)); + FindLiteTermsByPrefix(prefix, namespace_checker)); // Append results from the MainIndex. 
ICING_ASSIGN_OR_RETURN(std::vector<TermMetadata> main_term_metadata_list, - main_index_->FindTermsByPrefix(prefix, namespace_ids)); - + main_index_->FindTermsByPrefix(prefix, term_match_type, + namespace_checker)); return MergeAndRankTermMetadatas(std::move(lite_term_metadata_list), std::move(main_term_metadata_list), num_to_return); @@ -284,11 +259,7 @@ Index::FindTermsByPrefix(const std::string& prefix, IndexStorageInfoProto Index::GetStorageInfo() const { IndexStorageInfoProto storage_info; int64_t directory_size = filesystem_->GetDiskUsage(options_.base_dir.c_str()); - if (directory_size != Filesystem::kBadFileSize) { - storage_info.set_index_size(directory_size); - } else { - storage_info.set_index_size(-1); - } + storage_info.set_index_size(Filesystem::SanitizeFileSize(directory_size)); storage_info = lite_index_->GetStorageInfo(std::move(storage_info)); return main_index_->GetStorageInfo(std::move(storage_info)); } diff --git a/icing/index/index.h b/icing/index/index.h index 693cf04..5c53349 100644 --- a/icing/index/index.h +++ b/icing/index/index.h @@ -32,10 +32,12 @@ #include "icing/index/term-id-codec.h" #include "icing/index/term-metadata.h" #include "icing/legacy/index/icing-filesystem.h" +#include "icing/proto/debug.pb.h" #include "icing/proto/storage.pb.h" #include "icing/proto/term.pb.h" #include "icing/schema/section.h" #include "icing/store/document-id.h" +#include "icing/store/namespace-checker.h" #include "icing/store/namespace-id.h" #include "icing/util/crc32.h" @@ -142,9 +144,14 @@ class Index { // index. // verbosity > 0, more detailed debug information including raw postings // lists. 
- void GetDebugInfo(int verbosity, std::string* out) const { - lite_index_->GetDebugInfo(verbosity, out); - main_index_->GetDebugInfo(verbosity, out); + IndexDebugInfoProto GetDebugInfo(int verbosity) const { + IndexDebugInfoProto debug_info; + *debug_info.mutable_index_storage_info() = GetStorageInfo(); + *debug_info.mutable_lite_index_info() = + lite_index_->GetDebugInfo(verbosity); + *debug_info.mutable_main_index_info() = + main_index_->GetDebugInfo(verbosity); + return debug_info; } // Returns the byte size of the all the elements held in the index. This @@ -181,17 +188,17 @@ class Index { TermMatchType::Code term_match_type); // Finds terms with the given prefix in the given namespaces. If - // 'namespace_ids' is empty, returns results from all the namespaces. The - // input prefix must be normalized, otherwise inaccurate results may be - // returned. Results are not sorted specifically and are in their original - // order. Number of results are no more than 'num_to_return'. + // 'namespace_ids' is empty, returns results from all the namespaces. Results + // are sorted in decreasing order of hit count. Number of results are no more + // than 'num_to_return'. // // Returns: // A list of TermMetadata on success // INTERNAL_ERROR if failed to access term data. libtextclassifier3::StatusOr<std::vector<TermMetadata>> FindTermsByPrefix( - const std::string& prefix, const std::vector<NamespaceId>& namespace_ids, - int num_to_return); + const std::string& prefix, int num_to_return, + TermMatchType::Code term_match_type, + const NamespaceChecker* namespace_checker); // A class that can be used to add hits to the index. 
// @@ -267,7 +274,7 @@ class Index { filesystem_(filesystem) {} libtextclassifier3::StatusOr<std::vector<TermMetadata>> FindLiteTermsByPrefix( - const std::string& prefix, const std::vector<NamespaceId>& namespace_ids); + const std::string& prefix, const NamespaceChecker* namespace_checker); std::unique_ptr<LiteIndex> lite_index_; std::unique_ptr<MainIndex> main_index_; diff --git a/icing/index/index_test.cc b/icing/index/index_test.cc index 00d5ad6..8355c01 100644 --- a/icing/index/index_test.cc +++ b/icing/index/index_test.cc @@ -31,10 +31,12 @@ #include "icing/index/iterator/doc-hit-info-iterator.h" #include "icing/legacy/index/icing-filesystem.h" #include "icing/legacy/index/icing-mock-filesystem.h" +#include "icing/proto/debug.pb.h" #include "icing/proto/storage.pb.h" #include "icing/proto/term.pb.h" #include "icing/schema/section.h" #include "icing/store/document-id.h" +#include "icing/testing/always-true-namespace-checker-impl.h" #include "icing/testing/common-matchers.h" #include "icing/testing/random-string.h" #include "icing/testing/tmp-directory.h" @@ -89,22 +91,9 @@ constexpr DocumentId kDocumentId5 = 5; constexpr DocumentId kDocumentId6 = 6; constexpr DocumentId kDocumentId7 = 7; constexpr DocumentId kDocumentId8 = 8; -constexpr DocumentId kDocumentId9 = 9; -constexpr DocumentId kDocumentId10 = 10; -constexpr DocumentId kDocumentId11 = 11; -constexpr DocumentId kDocumentId12 = 12; constexpr SectionId kSectionId2 = 2; constexpr SectionId kSectionId3 = 3; -// The value returned by IndexBlock::ApproximateFullPostingListHitsForBlock( -// GetBlockSize(), -// GetPostingListIndexBits(posting_list_utils::min_posting_list_size())); -constexpr int kMinSizePlApproxHits = 3; -// The value returned by IndexBlock::ApproximateFullPostingListHitsForBlock( -// GetBlockSize(), -// GetPostingListIndexBits(2 * posting_list_utils::min_posting_list_size())); -constexpr int kSecondSmallestPlApproxHits = 7; - std::vector<DocHitInfo> GetHits(std::unique_ptr<DocHitInfoIterator> 
iterator) { std::vector<DocHitInfo> infos; while (iterator->Advance().ok()) { @@ -920,148 +909,82 @@ TEST_F(IndexTest, InvalidHitBufferSize) { TEST_F(IndexTest, FindTermByPrefixShouldReturnEmpty) { Index::Editor edit = index_->Edit(kDocumentId0, kSectionId2, TermMatchType::PREFIX, /*namespace_id=*/0); + AlwaysTrueNamespaceCheckerImpl impl; EXPECT_THAT(edit.BufferTerm("fool"), IsOk()); EXPECT_THAT(edit.IndexAllBufferedTerms(), IsOk()); - EXPECT_THAT(index_->FindTermsByPrefix(/*prefix=*/"foo", /*namespace_ids=*/{0}, - /*num_to_return=*/0), + EXPECT_THAT(index_->FindTermsByPrefix(/*prefix=*/"foo", /*num_to_return=*/0, + TermMatchType::PREFIX, &impl), IsOkAndHolds(IsEmpty())); - EXPECT_THAT(index_->FindTermsByPrefix(/*prefix=*/"foo", /*namespace_ids=*/{0}, - /*num_to_return=*/-1), + EXPECT_THAT(index_->FindTermsByPrefix(/*prefix=*/"foo", + /*num_to_return=*/-1, + TermMatchType::PREFIX, &impl), IsOkAndHolds(IsEmpty())); ICING_ASSERT_OK(index_->Merge()); - EXPECT_THAT(index_->FindTermsByPrefix(/*prefix=*/"foo", /*namespace_ids=*/{0}, - /*num_to_return=*/0), + EXPECT_THAT(index_->FindTermsByPrefix(/*prefix=*/"foo", + /*num_to_return=*/0, + TermMatchType::PREFIX, &impl), IsOkAndHolds(IsEmpty())); - EXPECT_THAT(index_->FindTermsByPrefix(/*prefix=*/"foo", /*namespace_ids=*/{0}, - /*num_to_return=*/-1), + EXPECT_THAT(index_->FindTermsByPrefix(/*prefix=*/"foo", + /*num_to_return=*/-1, + TermMatchType::PREFIX, &impl), IsOkAndHolds(IsEmpty())); } TEST_F(IndexTest, FindTermByPrefixShouldReturnCorrectResult) { Index::Editor edit = index_->Edit( kDocumentId0, kSectionId2, TermMatchType::EXACT_ONLY, /*namespace_id=*/0); + AlwaysTrueNamespaceCheckerImpl impl; EXPECT_THAT(edit.BufferTerm("foo"), IsOk()); EXPECT_THAT(edit.BufferTerm("bar"), IsOk()); EXPECT_THAT(edit.IndexAllBufferedTerms(), IsOk()); // "b" should only match "bar" but not "foo". 
- EXPECT_THAT(index_->FindTermsByPrefix(/*prefix=*/"b", /*namespace_ids=*/{0}, - /*num_to_return=*/10), + EXPECT_THAT(index_->FindTermsByPrefix(/*prefix=*/"b", + /*num_to_return=*/10, + TermMatchType::PREFIX, &impl), IsOkAndHolds(UnorderedElementsAre(EqualsTermMetadata("bar", 1)))); ICING_ASSERT_OK(index_->Merge()); // "b" should only match "bar" but not "foo". - EXPECT_THAT(index_->FindTermsByPrefix(/*prefix=*/"b", /*namespace_ids=*/{0}, - /*num_to_return=*/10), - IsOkAndHolds(UnorderedElementsAre( - EqualsTermMetadata("bar", kMinSizePlApproxHits)))); + EXPECT_THAT(index_->FindTermsByPrefix(/*prefix=*/"b", + /*num_to_return=*/10, + TermMatchType::PREFIX, &impl), + IsOkAndHolds(UnorderedElementsAre(EqualsTermMetadata("bar", 1)))); } TEST_F(IndexTest, FindTermByPrefixShouldRespectNumToReturn) { Index::Editor edit = index_->Edit( kDocumentId0, kSectionId2, TermMatchType::EXACT_ONLY, /*namespace_id=*/0); + AlwaysTrueNamespaceCheckerImpl impl; EXPECT_THAT(edit.BufferTerm("fo"), IsOk()); EXPECT_THAT(edit.BufferTerm("foo"), IsOk()); EXPECT_THAT(edit.BufferTerm("fool"), IsOk()); EXPECT_THAT(edit.IndexAllBufferedTerms(), IsOk()); // We have 3 results but only 2 should be returned. - EXPECT_THAT(index_->FindTermsByPrefix(/*prefix=*/"f", /*namespace_ids=*/{0}, - /*num_to_return=*/2), + EXPECT_THAT(index_->FindTermsByPrefix(/*prefix=*/"f", + /*num_to_return=*/2, + TermMatchType::PREFIX, &impl), IsOkAndHolds(SizeIs(2))); ICING_ASSERT_OK(index_->Merge()); // We have 3 results but only 2 should be returned. 
- EXPECT_THAT(index_->FindTermsByPrefix(/*prefix=*/"f", /*namespace_ids=*/{0}, - /*num_to_return=*/2), + EXPECT_THAT(index_->FindTermsByPrefix(/*prefix=*/"f", + /*num_to_return=*/2, + TermMatchType::PREFIX, &impl), IsOkAndHolds(SizeIs(2))); } -TEST_F(IndexTest, FindTermByPrefixShouldReturnTermsInOneNamespace) { - Index::Editor edit1 = - index_->Edit(kDocumentId0, kSectionId2, TermMatchType::EXACT_ONLY, - /*namespace_id=*/0); - EXPECT_THAT(edit1.BufferTerm("fo"), IsOk()); - EXPECT_THAT(edit1.BufferTerm("foo"), IsOk()); - EXPECT_THAT(edit1.IndexAllBufferedTerms(), IsOk()); - - Index::Editor edit2 = - index_->Edit(kDocumentId1, kSectionId2, TermMatchType::EXACT_ONLY, - /*namespace_id=*/1); - EXPECT_THAT(edit2.BufferTerm("fool"), IsOk()); - EXPECT_THAT(edit2.IndexAllBufferedTerms(), IsOk()); - - // namespace with id 0 has 2 results. - EXPECT_THAT(index_->FindTermsByPrefix(/*prefix=*/"f", /*namespace_ids=*/{0}, - /*num_to_return=*/10), - IsOkAndHolds(UnorderedElementsAre(EqualsTermMetadata("fo", 1), - EqualsTermMetadata("foo", 1)))); - // namespace with id 1 has 1 result. - EXPECT_THAT( - index_->FindTermsByPrefix(/*prefix=*/"f", /*namespace_ids=*/{1}, - /*num_to_return=*/10), - IsOkAndHolds(UnorderedElementsAre(EqualsTermMetadata("fool", 1)))); - - ICING_ASSERT_OK(index_->Merge()); - - // namespace with id 0 has 2 results. - EXPECT_THAT(index_->FindTermsByPrefix(/*prefix=*/"f", /*namespace_ids=*/{0}, - /*num_to_return=*/10), - IsOkAndHolds(UnorderedElementsAre( - EqualsTermMetadata("fo", kMinSizePlApproxHits), - EqualsTermMetadata("foo", kMinSizePlApproxHits)))); - // namespace with id 1 has 1 result. 
- EXPECT_THAT(index_->FindTermsByPrefix(/*prefix=*/"f", /*namespace_ids=*/{1}, - /*num_to_return=*/10), - IsOkAndHolds(UnorderedElementsAre( - EqualsTermMetadata("fool", kMinSizePlApproxHits)))); -} - -TEST_F(IndexTest, FindTermByPrefixShouldReturnTermsInMultipleNamespaces) { - Index::Editor edit1 = - index_->Edit(kDocumentId0, kSectionId2, TermMatchType::EXACT_ONLY, - /*namespace_id=*/0); - EXPECT_THAT(edit1.BufferTerm("fo"), IsOk()); - EXPECT_THAT(edit1.IndexAllBufferedTerms(), IsOk()); - - Index::Editor edit2 = - index_->Edit(kDocumentId1, kSectionId2, TermMatchType::EXACT_ONLY, - /*namespace_id=*/1); - EXPECT_THAT(edit2.BufferTerm("foo"), IsOk()); - EXPECT_THAT(edit2.IndexAllBufferedTerms(), IsOk()); - - Index::Editor edit3 = - index_->Edit(kDocumentId2, kSectionId2, TermMatchType::EXACT_ONLY, - /*namespace_id=*/2); - EXPECT_THAT(edit3.BufferTerm("fool"), IsOk()); - EXPECT_THAT(edit3.IndexAllBufferedTerms(), IsOk()); - - // Should return "foo" and "fool" which are in namespaces with ids 1 and 2. 
- EXPECT_THAT( - index_->FindTermsByPrefix(/*prefix=*/"f", /*namespace_ids=*/{1, 2}, - /*num_to_return=*/10), - IsOkAndHolds(UnorderedElementsAre(EqualsTermMetadata("foo", 1), - EqualsTermMetadata("fool", 1)))); - - ICING_ASSERT_OK(index_->Merge()); - - EXPECT_THAT( - index_->FindTermsByPrefix(/*prefix=*/"f", /*namespace_ids=*/{1, 2}, - /*num_to_return=*/10), - IsOkAndHolds(UnorderedElementsAre( - EqualsTermMetadata("foo", kMinSizePlApproxHits), - EqualsTermMetadata("fool", kMinSizePlApproxHits)))); -} - TEST_F(IndexTest, FindTermByPrefixShouldReturnTermsInAllNamespaces) { Index::Editor edit1 = index_->Edit(kDocumentId0, kSectionId2, TermMatchType::EXACT_ONLY, /*namespace_id=*/0); + AlwaysTrueNamespaceCheckerImpl impl; EXPECT_THAT(edit1.BufferTerm("fo"), IsOk()); EXPECT_THAT(edit1.IndexAllBufferedTerms(), IsOk()); @@ -1078,8 +1001,9 @@ TEST_F(IndexTest, FindTermByPrefixShouldReturnTermsInAllNamespaces) { EXPECT_THAT(edit3.IndexAllBufferedTerms(), IsOk()); // Should return "fo", "foo" and "fool" across all namespaces. - EXPECT_THAT(index_->FindTermsByPrefix(/*prefix=*/"f", /*namespace_ids=*/{}, - /*num_to_return=*/10), + EXPECT_THAT(index_->FindTermsByPrefix(/*prefix=*/"f", + /*num_to_return=*/10, + TermMatchType::PREFIX, &impl), IsOkAndHolds(UnorderedElementsAre( EqualsTermMetadata("fo", 1), EqualsTermMetadata("foo", 1), EqualsTermMetadata("fool", 1)))); @@ -1087,18 +1011,19 @@ TEST_F(IndexTest, FindTermByPrefixShouldReturnTermsInAllNamespaces) { ICING_ASSERT_OK(index_->Merge()); // Should return "fo", "foo" and "fool" across all namespaces. 
- EXPECT_THAT(index_->FindTermsByPrefix(/*prefix=*/"f", /*namespace_ids=*/{}, - /*num_to_return=*/10), + EXPECT_THAT(index_->FindTermsByPrefix(/*prefix=*/"f", + /*num_to_return=*/10, + TermMatchType::PREFIX, &impl), IsOkAndHolds(UnorderedElementsAre( - EqualsTermMetadata("fo", kMinSizePlApproxHits), - EqualsTermMetadata("foo", kMinSizePlApproxHits), - EqualsTermMetadata("fool", kMinSizePlApproxHits)))); + EqualsTermMetadata("fo", 1), EqualsTermMetadata("foo", 1), + EqualsTermMetadata("fool", 1)))); } TEST_F(IndexTest, FindTermByPrefixShouldReturnCorrectHitCount) { Index::Editor edit1 = index_->Edit(kDocumentId0, kSectionId2, TermMatchType::EXACT_ONLY, /*namespace_id=*/0); + AlwaysTrueNamespaceCheckerImpl impl; EXPECT_THAT(edit1.BufferTerm("foo"), IsOk()); EXPECT_THAT(edit1.BufferTerm("fool"), IsOk()); EXPECT_THAT(edit1.IndexAllBufferedTerms(), IsOk()); @@ -1110,20 +1035,19 @@ TEST_F(IndexTest, FindTermByPrefixShouldReturnCorrectHitCount) { EXPECT_THAT(edit2.IndexAllBufferedTerms(), IsOk()); // 'foo' has 1 hit, 'fool' has 2 hits. - EXPECT_THAT(index_->FindTermsByPrefix(/*prefix=*/"f", /*namespace_ids=*/{0}, - /*num_to_return=*/10), + EXPECT_THAT(index_->FindTermsByPrefix(/*prefix=*/"f", + /*num_to_return=*/10, + TermMatchType::PREFIX, &impl), IsOkAndHolds(ElementsAre(EqualsTermMetadata("fool", 2), EqualsTermMetadata("foo", 1)))); ICING_ASSERT_OK(index_->Merge()); - // foo's one hit should fit on a min-sized pl, fool's two hits should also fit - // on a min-sized pl. 
- EXPECT_THAT(index_->FindTermsByPrefix(/*prefix=*/"f", /*namespace_ids=*/{0}, - /*num_to_return=*/10), - IsOkAndHolds(UnorderedElementsAre( - EqualsTermMetadata("foo", kMinSizePlApproxHits), - EqualsTermMetadata("fool", kMinSizePlApproxHits)))); + EXPECT_THAT(index_->FindTermsByPrefix(/*prefix=*/"f", + /*num_to_return=*/10, + TermMatchType::PREFIX, &impl), + IsOkAndHolds(ElementsAre(EqualsTermMetadata("fool", 2), + EqualsTermMetadata("foo", 1)))); } TEST_F(IndexTest, FindTermByPrefixShouldReturnInOrder) { @@ -1132,6 +1056,7 @@ TEST_F(IndexTest, FindTermByPrefixShouldReturnInOrder) { Index::Editor edit1 = index_->Edit(kDocumentId0, kSectionId2, TermMatchType::EXACT_ONLY, /*namespace_id=*/0); + AlwaysTrueNamespaceCheckerImpl impl; EXPECT_THAT(edit1.BufferTerm("term-one"), IsOk()); EXPECT_THAT(edit1.BufferTerm("term-two"), IsOk()); EXPECT_THAT(edit1.BufferTerm("term-three"), IsOk()); @@ -1181,8 +1106,9 @@ TEST_F(IndexTest, FindTermByPrefixShouldReturnInOrder) { EXPECT_THAT(edit6.IndexAllBufferedTerms(), IsOk()); // verify the order in lite index is correct. - EXPECT_THAT(index_->FindTermsByPrefix(/*prefix=*/"t", /*namespace_ids=*/{0}, - /*num_to_return=*/10), + EXPECT_THAT(index_->FindTermsByPrefix(/*prefix=*/"t", + /*num_to_return=*/10, + TermMatchType::PREFIX, &impl), IsOkAndHolds(ElementsAre(EqualsTermMetadata("term-six", 6), EqualsTermMetadata("term-five", 5), EqualsTermMetadata("term-four", 4), @@ -1192,93 +1118,97 @@ TEST_F(IndexTest, FindTermByPrefixShouldReturnInOrder) { ICING_ASSERT_OK(index_->Merge()); - // Since most of term has same approx hit count, we don't verify order in the - // main index. 
- EXPECT_THAT(index_->FindTermsByPrefix(/*prefix=*/"t", /*namespace_ids=*/{0}, - /*num_to_return=*/10), - IsOkAndHolds(UnorderedElementsAre( - EqualsTermMetadata("term-six", kSecondSmallestPlApproxHits), - EqualsTermMetadata("term-five", kSecondSmallestPlApproxHits), - EqualsTermMetadata("term-four", kMinSizePlApproxHits), - EqualsTermMetadata("term-three", kMinSizePlApproxHits), - EqualsTermMetadata("term-two", kMinSizePlApproxHits), - EqualsTermMetadata("term-one", kMinSizePlApproxHits)))); - - // keep push terms to the lite index. For term 1-4, since they has same hit - // count kMinSizePlApproxHits, we will push 4 term-one, 3 term-two, 2 - // term-three and one term-four to make them in reverse order. And for term - // 5 & 6, we will push 2 term-five and one term-six. + EXPECT_THAT(index_->FindTermsByPrefix(/*prefix=*/"t", + /*num_to_return=*/10, + TermMatchType::PREFIX, &impl), + IsOkAndHolds(ElementsAre(EqualsTermMetadata("term-six", 6), + EqualsTermMetadata("term-five", 5), + EqualsTermMetadata("term-four", 4), + EqualsTermMetadata("term-three", 3), + EqualsTermMetadata("term-two", 2), + EqualsTermMetadata("term-one", 1)))); + + // keep push terms to the lite index. We will add 2 document to term-five, + // term-three and term-one. The output order should be 5-6-3-4-1-2. 
Index::Editor edit7 = index_->Edit(kDocumentId7, kSectionId2, TermMatchType::EXACT_ONLY, /*namespace_id=*/0); EXPECT_THAT(edit7.BufferTerm("term-one"), IsOk()); - EXPECT_THAT(edit7.BufferTerm("term-two"), IsOk()); EXPECT_THAT(edit7.BufferTerm("term-three"), IsOk()); - EXPECT_THAT(edit7.BufferTerm("term-four"), IsOk()); + EXPECT_THAT(edit7.BufferTerm("term-five"), IsOk()); EXPECT_THAT(edit7.IndexAllBufferedTerms(), IsOk()); Index::Editor edit8 = index_->Edit(kDocumentId8, kSectionId2, TermMatchType::EXACT_ONLY, /*namespace_id=*/0); EXPECT_THAT(edit8.BufferTerm("term-one"), IsOk()); - EXPECT_THAT(edit8.BufferTerm("term-two"), IsOk()); EXPECT_THAT(edit8.BufferTerm("term-three"), IsOk()); + EXPECT_THAT(edit8.BufferTerm("term-five"), IsOk()); EXPECT_THAT(edit8.IndexAllBufferedTerms(), IsOk()); - Index::Editor edit9 = - index_->Edit(kDocumentId9, kSectionId2, TermMatchType::EXACT_ONLY, - /*namespace_id=*/0); - EXPECT_THAT(edit9.BufferTerm("term-one"), IsOk()); - EXPECT_THAT(edit9.BufferTerm("term-two"), IsOk()); - EXPECT_THAT(edit9.IndexAllBufferedTerms(), IsOk()); + // verify the combination of lite index and main index is in correct order. + EXPECT_THAT( + index_->FindTermsByPrefix(/*prefix=*/"t", /*num_to_return=*/10, + TermMatchType::PREFIX, &impl), + IsOkAndHolds(ElementsAre( + EqualsTermMetadata("term-five", 7), EqualsTermMetadata("term-six", 6), + EqualsTermMetadata("term-three", 5), + EqualsTermMetadata("term-four", 4), EqualsTermMetadata("term-one", 3), + EqualsTermMetadata("term-two", 2)))); - Index::Editor edit10 = - index_->Edit(kDocumentId10, kSectionId2, TermMatchType::EXACT_ONLY, - /*namespace_id=*/0); - EXPECT_THAT(edit10.BufferTerm("term-one"), IsOk()); - EXPECT_THAT(edit10.IndexAllBufferedTerms(), IsOk()); + // Get the first three terms. 
+ EXPECT_THAT(index_->FindTermsByPrefix(/*prefix=*/"t", + /*num_to_return=*/3, + TermMatchType::PREFIX, &impl), + IsOkAndHolds(ElementsAre(EqualsTermMetadata("term-five", 7), + EqualsTermMetadata("term-six", 6), + EqualsTermMetadata("term-three", 5)))); +} - Index::Editor edit11 = - index_->Edit(kDocumentId11, kSectionId2, TermMatchType::EXACT_ONLY, +TEST_F(IndexTest, FindTermByPrefix_InTermMatchTypePrefix_ShouldReturnInOrder) { + Index::Editor edit1 = + index_->Edit(kDocumentId0, kSectionId2, TermMatchType::PREFIX, /*namespace_id=*/0); - EXPECT_THAT(edit11.BufferTerm("term-five"), IsOk()); - EXPECT_THAT(edit11.BufferTerm("term-six"), IsOk()); - EXPECT_THAT(edit11.IndexAllBufferedTerms(), IsOk()); + AlwaysTrueNamespaceCheckerImpl impl; + EXPECT_THAT(edit1.BufferTerm("fo"), IsOk()); + EXPECT_THAT(edit1.IndexAllBufferedTerms(), IsOk()); - Index::Editor edit12 = - index_->Edit(kDocumentId12, kSectionId2, TermMatchType::EXACT_ONLY, + Index::Editor edit2 = + index_->Edit(kDocumentId2, kSectionId2, TermMatchType::PREFIX, /*namespace_id=*/0); - EXPECT_THAT(edit12.BufferTerm("term-five"), IsOk()); - EXPECT_THAT(edit12.IndexAllBufferedTerms(), IsOk()); + EXPECT_THAT(edit2.BufferTerm("foo"), IsOk()); + EXPECT_THAT(edit2.IndexAllBufferedTerms(), IsOk()); - // verify the combination of lite index and main index is in correct order. 
- EXPECT_THAT( - index_->FindTermsByPrefix(/*prefix=*/"t", /*namespace_ids=*/{0}, - /*num_to_return=*/10), - IsOkAndHolds(ElementsAre( - EqualsTermMetadata("term-five", - kSecondSmallestPlApproxHits + 2), // 9 - EqualsTermMetadata("term-six", kSecondSmallestPlApproxHits + 1), // 8 - EqualsTermMetadata("term-one", kMinSizePlApproxHits + 4), // 7 - EqualsTermMetadata("term-two", kMinSizePlApproxHits + 3), // 6 - EqualsTermMetadata("term-three", kMinSizePlApproxHits + 2), // 5 - EqualsTermMetadata("term-four", kMinSizePlApproxHits + 1)))); // 4 + Index::Editor edit3 = + index_->Edit(kDocumentId3, kSectionId2, TermMatchType::PREFIX, + /*namespace_id=*/0); + EXPECT_THAT(edit3.BufferTerm("fool"), IsOk()); + EXPECT_THAT(edit3.IndexAllBufferedTerms(), IsOk()); - // Get the first three terms. - EXPECT_THAT( - index_->FindTermsByPrefix(/*prefix=*/"t", /*namespace_ids=*/{0}, - /*num_to_return=*/3), - IsOkAndHolds(ElementsAre( - EqualsTermMetadata("term-five", - kSecondSmallestPlApproxHits + 2), // 9 - EqualsTermMetadata("term-six", kSecondSmallestPlApproxHits + 1), // 8 - EqualsTermMetadata("term-one", kMinSizePlApproxHits + 4)))); // 7 + ICING_ASSERT_OK(index_->Merge()); + // verify the order in pls is correct + // "fo" { {doc0, exact_hit}, {doc1, prefix_hit}, {doc2, prefix_hit} } + // "foo" { {doc1, exact_hit}, {doc2, prefix_hit} } + // "fool" { {doc2, exact_hit} } + EXPECT_THAT(index_->FindTermsByPrefix(/*prefix=*/"f", + /*num_to_return=*/10, + TermMatchType::PREFIX, &impl), + IsOkAndHolds(ElementsAre(EqualsTermMetadata("fo", 3), + EqualsTermMetadata("foo", 2), + EqualsTermMetadata("fool", 1)))); + // Find by exact only, all terms should be equally. 
+ EXPECT_THAT(index_->FindTermsByPrefix(/*prefix=*/"f", /*num_to_return=*/10, + TermMatchType::EXACT_ONLY, &impl), + IsOkAndHolds(UnorderedElementsAre( + EqualsTermMetadata("fo", 1), EqualsTermMetadata("foo", 1), + EqualsTermMetadata("fool", 1)))); } -TEST_F(IndexTest, FindTermByPrefixShouldReturnApproximateHitCountForMain) { +TEST_F(IndexTest, FindTermByPrefixShouldReturnHitCountForMain) { Index::Editor edit = index_->Edit(kDocumentId0, kSectionId2, TermMatchType::EXACT_ONLY, /*namespace_id=*/0); + AlwaysTrueNamespaceCheckerImpl impl; EXPECT_THAT(edit.BufferTerm("foo"), IsOk()); EXPECT_THAT(edit.BufferTerm("fool"), IsOk()); EXPECT_THAT(edit.IndexAllBufferedTerms(), IsOk()); @@ -1313,25 +1243,26 @@ TEST_F(IndexTest, FindTermByPrefixShouldReturnApproximateHitCountForMain) { EXPECT_THAT(edit.IndexAllBufferedTerms(), IsOk()); // 'foo' has 1 hit, 'fool' has 8 hits. - EXPECT_THAT(index_->FindTermsByPrefix(/*prefix=*/"f", /*namespace_ids=*/{0}, - /*num_to_return=*/10), + EXPECT_THAT(index_->FindTermsByPrefix(/*prefix=*/"f", + /*num_to_return=*/10, + TermMatchType::PREFIX, &impl), IsOkAndHolds(ElementsAre(EqualsTermMetadata("fool", 8), EqualsTermMetadata("foo", 1)))); ICING_ASSERT_OK(index_->Merge()); - // foo's hits should fit on a single pl. fool's hits will need two pls. 
- EXPECT_THAT(index_->FindTermsByPrefix(/*prefix=*/"f", /*namespace_ids=*/{0}, - /*num_to_return=*/10), - IsOkAndHolds(UnorderedElementsAre( - EqualsTermMetadata("foo", kMinSizePlApproxHits), - EqualsTermMetadata("fool", kSecondSmallestPlApproxHits)))); + EXPECT_THAT( + index_->FindTermsByPrefix(/*prefix=*/"f", /*num_to_return=*/10, + TermMatchType::PREFIX, &impl), + IsOkAndHolds(UnorderedElementsAre(EqualsTermMetadata("foo", 1), + EqualsTermMetadata("fool", 8)))); } TEST_F(IndexTest, FindTermByPrefixShouldReturnCombinedHitCount) { Index::Editor edit = index_->Edit(kDocumentId0, kSectionId2, TermMatchType::EXACT_ONLY, /*namespace_id=*/0); + AlwaysTrueNamespaceCheckerImpl impl; EXPECT_THAT(edit.BufferTerm("foo"), IsOk()); EXPECT_THAT(edit.BufferTerm("fool"), IsOk()); EXPECT_THAT(edit.IndexAllBufferedTerms(), IsOk()); @@ -1343,19 +1274,18 @@ TEST_F(IndexTest, FindTermByPrefixShouldReturnCombinedHitCount) { EXPECT_THAT(edit.BufferTerm("fool"), IsOk()); EXPECT_THAT(edit.IndexAllBufferedTerms(), IsOk()); - // 'foo' has 1 hit in the main index, 'fool' has 1 hit in the main index and - // 1 hit in the lite index. 
- EXPECT_THAT(index_->FindTermsByPrefix(/*prefix=*/"f", /*namespace_ids=*/{0}, - /*num_to_return=*/10), - IsOkAndHolds(ElementsAre( - EqualsTermMetadata("fool", kMinSizePlApproxHits + 1), - EqualsTermMetadata("foo", kMinSizePlApproxHits)))); + EXPECT_THAT(index_->FindTermsByPrefix(/*prefix=*/"f", /*num_to_return=*/10, + TermMatchType::PREFIX, &impl), + IsOkAndHolds(ElementsAre(EqualsTermMetadata("fool", 2), + EqualsTermMetadata("foo", 1)))); } TEST_F(IndexTest, FindTermByPrefixShouldReturnTermsFromBothIndices) { Index::Editor edit = index_->Edit(kDocumentId0, kSectionId2, TermMatchType::EXACT_ONLY, /*namespace_id=*/0); + AlwaysTrueNamespaceCheckerImpl impl; + EXPECT_THAT(edit.BufferTerm("foo"), IsOk()); EXPECT_THAT(edit.IndexAllBufferedTerms(), IsOk()); @@ -1368,10 +1298,10 @@ TEST_F(IndexTest, FindTermByPrefixShouldReturnTermsFromBothIndices) { // 'foo' has 1 hit in the main index, 'fool' has 1 hit in the lite index. EXPECT_THAT( - index_->FindTermsByPrefix(/*prefix=*/"f", /*namespace_ids=*/{0}, - /*num_to_return=*/10), - IsOkAndHolds(ElementsAre(EqualsTermMetadata("foo", kMinSizePlApproxHits), - EqualsTermMetadata("fool", 1)))); + index_->FindTermsByPrefix(/*prefix=*/"f", /*num_to_return=*/10, + TermMatchType::PREFIX, &impl), + IsOkAndHolds(UnorderedElementsAre(EqualsTermMetadata("foo", 1), + EqualsTermMetadata("fool", 1)))); } TEST_F(IndexTest, GetElementsSize) { @@ -1465,12 +1395,14 @@ TEST_F(IndexTest, GetDebugInfo) { EXPECT_THAT(edit.IndexAllBufferedTerms(), IsOk()); edit = index_->Edit(kDocumentId1, kSectionId3, TermMatchType::PREFIX, /*namespace_id=*/0); + index_->set_last_added_document_id(kDocumentId1); ASSERT_THAT(edit.BufferTerm("foot"), IsOk()); EXPECT_THAT(edit.IndexAllBufferedTerms(), IsOk()); ICING_ASSERT_OK(index_->Merge()); edit = index_->Edit(kDocumentId2, kSectionId2, TermMatchType::EXACT_ONLY, /*namespace_id=*/0); + index_->set_last_added_document_id(kDocumentId2); ASSERT_THAT(edit.BufferTerm("footer"), IsOk()); 
EXPECT_THAT(edit.IndexAllBufferedTerms(), IsOk()); edit = index_->Edit(kDocumentId2, kSectionId3, TermMatchType::PREFIX, @@ -1478,40 +1410,45 @@ TEST_F(IndexTest, GetDebugInfo) { ASSERT_THAT(edit.BufferTerm("foo"), IsOk()); EXPECT_THAT(edit.IndexAllBufferedTerms(), IsOk()); - std::string out0; - index_->GetDebugInfo(/*verbosity=*/0, &out0); - EXPECT_THAT(out0, Not(IsEmpty())); + IndexDebugInfoProto out0 = index_->GetDebugInfo(/*verbosity=*/0); + EXPECT_FALSE(out0.main_index_info().has_flash_index_storage_info()); + EXPECT_THAT(out0.main_index_info().last_added_document_id(), + Eq(kDocumentId1)); + EXPECT_THAT(out0.lite_index_info().curr_size(), Eq(2)); + EXPECT_THAT(out0.lite_index_info().last_added_document_id(), + Eq(kDocumentId2)); - std::string out1; - index_->GetDebugInfo(/*verbosity=*/1, &out1); - EXPECT_THAT(out1, SizeIs(Gt(out0.size()))); + IndexDebugInfoProto out1 = index_->GetDebugInfo(/*verbosity=*/1); + EXPECT_THAT(out1.main_index_info().flash_index_storage_info(), + Not(IsEmpty())); // Add one more doc to the lite index. Debug strings should change. edit = index_->Edit(kDocumentId3, kSectionId2, TermMatchType::EXACT_ONLY, /*namespace_id=*/0); + index_->set_last_added_document_id(kDocumentId3); ASSERT_THAT(edit.BufferTerm("far"), IsOk()); EXPECT_THAT(edit.IndexAllBufferedTerms(), IsOk()); - std::string out2; - index_->GetDebugInfo(/*verbosity=*/0, &out2); - EXPECT_THAT(out2, Ne(out0)); - - std::string out3; - index_->GetDebugInfo(/*verbosity=*/1, &out3); - EXPECT_THAT(out3, Ne(out1)); + IndexDebugInfoProto out2 = index_->GetDebugInfo(/*verbosity=*/0); + EXPECT_THAT(out2.lite_index_info().curr_size(), Eq(3)); + EXPECT_THAT(out2.lite_index_info().last_added_document_id(), + Eq(kDocumentId3)); // Merge into the man index. Debuug strings should change again. 
ICING_ASSERT_OK(index_->Merge()); - std::string out4; - index_->GetDebugInfo(/*verbosity=*/0, &out4); - EXPECT_THAT(out4, Ne(out0)); - EXPECT_THAT(out4, Ne(out2)); - - std::string out5; - index_->GetDebugInfo(/*verbosity=*/1, &out5); - EXPECT_THAT(out5, Ne(out1)); - EXPECT_THAT(out5, Ne(out3)); + IndexDebugInfoProto out3 = index_->GetDebugInfo(/*verbosity=*/0); + EXPECT_TRUE(out3.has_index_storage_info()); + EXPECT_THAT(out3.main_index_info().lexicon_info(), Not(IsEmpty())); + EXPECT_THAT(out3.main_index_info().last_added_document_id(), + Eq(kDocumentId3)); + EXPECT_THAT(out3.lite_index_info().curr_size(), Eq(0)); + EXPECT_THAT(out3.lite_index_info().hit_buffer_size(), Gt(0)); + EXPECT_THAT(out3.lite_index_info().last_added_document_id(), + Eq(kInvalidDocumentId)); + EXPECT_THAT(out3.lite_index_info().searchable_end(), Eq(0)); + EXPECT_THAT(out3.lite_index_info().index_crc(), Gt(0)); + EXPECT_THAT(out3.lite_index_info().lexicon_info(), Not(IsEmpty())); } TEST_F(IndexTest, BackfillingMultipleTermsSucceeds) { diff --git a/icing/index/iterator/doc-hit-info-iterator-section-restrict_test.cc b/icing/index/iterator/doc-hit-info-iterator-section-restrict_test.cc index 43a846b..7c6d924 100644 --- a/icing/index/iterator/doc-hit-info-iterator-section-restrict_test.cc +++ b/icing/index/iterator/doc-hit-info-iterator-section-restrict_test.cc @@ -48,13 +48,13 @@ using ::testing::ElementsAreArray; using ::testing::Eq; using ::testing::IsEmpty; -constexpr PropertyConfigProto_Cardinality_Code CARDINALITY_OPTIONAL = - PropertyConfigProto_Cardinality_Code_OPTIONAL; +constexpr PropertyConfigProto::Cardinality::Code CARDINALITY_OPTIONAL = + PropertyConfigProto::Cardinality::OPTIONAL; -constexpr StringIndexingConfig_TokenizerType_Code TOKENIZER_PLAIN = - StringIndexingConfig_TokenizerType_Code_PLAIN; +constexpr StringIndexingConfig::TokenizerType::Code TOKENIZER_PLAIN = + StringIndexingConfig::TokenizerType::PLAIN; -constexpr TermMatchType_Code MATCH_EXACT = 
TermMatchType_Code_EXACT_ONLY; +constexpr TermMatchType::Code MATCH_EXACT = TermMatchType::EXACT_ONLY; class DocHitInfoIteratorSectionRestrictTest : public ::testing::Test { protected: diff --git a/icing/index/lite/doc-hit-info-iterator-term-lite.cc b/icing/index/lite/doc-hit-info-iterator-term-lite.cc index 08df4fc..f215d63 100644 --- a/icing/index/lite/doc-hit-info-iterator-term-lite.cc +++ b/icing/index/lite/doc-hit-info-iterator-term-lite.cc @@ -77,7 +77,8 @@ libtextclassifier3::Status DocHitInfoIteratorTermLiteExact::RetrieveMoreHits() { ICING_ASSIGN_OR_RETURN(uint32_t term_id, term_id_codec_->EncodeTvi(tvi, TviType::LITE)); lite_index_->AppendHits(term_id, section_restrict_mask_, - /*only_from_prefix_sections=*/false, &cached_hits_); + /*only_from_prefix_sections=*/false, + /*namespace_checker=*/nullptr, &cached_hits_); cached_hits_idx_ = 0; return libtextclassifier3::Status::OK; } @@ -100,7 +101,7 @@ DocHitInfoIteratorTermLitePrefix::RetrieveMoreHits() { term_id_codec_->EncodeTvi(it.GetValueIndex(), TviType::LITE)); lite_index_->AppendHits(term_id, section_restrict_mask_, /*only_from_prefix_sections=*/!exact_match, - &cached_hits_); + /*namespace_checker=*/nullptr, &cached_hits_); ++terms_matched; } if (terms_matched > 1) { diff --git a/icing/index/lite/lite-index.cc b/icing/index/lite/lite-index.cc index 9e4ac28..a5c6baf 100644 --- a/icing/index/lite/lite-index.cc +++ b/icing/index/lite/lite-index.cc @@ -336,9 +336,12 @@ libtextclassifier3::StatusOr<uint32_t> LiteIndex::GetTermId( int LiteIndex::AppendHits(uint32_t term_id, SectionIdMask section_id_mask, bool only_from_prefix_sections, + const NamespaceChecker* namespace_checker, std::vector<DocHitInfo>* hits_out) { int count = 0; DocumentId last_document_id = kInvalidDocumentId; + // Record whether the last document belongs to the given namespaces. 
+ bool last_document_in_namespace = false; for (uint32_t idx = Seek(term_id); idx < header_->cur_size(); idx++) { TermIdHitPair term_id_hit_pair( hit_buffer_.array_cast<TermIdHitPair>()[idx]); @@ -355,22 +358,31 @@ int LiteIndex::AppendHits(uint32_t term_id, SectionIdMask section_id_mask, } DocumentId document_id = hit.document_id(); if (document_id != last_document_id) { + last_document_id = document_id; + last_document_in_namespace = + namespace_checker == nullptr || + namespace_checker->BelongsToTargetNamespaces(document_id); + if (!last_document_in_namespace) { + // The document is removed or expired or not belongs to target + // namespaces. + continue; + } ++count; if (hits_out != nullptr) { hits_out->push_back(DocHitInfo(document_id)); } - last_document_id = document_id; } - if (hits_out != nullptr) { + if (hits_out != nullptr && last_document_in_namespace) { hits_out->back().UpdateSection(hit.section_id(), hit.term_frequency()); } } return count; } -int LiteIndex::CountHits(uint32_t term_id) { +libtextclassifier3::StatusOr<int> LiteIndex::CountHits( + uint32_t term_id, const NamespaceChecker* namespace_checker) { return AppendHits(term_id, kSectionIdMaskAll, - /*only_from_prefix_sections=*/false, + /*only_from_prefix_sections=*/false, namespace_checker, /*hits_out=*/nullptr); } @@ -379,15 +391,16 @@ bool LiteIndex::is_full() const { lexicon_.min_free_fraction() < (1.0 - kTrieFullFraction)); } -void LiteIndex::GetDebugInfo(int verbosity, std::string* out) const { - absl_ports::StrAppend( - out, IcingStringUtil::StringPrintf("Lite Index\nHit buffer %u/%u\n", - header_->cur_size(), - options_.hit_buffer_size)); - - // Lexicon. 
- out->append("Lexicon stats:\n"); - lexicon_.GetDebugInfo(verbosity, out); +IndexDebugInfoProto::LiteIndexDebugInfoProto LiteIndex::GetDebugInfo( + int verbosity) { + IndexDebugInfoProto::LiteIndexDebugInfoProto res; + res.set_curr_size(header_->cur_size()); + res.set_hit_buffer_size(options_.hit_buffer_size); + res.set_last_added_document_id(header_->last_added_docid()); + res.set_searchable_end(header_->searchable_end()); + res.set_index_crc(ComputeChecksum().Get()); + lexicon_.GetDebugInfo(verbosity, res.mutable_lexicon_info()); + return res; } libtextclassifier3::StatusOr<int64_t> LiteIndex::GetElementsSize() const { @@ -408,12 +421,8 @@ IndexStorageInfoProto LiteIndex::GetStorageInfo( IndexStorageInfoProto storage_info) const { int64_t header_and_hit_buffer_file_size = filesystem_->GetFileSize(hit_buffer_fd_.get()); - if (header_and_hit_buffer_file_size != Filesystem::kBadFileSize) { - storage_info.set_lite_index_hit_buffer_size( - header_and_hit_buffer_file_size); - } else { - storage_info.set_lite_index_hit_buffer_size(-1); - } + storage_info.set_lite_index_hit_buffer_size( + IcingFilesystem::SanitizeFileSize(header_and_hit_buffer_file_size)); int64_t lexicon_disk_usage = lexicon_.GetElementsSize(); if (lexicon_disk_usage != Filesystem::kBadFileSize) { storage_info.set_lite_index_lexicon_size(lexicon_disk_usage); diff --git a/icing/index/lite/lite-index.h b/icing/index/lite/lite-index.h index b134aba..378fc94 100644 --- a/icing/index/lite/lite-index.h +++ b/icing/index/lite/lite-index.h @@ -37,10 +37,12 @@ #include "icing/legacy/index/icing-lite-index-header.h" #include "icing/legacy/index/icing-lite-index-options.h" #include "icing/legacy/index/icing-mmapper.h" +#include "icing/proto/debug.pb.h" #include "icing/proto/storage.pb.h" #include "icing/proto/term.pb.h" #include "icing/schema/section.h" #include "icing/store/document-id.h" +#include "icing/store/namespace-checker.h" #include "icing/store/namespace-id.h" #include "icing/util/bit-util.h" #include 
"icing/util/crc32.h" @@ -140,13 +142,19 @@ class LiteIndex { // skipping hits in non-prefix sections if only_from_prefix_sections is true, // to hits_out. If hits_out is nullptr, no hits will be added. // + // Only those hits which belongs to the given namespaces will be counted and + // appended. A nullptr namespace checker will disable this check. + // // Returns the number of hits that would be added to hits_out. int AppendHits(uint32_t term_id, SectionIdMask section_id_mask, bool only_from_prefix_sections, + const NamespaceChecker* namespace_checker, std::vector<DocHitInfo>* hits_out); // Returns the hit count of the term. - int CountHits(uint32_t term_id); + // Only those hits which belongs to the given namespaces will be counted. + libtextclassifier3::StatusOr<int> CountHits( + uint32_t term_id, const NamespaceChecker* namespace_checker); // Check if buffer has reached its capacity. bool is_full() const; @@ -234,7 +242,7 @@ class LiteIndex { // Returns debug information for the index in out. // verbosity <= 0, simplest debug information - size of lexicon, hit buffer // verbosity > 0, more detailed debug information from the lexicon. - void GetDebugInfo(int verbosity, std::string* out) const; + IndexDebugInfoProto::LiteIndexDebugInfoProto GetDebugInfo(int verbosity); // Returns the byte size of all the elements held in the index. This excludes // the size of any internal metadata of the index, e.g. the index's header. diff --git a/icing/index/lite/lite-index_test.cc b/icing/index/lite/lite-index_test.cc new file mode 100644 index 0000000..825f830 --- /dev/null +++ b/icing/index/lite/lite-index_test.cc @@ -0,0 +1,110 @@ +// Copyright (C) 2019 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "icing/index/lite/lite-index.h" + +#include <vector> + +#include "gmock/gmock.h" +#include "gtest/gtest.h" +#include "icing/index/term-id-codec.h" +#include "icing/legacy/index/icing-mock-filesystem.h" +#include "icing/schema/section.h" +#include "icing/store/namespace-checker.h" +#include "icing/testing/common-matchers.h" +#include "icing/testing/tmp-directory.h" + +namespace icing { +namespace lib { + +namespace { + +using ::testing::Eq; +using ::testing::IsEmpty; +using ::testing::SizeIs; + +class AlwaysFalseNamespaceCheckerImpl : public NamespaceChecker { + public: + bool BelongsToTargetNamespaces(DocumentId document_id) const override { + return false; + } +}; + +class LiteIndexTest : public testing::Test { + protected: + void SetUp() override { + index_dir_ = GetTestTempDir() + "/test_dir"; + ASSERT_TRUE(filesystem_.CreateDirectoryRecursively(index_dir_.c_str())); + + std::string lite_index_file_name = index_dir_ + "/test_file.lite-idx.index"; + LiteIndex::Options options(lite_index_file_name, + /*hit_buffer_want_merge_bytes=*/1024 * 1024); + ICING_ASSERT_OK_AND_ASSIGN(lite_index_, + LiteIndex::Create(options, &icing_filesystem_)); + + ICING_ASSERT_OK_AND_ASSIGN( + term_id_codec_, + TermIdCodec::Create( + IcingDynamicTrie::max_value_index(IcingDynamicTrie::Options()), + IcingDynamicTrie::max_value_index(options.lexicon_options))); + } + + void TearDown() override { + ASSERT_TRUE(filesystem_.DeleteDirectoryRecursively(index_dir_.c_str())); + } + + std::string index_dir_; + Filesystem filesystem_; + IcingFilesystem 
icing_filesystem_; + std::unique_ptr<LiteIndex> lite_index_; + std::unique_ptr<TermIdCodec> term_id_codec_; +}; + +constexpr NamespaceId kNamespace0 = 0; + +TEST_F(LiteIndexTest, LiteIndexAppendHits) { + ICING_ASSERT_OK_AND_ASSIGN( + uint32_t tvi, + lite_index_->InsertTerm("foo", TermMatchType::PREFIX, kNamespace0)); + ICING_ASSERT_OK_AND_ASSIGN(uint32_t foo_term_id, + term_id_codec_->EncodeTvi(tvi, TviType::LITE)); + Hit doc_hit0(/*section_id=*/0, /*document_id=*/0, Hit::kDefaultTermFrequency, + /*is_in_prefix_section=*/false); + Hit doc_hit1(/*section_id=*/1, /*document_id=*/0, Hit::kDefaultTermFrequency, + /*is_in_prefix_section=*/false); + ICING_ASSERT_OK(lite_index_->AddHit(foo_term_id, doc_hit0)); + ICING_ASSERT_OK(lite_index_->AddHit(foo_term_id, doc_hit1)); + + std::vector<DocHitInfo> hits1; + lite_index_->AppendHits(foo_term_id, kSectionIdMaskAll, + /*only_from_prefix_sections=*/false, + /*namespace_checker=*/nullptr, &hits1); + EXPECT_THAT(hits1, SizeIs(1)); + EXPECT_THAT(hits1.back().document_id(), Eq(0)); + // Check that the hits are coming from section 0 and section 1. + EXPECT_THAT(hits1.back().hit_section_ids_mask(), Eq(0b11)); + + std::vector<DocHitInfo> hits2; + AlwaysFalseNamespaceCheckerImpl always_false_namespace_checker; + lite_index_->AppendHits(foo_term_id, kSectionIdMaskAll, + /*only_from_prefix_sections=*/false, + &always_false_namespace_checker, &hits2); + // Check that no hits are returned because they get skipped by the namespace + // checker. + EXPECT_THAT(hits2, IsEmpty()); +} + +} // namespace +} // namespace lib +} // namespace icing diff --git a/icing/index/main/flash-index-storage.h b/icing/index/main/flash-index-storage.h index 8d5b50b..6c6fbb8 100644 --- a/icing/index/main/flash-index-storage.h +++ b/icing/index/main/flash-index-storage.h @@ -159,6 +159,7 @@ class FlashIndexStorage { libtextclassifier3::Status Reset(); + // TODO(b/222349894) Convert the string output to a protocol buffer instead. 
void GetDebugInfo(int verbosity, std::string* out) const; private: diff --git a/icing/index/main/main-index.cc b/icing/index/main/main-index.cc index b185138..2d6007b 100644 --- a/icing/index/main/main-index.cc +++ b/icing/index/main/main-index.cc @@ -133,18 +133,10 @@ libtextclassifier3::StatusOr<int64_t> MainIndex::GetElementsSize() const { IndexStorageInfoProto MainIndex::GetStorageInfo( IndexStorageInfoProto storage_info) const { - int64_t lexicon_elt_size = main_lexicon_->GetElementsSize(); - if (lexicon_elt_size != IcingFilesystem::kBadFileSize) { - storage_info.set_main_index_lexicon_size(lexicon_elt_size); - } else { - storage_info.set_main_index_lexicon_size(-1); - } - int64_t index_elt_size = flash_index_storage_->GetElementsSize(); - if (lexicon_elt_size != IcingFilesystem::kBadFileSize) { - storage_info.set_main_index_storage_size(index_elt_size); - } else { - storage_info.set_main_index_storage_size(-1); - } + storage_info.set_main_index_lexicon_size( + IcingFilesystem::SanitizeFileSize(main_lexicon_->GetElementsSize())); + storage_info.set_main_index_storage_size( + Filesystem::SanitizeFileSize(flash_index_storage_->GetElementsSize())); storage_info.set_main_index_block_size(flash_index_storage_->block_size()); storage_info.set_num_blocks(flash_index_storage_->num_blocks()); storage_info.set_min_free_fraction(flash_index_storage_->min_free_fraction()); @@ -186,7 +178,7 @@ MainIndex::GetAccessorForPrefixTerm(const std::string& prefix) { if (!exact && !hits_in_prefix_section.HasProperty(main_itr.GetValueIndex())) { // Found it, but it doesn't have prefix hits. Exit early. No need to // retrieve the posting list because there's nothing there for us. 
- return libtextclassifier3::Status::OK; + return absl_ports::NotFoundError("The term doesn't have any prefix hits."); } PostingListIdentifier posting_list_id = PostingListIdentifier::kInvalid; memcpy(&posting_list_id, main_itr.GetValue(), sizeof(posting_list_id)); @@ -217,35 +209,45 @@ bool IsTermInNamespaces( libtextclassifier3::StatusOr<std::vector<TermMetadata>> MainIndex::FindTermsByPrefix(const std::string& prefix, - const std::vector<NamespaceId>& namespace_ids) { + TermMatchType::Code term_match_type, + const NamespaceChecker* namespace_checker) { // Finds all the terms that start with the given prefix in the lexicon. IcingDynamicTrie::Iterator term_iterator(*main_lexicon_, prefix.c_str()); - // A property reader to help check if a term has some property. - IcingDynamicTrie::PropertyReadersAll property_reader(*main_lexicon_); - std::vector<TermMetadata> term_metadata_list; while (term_iterator.IsValid()) { - uint32_t term_value_index = term_iterator.GetValueIndex(); + int count = 0; + DocumentId last_document_id = kInvalidDocumentId; - // Skips the terms that don't exist in the given namespaces. We won't skip - // any terms if namespace_ids is empty. - if (!IsTermInNamespaces(property_reader, term_value_index, namespace_ids)) { - term_iterator.Advance(); - continue; - } PostingListIdentifier posting_list_id = PostingListIdentifier::kInvalid; memcpy(&posting_list_id, term_iterator.GetValue(), sizeof(posting_list_id)); - // Getting the actual hit count would require reading the entire posting - // list chain. We take an approximation to avoid all of those IO ops. - // Because we are not reading the posting lists, it is impossible to - // differentiate between single max-size posting lists and chains of - // max-size posting lists. We assume that the impact on scoring is not - // significant. 
- int approx_hit_count = IndexBlock::ApproximateFullPostingListHitsForBlock( - flash_index_storage_->block_size(), - posting_list_id.posting_list_index_bits()); - term_metadata_list.emplace_back(term_iterator.GetKey(), approx_hit_count); + ICING_ASSIGN_OR_RETURN(PostingListAccessor pl_accessor, + PostingListAccessor::CreateFromExisting( + flash_index_storage_.get(), posting_list_id)); + ICING_ASSIGN_OR_RETURN(std::vector<Hit> hits, + pl_accessor.GetNextHitsBatch()); + for (const Hit& hit : hits) { + DocumentId document_id = hit.document_id(); + if (document_id != last_document_id) { + last_document_id = document_id; + if (term_match_type == TermMatchType::EXACT_ONLY && + hit.is_prefix_hit()) { + continue; + } + if (!namespace_checker->BelongsToTargetNamespaces(document_id)) { + // The document is removed or expired or not belongs to target + // namespaces. + continue; + } + // TODO(b/152934343) Add search type in SuggestionSpec to ask user to + // input search type, prefix or exact. And make different score strategy + // base on that. + ++count; + } + } + if (count > 0) { + term_metadata_list.push_back(TermMetadata(term_iterator.GetKey(), count)); + } term_iterator.Advance(); } @@ -605,16 +607,22 @@ libtextclassifier3::Status MainIndex::AddPrefixBackfillHits( return libtextclassifier3::Status::OK; } -void MainIndex::GetDebugInfo(int verbosity, std::string* out) const { +IndexDebugInfoProto::MainIndexDebugInfoProto MainIndex::GetDebugInfo( + int verbosity) const { + IndexDebugInfoProto::MainIndexDebugInfoProto res; + // Lexicon. 
- out->append("Main Lexicon stats:\n"); - main_lexicon_->GetDebugInfo(verbosity, out); + main_lexicon_->GetDebugInfo(verbosity, res.mutable_lexicon_info()); + + res.set_last_added_document_id(last_added_document_id()); if (verbosity <= 0) { - return; + return res; } - flash_index_storage_->GetDebugInfo(verbosity, out); + flash_index_storage_->GetDebugInfo(verbosity, + res.mutable_flash_index_storage_info()); + return res; } } // namespace lib diff --git a/icing/index/main/main-index.h b/icing/index/main/main-index.h index 919a5c5..abb0418 100644 --- a/icing/index/main/main-index.h +++ b/icing/index/main/main-index.h @@ -27,7 +27,9 @@ #include "icing/index/term-metadata.h" #include "icing/legacy/index/icing-dynamic-trie.h" #include "icing/legacy/index/icing-filesystem.h" +#include "icing/proto/debug.pb.h" #include "icing/proto/storage.pb.h" +#include "icing/store/namespace-checker.h" #include "icing/store/namespace-id.h" #include "icing/util/status-macros.h" @@ -71,17 +73,17 @@ class MainIndex { // Finds terms with the given prefix in the given namespaces. If // 'namespace_ids' is empty, returns results from all the namespaces. The // input prefix must be normalized, otherwise inaccurate results may be - // returned. Results are not sorted specifically and are in lexigraphical - // order. Number of results are no more than 'num_to_return'. - // - // The hit count returned with each TermMetadata is an approximation based of - // posting list size. + // returned. If term_match_type is EXACT, only exact hit will be counted and + // it is PREFIX, both prefix and exact hits will be counted. Results are not + // sorted specifically and are in lexigraphical order. Number of results are + // no more than 'num_to_return'. // // Returns: // A list of TermMetadata on success // INTERNAL_ERROR if failed to access term data. 
libtextclassifier3::StatusOr<std::vector<TermMetadata>> FindTermsByPrefix( - const std::string& prefix, const std::vector<NamespaceId>& namespace_ids); + const std::string& prefix, TermMatchType::Code term_match_type, + const NamespaceChecker* namespace_checker); struct LexiconMergeOutputs { // Maps from main_lexicon tvi for new branching point to the main_lexicon @@ -184,7 +186,8 @@ class MainIndex { // verbosity <= 0, simplest debug information - just the lexicon // verbosity > 0, more detailed debug information including raw postings // lists. - void GetDebugInfo(int verbosity, std::string* out) const; + IndexDebugInfoProto::MainIndexDebugInfoProto GetDebugInfo( + int verbosity) const; private: libtextclassifier3::Status Init(const std::string& index_directory, diff --git a/icing/index/main/main-index_test.cc b/icing/index/main/main-index_test.cc index 74139be..fa83d68 100644 --- a/icing/index/main/main-index_test.cc +++ b/icing/index/main/main-index_test.cc @@ -162,6 +162,34 @@ TEST_F(MainIndexTest, MainIndexGetAccessorForPrefixReturnsValidAccessor) { EXPECT_THAT(main_index->GetAccessorForPrefixTerm("foo"), IsOk()); } +TEST_F(MainIndexTest, MainIndexGetAccessorForPrefixReturnsNotFound) { + // 1. Index one doc in the Lite Index: + // - Doc0 {"foot" is_in_prefix_section=false} + ICING_ASSERT_OK_AND_ASSIGN( + uint32_t tvi, + lite_index_->InsertTerm("foot", TermMatchType::EXACT_ONLY, kNamespace0)); + ICING_ASSERT_OK_AND_ASSIGN(uint32_t foot_term_id, + term_id_codec_->EncodeTvi(tvi, TviType::LITE)); + + Hit doc0_hit(/*section_id=*/0, /*document_id=*/0, Hit::kDefaultTermFrequency, + /*is_in_prefix_section=*/false); + ICING_ASSERT_OK(lite_index_->AddHit(foot_term_id, doc0_hit)); + + // 2. Create the main index. It should have no entries in its lexicon. 
+ std::string main_index_file_name = index_dir_ + "/test_file.idx.index"; + ICING_ASSERT_OK_AND_ASSIGN( + std::unique_ptr<MainIndex> main_index, + MainIndex::Create(main_index_file_name, &filesystem_, + &icing_filesystem_)); + + // 3. Merge the index. The main index should return not found when we search + // prefix contain "foo". + ICING_ASSERT_OK(Merge(*lite_index_, *term_id_codec_, main_index.get())); + // GetAccessorForPrefixTerm should return a valid accessor for "foo". + EXPECT_THAT(main_index->GetAccessorForPrefixTerm("foo"), + StatusIs(libtextclassifier3::StatusCode::NOT_FOUND)); +} + TEST_F(MainIndexTest, MainIndexGetAccessorForExactTermNotFound) { // Create the main index. It should have no entries in its lexicon. std::string main_index_file_name = index_dir_ + "/test_file.idx.index"; diff --git a/icing/legacy/index/icing-dynamic-trie.cc b/icing/legacy/index/icing-dynamic-trie.cc index baa043a..77876c4 100644 --- a/icing/legacy/index/icing-dynamic-trie.cc +++ b/icing/legacy/index/icing-dynamic-trie.cc @@ -70,6 +70,7 @@ #include <algorithm> #include <cerrno> #include <cinttypes> +#include <cstdint> #include <cstring> #include <memory> #include <utility> @@ -397,6 +398,8 @@ class IcingDynamicTrie::IcingDynamicTrieStorage { // storage. IcingScopedFd array_fds_[NUM_ARRAY_TYPES]; std::vector<IcingArrayStorage> array_storage_; + + // Legacy file system. Switch to use the new Filesystem class instead. 
const IcingFilesystem *filesystem_; }; @@ -1364,10 +1367,12 @@ uint32_t IcingDynamicTrie::size() const { return storage_->hdr().num_keys(); } -void IcingDynamicTrie::CollectStatsRecursive(const Node &node, - Stats *stats) const { +void IcingDynamicTrie::CollectStatsRecursive(const Node &node, Stats *stats, + uint32_t depth) const { if (node.is_leaf()) { stats->num_leaves++; + stats->sum_depth += depth; + stats->max_depth = max(stats->max_depth, depth); const char *suffix = storage_->GetSuffix(node.next_index()); stats->suffixes_used += strlen(suffix) + 1 + value_size(); if (!suffix[0]) { @@ -1379,13 +1384,16 @@ void IcingDynamicTrie::CollectStatsRecursive(const Node &node, for (; i < (1U << node.log2_num_children()); i++) { const Next &next = *storage_->GetNext(node.next_index(), i); if (next.node_index() == kInvalidNodeIndex) break; - CollectStatsRecursive(*storage_->GetNode(next.node_index()), stats); + CollectStatsRecursive(*storage_->GetNode(next.node_index()), stats, + depth + 1); } // At least one valid node in each next array if (i == 0) { ICING_LOG(FATAL) << "No valid node in 'next' array"; } + stats->sum_children += i; + stats->max_children = max(stats->max_children, i); stats->child_counts[i - 1]++; stats->wasted[node.log2_num_children()] += @@ -1467,9 +1475,12 @@ std::string IcingDynamicTrie::Stats::DumpStats(int verbosity) const { "Wasted total: %u\n" "Num intermediates %u num leaves %u " "suffixes used %u null %u\n" + "avg and max children for intermediates: %.3f, %u\n" + "avg and max depth for leaves: %.3f, %u\n" "Total next frag: %.3f%%\n", total_wasted, num_intermediates, num_leaves, suffixes_used, - null_suffixes, + null_suffixes, 1. * sum_children / num_intermediates, max_children, + 1. * sum_depth / num_leaves, max_depth, 100. 
* math_util::SafeDivide((total_free + total_wasted), num_nexts)); } IcingStringUtil::SStringAppendF( diff --git a/icing/legacy/index/icing-dynamic-trie.h b/icing/legacy/index/icing-dynamic-trie.h index 8821799..013b926 100644 --- a/icing/legacy/index/icing-dynamic-trie.h +++ b/icing/legacy/index/icing-dynamic-trie.h @@ -152,8 +152,13 @@ class IcingDynamicTrie : public IIcingStorage { uint32_t max_nodes; // Count of intermediate nodes. uint32_t num_intermediates; + // Total and maximum number of children of intermediate nodes. + uint32_t sum_children, max_children; + // Count of leaf nodes. uint32_t num_leaves; + // Total and maximum depth of leaf nodes. + uint32_t sum_depth, max_depth; // Next stats @@ -186,6 +191,7 @@ class IcingDynamicTrie : public IIcingStorage { uint32_t dirty_pages_nexts; uint32_t dirty_pages_suffixes; + // TODO(b/222349894) Convert the string output to a protocol buffer instead. std::string DumpStats(int verbosity) const; }; @@ -601,7 +607,8 @@ class IcingDynamicTrie : public IIcingStorage { static const uint32_t kInvalidSuffixIndex; // Stats helpers. - void CollectStatsRecursive(const Node &node, Stats *stats) const; + void CollectStatsRecursive(const Node &node, Stats *stats, + uint32_t depth = 0) const; // Helpers for Find and Insert. const Next *GetNextByChar(const Node *node, uint8_t key_char) const; diff --git a/icing/legacy/index/icing-filesystem.h b/icing/legacy/index/icing-filesystem.h index f645632..ce75a82 100644 --- a/icing/legacy/index/icing-filesystem.h +++ b/icing/legacy/index/icing-filesystem.h @@ -224,6 +224,11 @@ class IcingFilesystem { // Increments to_increment by size if size is valid, or sets to_increment // to kBadFileSize if either size or to_increment is kBadFileSize. static void IncrementByOrSetInvalid(uint64_t size, uint64_t *to_increment); + + // Return -1 if file_size is invalid. Otherwise, return file_size. + static int64_t SanitizeFileSize(int64_t file_size) { + return (file_size != kBadFileSize) ? 
file_size : -1; + } }; } // namespace lib diff --git a/icing/legacy/index/icing-flash-bitmap.h b/icing/legacy/index/icing-flash-bitmap.h index e3ba0e2..6bb9591 100644 --- a/icing/legacy/index/icing-flash-bitmap.h +++ b/icing/legacy/index/icing-flash-bitmap.h @@ -138,6 +138,7 @@ class IcingFlashBitmap { // Upgrade for version 18. bool UpgradeTo18(); + // Legacy file system. Switch to use the new Filesystem class instead. const IcingFilesystem *const filesystem_; std::string filename_; OpenType open_type_; diff --git a/icing/query/query-processor_benchmark.cc b/icing/query/query-processor_benchmark.cc index bdd40aa..e48fe78 100644 --- a/icing/query/query-processor_benchmark.cc +++ b/icing/query/query-processor_benchmark.cc @@ -16,7 +16,6 @@ #include "gmock/gmock.h" #include "third_party/absl/flags/flag.h" #include "icing/document-builder.h" -#include "icing/helpers/icu/icu-data-file-helper.h" #include "icing/index/index.h" #include "icing/proto/term.pb.h" #include "icing/query/query-processor.h" @@ -24,6 +23,7 @@ #include "icing/schema/section.h" #include "icing/store/document-id.h" #include "icing/testing/common-matchers.h" +#include "icing/testing/icu-data-file-helper.h" #include "icing/testing/test-data.h" #include "icing/testing/tmp-directory.h" #include "icing/tokenization/language-segmenter-factory.h" diff --git a/icing/query/query-processor_test.cc b/icing/query/query-processor_test.cc index daeb479..950f739 100644 --- a/icing/query/query-processor_test.cc +++ b/icing/query/query-processor_test.cc @@ -23,7 +23,6 @@ #include "gtest/gtest.h" #include "icing/document-builder.h" #include "icing/file/filesystem.h" -#include "icing/helpers/icu/icu-data-file-helper.h" #include "icing/index/hit/doc-hit-info.h" #include "icing/index/index.h" #include "icing/index/iterator/doc-hit-info-iterator-test-util.h" @@ -40,6 +39,7 @@ #include "icing/store/document-store.h" #include "icing/testing/common-matchers.h" #include "icing/testing/fake-clock.h" +#include 
"icing/testing/icu-data-file-helper.h" #include "icing/testing/jni-test-helpers.h" #include "icing/testing/test-data.h" #include "icing/testing/tmp-directory.h" @@ -61,16 +61,16 @@ using ::testing::SizeIs; using ::testing::Test; using ::testing::UnorderedElementsAre; -constexpr PropertyConfigProto_DataType_Code TYPE_STRING = - PropertyConfigProto_DataType_Code_STRING; +constexpr PropertyConfigProto::DataType::Code TYPE_STRING = + PropertyConfigProto::DataType::STRING; -constexpr PropertyConfigProto_Cardinality_Code CARDINALITY_OPTIONAL = - PropertyConfigProto_Cardinality_Code_OPTIONAL; +constexpr PropertyConfigProto::Cardinality::Code CARDINALITY_OPTIONAL = + PropertyConfigProto::Cardinality::OPTIONAL; -constexpr StringIndexingConfig_TokenizerType_Code TOKENIZER_PLAIN = - StringIndexingConfig_TokenizerType_Code_PLAIN; +constexpr StringIndexingConfig::TokenizerType::Code TOKENIZER_PLAIN = + StringIndexingConfig::TokenizerType::PLAIN; -constexpr TermMatchType_Code MATCH_EXACT = TermMatchType_Code_EXACT_ONLY; +constexpr TermMatchType::Code MATCH_EXACT = TermMatchType::EXACT_ONLY; class QueryProcessorTest : public Test { protected: diff --git a/icing/query/suggestion-processor.cc b/icing/query/suggestion-processor.cc index 9c60810..cfa53f6 100644 --- a/icing/query/suggestion-processor.cc +++ b/icing/query/suggestion-processor.cc @@ -35,7 +35,7 @@ SuggestionProcessor::Create(Index* index, libtextclassifier3::StatusOr<std::vector<TermMetadata>> SuggestionProcessor::QuerySuggestions( const icing::lib::SuggestionSpecProto& suggestion_spec, - const std::vector<NamespaceId>& namespace_ids) { + const NamespaceChecker* namespace_checker) { // We use query tokenizer to tokenize the give prefix, and we only use the // last token to be the suggestion prefix. ICING_ASSIGN_OR_RETURN( @@ -73,8 +73,11 @@ SuggestionProcessor::QuerySuggestions( // lowercase. 
ICING_ASSIGN_OR_RETURN( std::vector<TermMetadata> terms, - index_.FindTermsByPrefix(normalizer_.NormalizeTerm(last_token), - namespace_ids, suggestion_spec.num_to_return())); + index_.FindTermsByPrefix( + normalizer_.NormalizeTerm(last_token), + suggestion_spec.num_to_return(), + suggestion_spec.scoring_spec().scoring_match_type(), + namespace_checker)); for (TermMetadata& term : terms) { term.content = query_prefix + term.content; @@ -90,4 +93,4 @@ SuggestionProcessor::SuggestionProcessor( normalizer_(*normalizer) {} } // namespace lib -} // namespace icing
\ No newline at end of file +} // namespace icing diff --git a/icing/query/suggestion-processor.h b/icing/query/suggestion-processor.h index b10dc84..088863e 100644 --- a/icing/query/suggestion-processor.h +++ b/icing/query/suggestion-processor.h @@ -48,7 +48,7 @@ class SuggestionProcessor { // INTERNAL_ERROR on all other errors libtextclassifier3::StatusOr<std::vector<TermMetadata>> QuerySuggestions( const SuggestionSpecProto& suggestion_spec, - const std::vector<NamespaceId>& namespace_ids); + const NamespaceChecker* namespace_checker); private: explicit SuggestionProcessor(Index* index, diff --git a/icing/query/suggestion-processor_test.cc b/icing/query/suggestion-processor_test.cc index 5e62277..ba4c90a 100644 --- a/icing/query/suggestion-processor_test.cc +++ b/icing/query/suggestion-processor_test.cc @@ -15,10 +15,11 @@ #include "icing/query/suggestion-processor.h" #include "gmock/gmock.h" -#include "icing/helpers/icu/icu-data-file-helper.h" #include "icing/store/document-store.h" +#include "icing/testing/always-true-namespace-checker-impl.h" #include "icing/testing/common-matchers.h" #include "icing/testing/fake-clock.h" +#include "icing/testing/icu-data-file-helper.h" #include "icing/testing/jni-test-helpers.h" #include "icing/testing/test-data.h" #include "icing/testing/tmp-directory.h" @@ -80,7 +81,6 @@ class SuggestionProcessorTest : public Test { DocumentStore::CreateResult create_result, DocumentStore::Create(&filesystem_, store_dir_, &fake_clock_, schema_store_.get())); - document_store_ = std::move(create_result.document_store); } libtextclassifier3::Status AddTokenToIndex( @@ -93,7 +93,6 @@ class SuggestionProcessorTest : public Test { } void TearDown() override { - document_store_.reset(); filesystem_.DeleteDirectoryRecursively(test_dir_.c_str()); } @@ -103,7 +102,6 @@ class SuggestionProcessorTest : public Test { std::unique_ptr<Index> index_; std::unique_ptr<LanguageSegmenter> language_segmenter_; std::unique_ptr<Normalizer> normalizer_; - 
std::unique_ptr<DocumentStore> document_store_; std::unique_ptr<SchemaStore> schema_store_; std::unique_ptr<const JniCache> jni_cache_ = GetTestJniCache(); FakeClock fake_clock_; @@ -131,9 +129,10 @@ TEST_F(SuggestionProcessorTest, PrependedPrefixTokenTest) { "prefix token should be prepended to the suggestion f"); suggestion_spec.set_num_to_return(10); - ICING_ASSERT_OK_AND_ASSIGN(std::vector<TermMetadata> terms, - suggestion_processor->QuerySuggestions( - suggestion_spec, /*namespace_ids=*/{})); + AlwaysTrueNamespaceCheckerImpl impl; + ICING_ASSERT_OK_AND_ASSIGN( + std::vector<TermMetadata> terms, + suggestion_processor->QuerySuggestions(suggestion_spec, &impl)); EXPECT_THAT(terms.at(0).content, "prefix token should be prepended to the suggestion foo"); } @@ -152,9 +151,10 @@ TEST_F(SuggestionProcessorTest, NonExistentPrefixTest) { suggestion_spec.set_prefix("nonExistTerm"); suggestion_spec.set_num_to_return(10); - ICING_ASSERT_OK_AND_ASSIGN(std::vector<TermMetadata> terms, - suggestion_processor->QuerySuggestions( - suggestion_spec, /*namespace_ids=*/{})); + AlwaysTrueNamespaceCheckerImpl impl; + ICING_ASSERT_OK_AND_ASSIGN( + std::vector<TermMetadata> terms, + suggestion_processor->QuerySuggestions(suggestion_spec, &impl)); EXPECT_THAT(terms, IsEmpty()); } @@ -173,9 +173,10 @@ TEST_F(SuggestionProcessorTest, PrefixTrailingSpaceTest) { suggestion_spec.set_prefix("f "); suggestion_spec.set_num_to_return(10); - ICING_ASSERT_OK_AND_ASSIGN(std::vector<TermMetadata> terms, - suggestion_processor->QuerySuggestions( - suggestion_spec, /*namespace_ids=*/{})); + AlwaysTrueNamespaceCheckerImpl impl; + ICING_ASSERT_OK_AND_ASSIGN( + std::vector<TermMetadata> terms, + suggestion_processor->QuerySuggestions(suggestion_spec, &impl)); EXPECT_THAT(terms, IsEmpty()); } @@ -193,28 +194,26 @@ TEST_F(SuggestionProcessorTest, NormalizePrefixTest) { SuggestionSpecProto suggestion_spec; suggestion_spec.set_prefix("F"); suggestion_spec.set_num_to_return(10); + + 
AlwaysTrueNamespaceCheckerImpl impl; ICING_ASSERT_OK_AND_ASSIGN( std::vector<TermMetadata> terms, - suggestion_processor->QuerySuggestions(suggestion_spec, - /*namespace_ids=*/{})); + suggestion_processor->QuerySuggestions(suggestion_spec, &impl)); EXPECT_THAT(terms.at(0).content, "foo"); suggestion_spec.set_prefix("fO"); ICING_ASSERT_OK_AND_ASSIGN( - terms, suggestion_processor->QuerySuggestions(suggestion_spec, - /*namespace_ids=*/{})); + terms, suggestion_processor->QuerySuggestions(suggestion_spec, &impl)); EXPECT_THAT(terms.at(0).content, "foo"); suggestion_spec.set_prefix("Fo"); ICING_ASSERT_OK_AND_ASSIGN( - terms, suggestion_processor->QuerySuggestions(suggestion_spec, - /*namespace_ids=*/{})); + terms, suggestion_processor->QuerySuggestions(suggestion_spec, &impl)); EXPECT_THAT(terms.at(0).content, "foo"); suggestion_spec.set_prefix("FO"); ICING_ASSERT_OK_AND_ASSIGN( - terms, suggestion_processor->QuerySuggestions(suggestion_spec, - /*namespace_ids=*/{})); + terms, suggestion_processor->QuerySuggestions(suggestion_spec, &impl)); EXPECT_THAT(terms.at(0).content, "foo"); } @@ -235,9 +234,10 @@ TEST_F(SuggestionProcessorTest, OrOperatorPrefixTest) { suggestion_spec.set_prefix("f OR"); suggestion_spec.set_num_to_return(10); - ICING_ASSERT_OK_AND_ASSIGN(std::vector<TermMetadata> terms, - suggestion_processor->QuerySuggestions( - suggestion_spec, /*namespace_ids=*/{})); + AlwaysTrueNamespaceCheckerImpl impl; + ICING_ASSERT_OK_AND_ASSIGN( + std::vector<TermMetadata> terms, + suggestion_processor->QuerySuggestions(suggestion_spec, &impl)); // Last Operator token will be used to query suggestion EXPECT_THAT(terms.at(0).content, "f original"); @@ -256,19 +256,20 @@ TEST_F(SuggestionProcessorTest, ParenthesesOperatorPrefixTest) { suggestion_spec.set_prefix("{f}"); suggestion_spec.set_num_to_return(10); - ICING_ASSERT_OK_AND_ASSIGN(std::vector<TermMetadata> terms, - suggestion_processor->QuerySuggestions( - suggestion_spec, /*namespace_ids=*/{})); + 
AlwaysTrueNamespaceCheckerImpl impl; + ICING_ASSERT_OK_AND_ASSIGN( + std::vector<TermMetadata> terms, + suggestion_processor->QuerySuggestions(suggestion_spec, &impl)); EXPECT_THAT(terms, IsEmpty()); suggestion_spec.set_prefix("[f]"); - ICING_ASSERT_OK_AND_ASSIGN(terms, suggestion_processor->QuerySuggestions( - suggestion_spec, /*namespace_ids=*/{})); + ICING_ASSERT_OK_AND_ASSIGN( + terms, suggestion_processor->QuerySuggestions(suggestion_spec, &impl)); EXPECT_THAT(terms, IsEmpty()); suggestion_spec.set_prefix("(f)"); - ICING_ASSERT_OK_AND_ASSIGN(terms, suggestion_processor->QuerySuggestions( - suggestion_spec, /*namespace_ids=*/{})); + ICING_ASSERT_OK_AND_ASSIGN( + terms, suggestion_processor->QuerySuggestions(suggestion_spec, &impl)); EXPECT_THAT(terms, IsEmpty()); } @@ -286,15 +287,15 @@ TEST_F(SuggestionProcessorTest, OtherSpecialPrefixTest) { suggestion_spec.set_prefix("f:"); suggestion_spec.set_num_to_return(10); - ICING_ASSERT_OK_AND_ASSIGN(std::vector<TermMetadata> terms, - suggestion_processor->QuerySuggestions( - suggestion_spec, /*namespace_ids=*/{})); + AlwaysTrueNamespaceCheckerImpl impl; + ICING_ASSERT_OK_AND_ASSIGN( + std::vector<TermMetadata> terms, + suggestion_processor->QuerySuggestions(suggestion_spec, &impl)); EXPECT_THAT(terms, IsEmpty()); suggestion_spec.set_prefix("f-"); ICING_ASSERT_OK_AND_ASSIGN( - terms, suggestion_processor->QuerySuggestions(suggestion_spec, - /*namespace_ids=*/{})); + terms, suggestion_processor->QuerySuggestions(suggestion_spec, &impl)); EXPECT_THAT(terms, IsEmpty()); } @@ -312,9 +313,10 @@ TEST_F(SuggestionProcessorTest, InvalidPrefixTest) { suggestion_spec.set_prefix("OR OR - :"); suggestion_spec.set_num_to_return(10); - ICING_ASSERT_OK_AND_ASSIGN(std::vector<TermMetadata> terms, - suggestion_processor->QuerySuggestions( - suggestion_spec, /*namespace_ids=*/{})); + AlwaysTrueNamespaceCheckerImpl impl; + ICING_ASSERT_OK_AND_ASSIGN( + std::vector<TermMetadata> terms, + 
suggestion_processor->QuerySuggestions(suggestion_spec, &impl)); EXPECT_THAT(terms, IsEmpty()); } diff --git a/icing/result/result-retriever_test.cc b/icing/result/result-retriever_test.cc index 1c9684d..0d812e4 100644 --- a/icing/result/result-retriever_test.cc +++ b/icing/result/result-retriever_test.cc @@ -22,7 +22,6 @@ #include "gtest/gtest.h" #include "icing/document-builder.h" #include "icing/file/mock-filesystem.h" -#include "icing/helpers/icu/icu-data-file-helper.h" #include "icing/portable/equals-proto.h" #include "icing/portable/platform.h" #include "icing/proto/document.pb.h" @@ -36,6 +35,7 @@ #include "icing/store/document-id.h" #include "icing/testing/common-matchers.h" #include "icing/testing/fake-clock.h" +#include "icing/testing/icu-data-file-helper.h" #include "icing/testing/snippet-helpers.h" #include "icing/testing/test-data.h" #include "icing/testing/tmp-directory.h" @@ -55,14 +55,14 @@ using ::testing::IsEmpty; using ::testing::Return; using ::testing::SizeIs; -constexpr PropertyConfigProto_Cardinality_Code CARDINALITY_OPTIONAL = - PropertyConfigProto_Cardinality_Code_OPTIONAL; +constexpr PropertyConfigProto::Cardinality::Code CARDINALITY_OPTIONAL = + PropertyConfigProto::Cardinality::OPTIONAL; -constexpr StringIndexingConfig_TokenizerType_Code TOKENIZER_PLAIN = - StringIndexingConfig_TokenizerType_Code_PLAIN; +constexpr StringIndexingConfig::TokenizerType::Code TOKENIZER_PLAIN = + StringIndexingConfig::TokenizerType::PLAIN; -constexpr TermMatchType_Code MATCH_EXACT = TermMatchType_Code_EXACT_ONLY; -constexpr TermMatchType_Code MATCH_PREFIX = TermMatchType_Code_PREFIX; +constexpr TermMatchType::Code MATCH_EXACT = TermMatchType::EXACT_ONLY; +constexpr TermMatchType::Code MATCH_PREFIX = TermMatchType::PREFIX; class ResultRetrieverTest : public testing::Test { protected: @@ -160,7 +160,7 @@ ResultSpecProto::SnippetSpecProto CreateSnippetSpec() { ResultSpecProto::SnippetSpecProto snippet_spec; 
snippet_spec.set_num_to_snippet(std::numeric_limits<int>::max()); snippet_spec.set_num_matches_per_property(std::numeric_limits<int>::max()); - snippet_spec.set_max_window_bytes(1024); + snippet_spec.set_max_window_utf32_length(1024); return snippet_spec; } @@ -362,8 +362,8 @@ TEST_F(ResultRetrieverTest, NotIgnoreErrors) { TEST_F(ResultRetrieverTest, IOErrorShouldReturnInternalError) { MockFilesystem mock_filesystem; - ON_CALL(mock_filesystem, OpenForRead(_)).WillByDefault(Return(false)); - + ON_CALL(mock_filesystem, PRead(A<int>(), A<void*>(), A<size_t>(), A<off_t>())) + .WillByDefault(Return(false)); ICING_ASSERT_OK_AND_ASSIGN( DocumentStore::CreateResult create_result, DocumentStore::Create(&mock_filesystem, test_dir_, &fake_clock_, diff --git a/icing/result/result-state-manager_test.cc b/icing/result/result-state-manager_test.cc index 32e45aa..8a9005d 100644 --- a/icing/result/result-state-manager_test.cc +++ b/icing/result/result-state-manager_test.cc @@ -849,7 +849,7 @@ TEST_F(ResultStateManagerTest, ShouldGetSnippetContext) { ResultSpecProto result_spec = CreateResultSpec(/*num_per_page=*/1); result_spec.mutable_snippet_spec()->set_num_to_snippet(5); result_spec.mutable_snippet_spec()->set_num_matches_per_property(5); - result_spec.mutable_snippet_spec()->set_max_window_bytes(5); + result_spec.mutable_snippet_spec()->set_max_window_utf32_length(5); SearchSpecProto search_spec; search_spec.set_term_match_type(TermMatchType::EXACT_ONLY); @@ -884,7 +884,7 @@ TEST_F(ResultStateManagerTest, ShouldGetDefaultSnippetContext) { // 0 indicates no snippeting result_spec.mutable_snippet_spec()->set_num_to_snippet(0); result_spec.mutable_snippet_spec()->set_num_matches_per_property(0); - result_spec.mutable_snippet_spec()->set_max_window_bytes(0); + result_spec.mutable_snippet_spec()->set_max_window_utf32_length(0); SearchSpecProto search_spec; search_spec.set_term_match_type(TermMatchType::EXACT_ONLY); diff --git a/icing/result/result-state_test.cc 
b/icing/result/result-state_test.cc index f2121a5..d92fcfa 100644 --- a/icing/result/result-state_test.cc +++ b/icing/result/result-state_test.cc @@ -143,7 +143,7 @@ TEST_F(ResultStateTest, ShouldReturnSnippetContextAccordingToSpecs) { ResultSpecProto result_spec = CreateResultSpec(/*num_per_page=*/2); result_spec.mutable_snippet_spec()->set_num_to_snippet(5); result_spec.mutable_snippet_spec()->set_num_matches_per_property(5); - result_spec.mutable_snippet_spec()->set_max_window_bytes(5); + result_spec.mutable_snippet_spec()->set_max_window_utf32_length(5); SectionRestrictQueryTermsMap query_terms_map; query_terms_map.emplace("term1", std::unordered_set<std::string>()); @@ -178,7 +178,7 @@ TEST_F(ResultStateTest, NoSnippetingShouldReturnNull) { // stored. result_spec.mutable_snippet_spec()->set_num_to_snippet(0); result_spec.mutable_snippet_spec()->set_num_matches_per_property(5); - result_spec.mutable_snippet_spec()->set_max_window_bytes(5); + result_spec.mutable_snippet_spec()->set_max_window_utf32_length(5); SectionRestrictQueryTermsMap query_terms_map; query_terms_map.emplace("term1", std::unordered_set<std::string>()); diff --git a/icing/result/snippet-retriever.cc b/icing/result/snippet-retriever.cc index c46762e..bd1524e 100644 --- a/icing/result/snippet-retriever.cc +++ b/icing/result/snippet-retriever.cc @@ -41,6 +41,7 @@ #include "icing/transform/normalizer.h" #include "icing/util/character-iterator.h" #include "icing/util/i18n-utils.h" +#include "icing/util/logging.h" #include "icing/util/status-macros.h" namespace icing { @@ -75,6 +76,67 @@ inline std::string AddIndexToPath(int values_size, int index, kRBracket); } +// Returns a string of the normalized text of the input Token. Normalization +// is applied based on the Token's type. 
+std::string NormalizeToken(const Normalizer& normalizer, const Token& token) { + switch (token.type) { + case Token::Type::REGULAR: + return normalizer.NormalizeTerm(token.text); + case Token::Type::VERBATIM: + return std::string(token.text); + case Token::Type::QUERY_EXCLUSION: + [[fallthrough]]; + case Token::Type::QUERY_LEFT_PARENTHESES: + [[fallthrough]]; + case Token::Type::QUERY_RIGHT_PARENTHESES: + [[fallthrough]]; + case Token::Type::QUERY_OR: + [[fallthrough]]; + case Token::Type::QUERY_PROPERTY: + [[fallthrough]]; + case Token::Type::INVALID: + ICING_LOG(WARNING) << "Unable to normalize token of type: " + << static_cast<int>(token.type); + return std::string(token.text); + } +} + +// Returns a CharacterIterator for token's text, advancing one past the last +// matching character from the query term. +CharacterIterator FindMatchEnd(const Normalizer& normalizer, const Token& token, + const std::string& match_query_term) { + switch (token.type) { + case Token::Type::VERBATIM: { + // VERBATIM tokens are not normalized. This means the non-normalized + // matched query term must be either equal to or a prefix of the token's + // text. Therefore, the match must end at the end of the matched query + // term. 
+ CharacterIterator verbatim_match_end = + CharacterIterator(token.text, 0, 0, 0); + verbatim_match_end.AdvanceToUtf8(match_query_term.length()); + return verbatim_match_end; + } + case Token::Type::QUERY_EXCLUSION: + [[fallthrough]]; + case Token::Type::QUERY_LEFT_PARENTHESES: + [[fallthrough]]; + case Token::Type::QUERY_RIGHT_PARENTHESES: + [[fallthrough]]; + case Token::Type::QUERY_OR: + [[fallthrough]]; + case Token::Type::QUERY_PROPERTY: + [[fallthrough]]; + case Token::Type::INVALID: + ICING_LOG(WARNING) + << "Unexpected Token type " << static_cast<int>(token.type) + << " found when finding match end of query term and token."; + [[fallthrough]]; + case Token::Type::REGULAR: + return normalizer.FindNormalizedMatchEndPosition(token.text, + match_query_term); + } +} + class TokenMatcher { public: virtual ~TokenMatcher() = default; @@ -102,15 +164,16 @@ class TokenMatcherExact : public TokenMatcher { normalizer_(normalizer) {} CharacterIterator Matches(Token token) const override { - std::string s = normalizer_.NormalizeTerm(token.text); + std::string s = NormalizeToken(normalizer_, token); auto itr = unrestricted_query_terms_.find(s); if (itr == unrestricted_query_terms_.end()) { itr = restricted_query_terms_.find(s); } if (itr != unrestricted_query_terms_.end() && itr != restricted_query_terms_.end()) { - return normalizer_.FindNormalizedMatchEndPosition(token.text, *itr); + return FindMatchEnd(normalizer_, token, *itr); } + return CharacterIterator(token.text, -1, -1, -1); } @@ -131,19 +194,17 @@ class TokenMatcherPrefix : public TokenMatcher { normalizer_(normalizer) {} CharacterIterator Matches(Token token) const override { - std::string s = normalizer_.NormalizeTerm(token.text); + std::string s = NormalizeToken(normalizer_, token); for (const std::string& query_term : unrestricted_query_terms_) { if (query_term.length() <= s.length() && s.compare(0, query_term.length(), query_term) == 0) { - return normalizer_.FindNormalizedMatchEndPosition(token.text, - 
query_term); + return FindMatchEnd(normalizer_, token, query_term); } } for (const std::string& query_term : restricted_query_terms_) { if (query_term.length() <= s.length() && s.compare(0, query_term.length(), query_term) == 0) { - return normalizer_.FindNormalizedMatchEndPosition(token.text, - query_term); + return FindMatchEnd(normalizer_, token, query_term); } } return CharacterIterator(token.text, -1, -1, -1); @@ -184,7 +245,7 @@ libtextclassifier3::StatusOr<CharacterIterator> DetermineWindowStart( const ResultSpecProto::SnippetSpecProto& snippet_spec, std::string_view value, int window_start_min_exclusive_utf32, Tokenizer::Iterator* iterator) { - if (!iterator->ResetToTokenAfter(window_start_min_exclusive_utf32)) { + if (!iterator->ResetToTokenStartingAfter(window_start_min_exclusive_utf32)) { return absl_ports::InternalError( "Couldn't reset tokenizer to determine snippet window!"); } @@ -219,7 +280,7 @@ libtextclassifier3::StatusOr<CharacterIterator> DetermineWindowEnd( const ResultSpecProto::SnippetSpecProto& snippet_spec, std::string_view value, int window_end_max_exclusive_utf32, Tokenizer::Iterator* iterator) { - if (!iterator->ResetToTokenBefore(window_end_max_exclusive_utf32)) { + if (!iterator->ResetToTokenEndingBefore(window_end_max_exclusive_utf32)) { return absl_ports::InternalError( "Couldn't reset tokenizer to determine snippet window!"); } @@ -283,9 +344,9 @@ libtextclassifier3::StatusOr<SnippetMatchProto> RetrieveMatch( int match_len_utf32 = end_itr.utf32_index() - match_pos_utf32; int match_mid_utf32 = match_pos_utf32 + match_len_utf32 / 2; int window_start_min_exclusive_utf32 = - (match_mid_utf32 - snippet_spec.max_window_bytes() / 2) - 1; + (match_mid_utf32 - snippet_spec.max_window_utf32_length() / 2) - 1; int window_end_max_exclusive_utf32 = - match_mid_utf32 + (snippet_spec.max_window_bytes() + 1) / 2; + match_mid_utf32 + (snippet_spec.max_window_utf32_length() + 1) / 2; 
snippet_match.set_exact_match_byte_position(start_itr.utf8_index()); snippet_match.set_exact_match_utf16_position(start_itr.utf16_index()); @@ -296,7 +357,7 @@ libtextclassifier3::StatusOr<SnippetMatchProto> RetrieveMatch( // Only include windows if it'll at least include the matched text. Otherwise, // it'll just be an empty string anyways. - if (snippet_spec.max_window_bytes() >= match_len_utf32) { + if (snippet_spec.max_window_utf32_length() >= match_len_utf32) { // Find the beginning of the window. ICING_ASSIGN_OR_RETURN( CharacterIterator window_start, @@ -337,8 +398,13 @@ libtextclassifier3::StatusOr<SnippetMatchProto> RetrieveMatch( // DetermineWindowStart/End may change the position of the iterator. So, // reset the iterator back to the original position. - bool success = (match_pos_utf32 > 0) ? iterator->ResetToTokenAfter(match_pos_utf32 - 1) - : iterator->ResetToStart(); + bool success = false; + if (match_pos_utf32 > 0) { + success = iterator->ResetToTokenStartingAfter(match_pos_utf32 - 1); + } else { + success = iterator->ResetToStart(); + } + if (!success) { return absl_ports::InternalError( "Couldn't reset tokenizer to determine snippet window!"); diff --git a/icing/result/snippet-retriever_test.cc b/icing/result/snippet-retriever_test.cc index f811941..0de2295 100644 --- a/icing/result/snippet-retriever_test.cc +++ b/icing/result/snippet-retriever_test.cc @@ -22,7 +22,6 @@ #include "gtest/gtest.h" #include "icing/document-builder.h" #include "icing/file/mock-filesystem.h" -#include "icing/helpers/icu/icu-data-file-helper.h" #include "icing/portable/equals-proto.h" #include "icing/portable/platform.h" #include "icing/proto/document.pb.h" @@ -37,6 +36,7 @@ #include "icing/store/key-mapper.h" #include "icing/testing/common-matchers.h" #include "icing/testing/fake-clock.h" +#include "icing/testing/icu-data-file-helper.h" #include "icing/testing/jni-test-helpers.h" #include "icing/testing/snippet-helpers.h" #include "icing/testing/test-data.h" @@ -58,16 
+58,18 @@ using ::testing::Eq; using ::testing::IsEmpty; using ::testing::SizeIs; -constexpr PropertyConfigProto_Cardinality_Code CARDINALITY_OPTIONAL = - PropertyConfigProto_Cardinality_Code_OPTIONAL; -constexpr PropertyConfigProto_Cardinality_Code CARDINALITY_REPEATED = - PropertyConfigProto_Cardinality_Code_REPEATED; +constexpr PropertyConfigProto::Cardinality::Code CARDINALITY_OPTIONAL = + PropertyConfigProto::Cardinality::OPTIONAL; +constexpr PropertyConfigProto::Cardinality::Code CARDINALITY_REPEATED = + PropertyConfigProto::Cardinality::REPEATED; -constexpr StringIndexingConfig_TokenizerType_Code TOKENIZER_PLAIN = - StringIndexingConfig_TokenizerType_Code_PLAIN; +constexpr StringIndexingConfig::TokenizerType::Code TOKENIZER_PLAIN = + StringIndexingConfig::TokenizerType::PLAIN; +constexpr StringIndexingConfig::TokenizerType::Code TOKENIZER_VERBATIM = + StringIndexingConfig::TokenizerType::VERBATIM; -constexpr TermMatchType_Code MATCH_EXACT = TermMatchType_Code_EXACT_ONLY; -constexpr TermMatchType_Code MATCH_PREFIX = TermMatchType_Code_PREFIX; +constexpr TermMatchType::Code MATCH_EXACT = TermMatchType::EXACT_ONLY; +constexpr TermMatchType::Code MATCH_PREFIX = TermMatchType::PREFIX; std::vector<std::string_view> GetPropertyPaths(const SnippetProto& snippet) { std::vector<std::string_view> paths; @@ -131,7 +133,7 @@ class SnippetRetrieverTest : public testing::Test { snippet_spec_.set_num_to_snippet(std::numeric_limits<int32_t>::max()); snippet_spec_.set_num_matches_per_property( std::numeric_limits<int32_t>::max()); - snippet_spec_.set_max_window_bytes(64); + snippet_spec_.set_max_window_utf32_length(64); } void TearDown() override { @@ -178,7 +180,7 @@ TEST_F(SnippetRetrieverTest, SnippetingWindowMaxWindowSizeSmallerThanMatch) { // Window starts at the beginning of "three" and ends in the middle of // "three". 
len=4, orig_window= "thre" - snippet_spec_.set_max_window_bytes(4); + snippet_spec_.set_max_window_utf32_length(4); SnippetProto snippet = snippet_retriever_->RetrieveSnippet( query_terms, MATCH_EXACT, snippet_spec_, document, section_mask); @@ -204,7 +206,7 @@ TEST_F(SnippetRetrieverTest, // Window starts at the beginning of "three" and at the exact end of // "three". len=5, orig_window= "three" - snippet_spec_.set_max_window_bytes(5); + snippet_spec_.set_max_window_utf32_length(5); SnippetProto snippet = snippet_retriever_->RetrieveSnippet( query_terms, MATCH_EXACT, snippet_spec_, document, section_mask); @@ -230,7 +232,7 @@ TEST_F(SnippetRetrieverTest, // Window starts at the beginning of "four" and at the exact end of // "four". len=4, orig_window= "four" - snippet_spec_.set_max_window_bytes(4); + snippet_spec_.set_max_window_utf32_length(4); SnippetProto snippet = snippet_retriever_->RetrieveSnippet( query_terms, MATCH_EXACT, snippet_spec_, document, section_mask); @@ -262,7 +264,7 @@ TEST_F(SnippetRetrieverTest, SnippetingWindowMaxWindowStartsInWhitespace) { // 1. untrimmed, no-shifting window will be (2,17). // 2. trimmed, no-shifting window [4,13) "two three" // 3. trimmed, shifted window [4,18) "two three four" - snippet_spec_.set_max_window_bytes(14); + snippet_spec_.set_max_window_utf32_length(14); SnippetProto snippet = snippet_retriever_->RetrieveSnippet( query_terms, MATCH_EXACT, snippet_spec_, document, section_mask); @@ -295,7 +297,7 @@ TEST_F(SnippetRetrieverTest, SnippetingWindowMaxWindowStartsMidToken) { // 1. untrimmed, no-shifting window will be (1,18). // 2. trimmed, no-shifting window [4,18) "two three four" // 3. trimmed, shifted window [4,20) "two three four.." 
- snippet_spec_.set_max_window_bytes(16); + snippet_spec_.set_max_window_utf32_length(16); SnippetProto snippet = snippet_retriever_->RetrieveSnippet( query_terms, MATCH_EXACT, snippet_spec_, document, section_mask); @@ -321,7 +323,7 @@ TEST_F(SnippetRetrieverTest, SnippetingWindowMaxWindowEndsInPunctuation) { // Window ends in the middle of all the punctuation and window starts at 0. // len=20, orig_window="one two three four.." - snippet_spec_.set_max_window_bytes(20); + snippet_spec_.set_max_window_utf32_length(20); SnippetProto snippet = snippet_retriever_->RetrieveSnippet( query_terms, MATCH_EXACT, snippet_spec_, document, section_mask); @@ -349,7 +351,7 @@ TEST_F(SnippetRetrieverTest, // Window ends in the middle of all the punctuation and window starts at 0. // len=26, orig_window="pside down in Australia¿" - snippet_spec_.set_max_window_bytes(24); + snippet_spec_.set_max_window_utf32_length(24); SnippetProto snippet = snippet_retriever_->RetrieveSnippet( query_terms, MATCH_EXACT, snippet_spec_, document, section_mask); @@ -377,7 +379,7 @@ TEST_F(SnippetRetrieverTest, // Window ends in the middle of all the punctuation and window starts at 0. // len=26, orig_window="upside down in Australia¿ " - snippet_spec_.set_max_window_bytes(26); + snippet_spec_.set_max_window_utf32_length(26); SnippetProto snippet = snippet_retriever_->RetrieveSnippet( query_terms, MATCH_EXACT, snippet_spec_, document, section_mask); @@ -410,7 +412,7 @@ TEST_F(SnippetRetrieverTest, SnippetingWindowMaxWindowStartsBeforeValueStart) { // 1. untrimmed, no-shifting window will be (-2,21). // 2. trimmed, no-shifting window [0,21) "one two three four..." // 3. trimmed, shifted window [0,22) "one two three four...." 
- snippet_spec_.set_max_window_bytes(22); + snippet_spec_.set_max_window_utf32_length(22); SnippetProto snippet = snippet_retriever_->RetrieveSnippet( query_terms, MATCH_EXACT, snippet_spec_, document, section_mask); @@ -436,7 +438,7 @@ TEST_F(SnippetRetrieverTest, SnippetingWindowMaxWindowEndsInWhitespace) { // Window ends before "five" but after all the punctuation // len=26, orig_window="one two three four.... " - snippet_spec_.set_max_window_bytes(26); + snippet_spec_.set_max_window_utf32_length(26); SnippetProto snippet = snippet_retriever_->RetrieveSnippet( query_terms, MATCH_EXACT, snippet_spec_, document, section_mask); @@ -469,7 +471,7 @@ TEST_F(SnippetRetrieverTest, SnippetingWindowMaxWindowEndsMidToken) { // 1. untrimmed, no-shifting window will be ((-7,26). // 2. trimmed, no-shifting window [0,26) "one two three four...." // 3. trimmed, shifted window [0,27) "one two three four.... five" - snippet_spec_.set_max_window_bytes(32); + snippet_spec_.set_max_window_utf32_length(32); SnippetProto snippet = snippet_retriever_->RetrieveSnippet( query_terms, MATCH_EXACT, snippet_spec_, document, section_mask); @@ -495,7 +497,7 @@ TEST_F(SnippetRetrieverTest, SnippetingWindowMaxWindowSizeEqualToValueSize) { // Max window size equals the size of the value. // len=34, orig_window="one two three four.... five" - snippet_spec_.set_max_window_bytes(34); + snippet_spec_.set_max_window_utf32_length(34); SnippetProto snippet = snippet_retriever_->RetrieveSnippet( query_terms, MATCH_EXACT, snippet_spec_, document, section_mask); @@ -521,7 +523,7 @@ TEST_F(SnippetRetrieverTest, SnippetingWindowMaxWindowSizeLargerThanValueSize) { // Max window size exceeds the size of the value. // len=36, orig_window="one two three four.... 
five" - snippet_spec_.set_max_window_bytes(36); + snippet_spec_.set_max_window_utf32_length(36); SnippetProto snippet = snippet_retriever_->RetrieveSnippet( query_terms, MATCH_EXACT, snippet_spec_, document, section_mask); @@ -555,7 +557,7 @@ TEST_F(SnippetRetrieverTest, SnippetingWindowMatchAtTextStart) { // 1. untrimmed, no-shifting window will be (-10,19). // 2. trimmed, no-shifting window [0,19) "one two three four." // 3. trimmed, shifted window [0,27) "one two three four.... five" - snippet_spec_.set_max_window_bytes(28); + snippet_spec_.set_max_window_utf32_length(28); SnippetProto snippet = snippet_retriever_->RetrieveSnippet( query_terms, MATCH_EXACT, snippet_spec_, document, section_mask); @@ -589,7 +591,7 @@ TEST_F(SnippetRetrieverTest, SnippetingWindowMatchAtTextEnd) { // 1. untrimmed, no-shifting window will be (10,39). // 2. trimmed, no-shifting window [14,31) "four.... five six" // 3. trimmed, shifted window [4,31) "two three four.... five six" - snippet_spec_.set_max_window_bytes(28); + snippet_spec_.set_max_window_utf32_length(28); SnippetProto snippet = snippet_retriever_->RetrieveSnippet( query_terms, MATCH_EXACT, snippet_spec_, document, section_mask); @@ -623,7 +625,7 @@ TEST_F(SnippetRetrieverTest, SnippetingWindowMatchAtTextStartShortText) { // 1. untrimmed, no-shifting window will be (-10,19). // 2. trimmed, no-shifting window [0, 19) "one two three four." // 3. trimmed, shifted window [0, 22) "one two three four...." - snippet_spec_.set_max_window_bytes(28); + snippet_spec_.set_max_window_utf32_length(28); SnippetProto snippet = snippet_retriever_->RetrieveSnippet( query_terms, MATCH_EXACT, snippet_spec_, document, section_mask); @@ -657,7 +659,7 @@ TEST_F(SnippetRetrieverTest, SnippetingWindowMatchAtTextEndShortText) { // 1. untrimmed, no-shifting window will be (1,30). // 2. trimmed, no-shifting window [4, 22) "two three four...." // 3. trimmed, shifted window [0, 22) "one two three four...." 
- snippet_spec_.set_max_window_bytes(28); + snippet_spec_.set_max_window_utf32_length(28); SnippetProto snippet = snippet_retriever_->RetrieveSnippet( query_terms, MATCH_EXACT, snippet_spec_, document, section_mask); @@ -721,7 +723,7 @@ TEST_F(SnippetRetrieverTest, SimpleSnippetingNoWindowing) { .AddStringProperty("body", "Only a fool would match this content.") .Build(); - snippet_spec_.set_max_window_bytes(0); + snippet_spec_.set_max_window_utf32_length(0); SectionIdMask section_mask = 0b00000011; SectionRestrictQueryTermsMap query_terms{{"", {"foo"}}}; @@ -1473,7 +1475,7 @@ TEST_F(SnippetRetrieverTest, CJKSnippetWindowTest) { // 1. untrimmed, no-shifting window will be (0,7). // 2. trimmed, no-shifting window [1, 6) "每天走路去". // 3. trimmed, shifted window [0, 6) "我每天走路去" - snippet_spec_.set_max_window_bytes(6); + snippet_spec_.set_max_window_utf32_length(6); SnippetProto snippet = snippet_retriever_->RetrieveSnippet( query_terms, MATCH_PREFIX, snippet_spec_, document, section_mask); @@ -1572,7 +1574,7 @@ TEST_F(SnippetRetrieverTest, Utf16MultiCodeUnitWindowTest) { // UTF8 idx: 9 22 // UTF16 idx: 5 12 // UTF32 idx: 3 7 - snippet_spec_.set_max_window_bytes(6); + snippet_spec_.set_max_window_utf32_length(6); SnippetProto snippet = snippet_retriever_->RetrieveSnippet( query_terms, MATCH_PREFIX, snippet_spec_, document, section_mask); @@ -1596,6 +1598,117 @@ TEST_F(SnippetRetrieverTest, Utf16MultiCodeUnitWindowTest) { EXPECT_THAT(match_proto.window_utf16_length(), Eq(7)); } +TEST_F(SnippetRetrieverTest, SnippettingVerbatimAscii) { + SchemaProto schema = + SchemaBuilder() + .AddType(SchemaTypeConfigBuilder() + .SetType("verbatimType") + .AddProperty(PropertyConfigBuilder() + .SetName("verbatim") + .SetDataTypeString(MATCH_EXACT, + TOKENIZER_VERBATIM) + .SetCardinality(CARDINALITY_REPEATED))) + .Build(); + ICING_ASSERT_OK(schema_store_->SetSchema( + schema, /*ignore_errors_and_delete_documents=*/true)); + ICING_ASSERT_OK_AND_ASSIGN( + snippet_retriever_, + 
SnippetRetriever::Create(schema_store_.get(), language_segmenter_.get(), + normalizer_.get())); + + DocumentProto document = DocumentBuilder() + .SetKey("icing", "verbatim/1") + .SetSchema("verbatimType") + .AddStringProperty("verbatim", "Hello, world!") + .Build(); + + SectionIdMask section_mask = 0b00000001; + SectionRestrictQueryTermsMap query_terms{{"", {"Hello, world!"}}}; + + snippet_spec_.set_max_window_utf32_length(13); + SnippetProto snippet = snippet_retriever_->RetrieveSnippet( + query_terms, MATCH_EXACT, snippet_spec_, document, section_mask); + + // There should only be one snippet entry and match, the verbatim token in its + // entirety. + ASSERT_THAT(snippet.entries(), SizeIs(1)); + + const SnippetProto::EntryProto* entry = &snippet.entries(0); + ASSERT_THAT(entry->snippet_matches(), SizeIs(1)); + ASSERT_THAT(entry->property_name(), "verbatim"); + + const SnippetMatchProto& match_proto = entry->snippet_matches(0); + // We expect the match to begin at position 0, and to span the entire token + // which contains 13 characters. + EXPECT_THAT(match_proto.window_byte_position(), Eq(0)); + EXPECT_THAT(match_proto.window_utf16_length(), Eq(13)); + + // We expect the submatch to begin at position 0 of the verbatim token and + // span the length of our query term "Hello, world!", which has utf-16 length + // of 13. The submatch length is equal to the window length as the query the + // snippet is retrieved with an exact term match. 
+ EXPECT_THAT(match_proto.exact_match_utf16_position(), Eq(0)); + EXPECT_THAT(match_proto.submatch_utf16_length(), Eq(13)); +} + +TEST_F(SnippetRetrieverTest, SnippettingVerbatimCJK) { + SchemaProto schema = + SchemaBuilder() + .AddType(SchemaTypeConfigBuilder() + .SetType("verbatimType") + .AddProperty(PropertyConfigBuilder() + .SetName("verbatim") + .SetDataTypeString(MATCH_PREFIX, + TOKENIZER_VERBATIM) + .SetCardinality(CARDINALITY_REPEATED))) + .Build(); + ICING_ASSERT_OK(schema_store_->SetSchema( + schema, /*ignore_errors_and_delete_documents=*/true)); + ICING_ASSERT_OK_AND_ASSIGN( + snippet_retriever_, + SnippetRetriever::Create(schema_store_.get(), language_segmenter_.get(), + normalizer_.get())); + + // String: "我每天走路去上班。" + // ^ ^ ^ ^^ + // UTF8 idx: 0 3 9 15 18 + // UTF16 idx: 0 1 3 5 6 + // UTF32 idx: 0 1 3 5 6 + // Breaks into segments: "我", "每天", "走路", "去", "上班" + std::string chinese_string = "我每天走路去上班。"; + DocumentProto document = DocumentBuilder() + .SetKey("icing", "verbatim/1") + .SetSchema("verbatimType") + .AddStringProperty("verbatim", chinese_string) + .Build(); + + SectionIdMask section_mask = 0b00000001; + SectionRestrictQueryTermsMap query_terms{{"", {"我每"}}}; + + snippet_spec_.set_max_window_utf32_length(9); + SnippetProto snippet = snippet_retriever_->RetrieveSnippet( + query_terms, MATCH_PREFIX, snippet_spec_, document, section_mask); + + // There should only be one snippet entry and match, the verbatim token in its + // entirety. + ASSERT_THAT(snippet.entries(), SizeIs(1)); + + const SnippetProto::EntryProto* entry = &snippet.entries(0); + ASSERT_THAT(entry->snippet_matches(), SizeIs(1)); + ASSERT_THAT(entry->property_name(), "verbatim"); + + const SnippetMatchProto& match_proto = entry->snippet_matches(0); + // We expect the match to begin at position 0, and to span the entire token + // which has utf-16 length of 9. 
+ EXPECT_THAT(match_proto.window_byte_position(), Eq(0)); + EXPECT_THAT(match_proto.window_utf16_length(), Eq(9)); + + // We expect the submatch to begin at position 0 of the verbatim token and + // span the length of our query term "我每", which has utf-16 length of 2. + EXPECT_THAT(match_proto.exact_match_utf16_position(), Eq(0)); + EXPECT_THAT(match_proto.submatch_utf16_length(), Eq(2)); +} + } // namespace } // namespace lib diff --git a/icing/schema/schema-store.cc b/icing/schema/schema-store.cc index 67528ab..acc5030 100644 --- a/icing/schema/schema-store.cc +++ b/icing/schema/schema-store.cc @@ -268,7 +268,7 @@ libtextclassifier3::Status SchemaStore::UpdateHeader(const Crc32& checksum) { libtextclassifier3::Status SchemaStore::ResetSchemaTypeMapper() { // TODO(b/139734457): Replace ptr.reset()->Delete->Create flow with Reset(). schema_type_mapper_.reset(); - // TODO(b/144458732): Implement a more robust version of TC_RETURN_IF_ERROR + // TODO(b/216487496): Implement a more robust version of TC_RETURN_IF_ERROR // that can support error logging. 
libtextclassifier3::Status status = KeyMapper<SchemaTypeId>::Delete( filesystem_, MakeSchemaTypeMapperFilename(base_dir_)); @@ -464,11 +464,8 @@ libtextclassifier3::Status SchemaStore::PersistToDisk() { SchemaStoreStorageInfoProto SchemaStore::GetStorageInfo() const { SchemaStoreStorageInfoProto storage_info; int64_t directory_size = filesystem_.GetDiskUsage(base_dir_.c_str()); - if (directory_size != Filesystem::kBadFileSize) { - storage_info.set_schema_store_size(directory_size); - } else { - storage_info.set_schema_store_size(-1); - } + storage_info.set_schema_store_size( + Filesystem::SanitizeFileSize(directory_size)); ICING_ASSIGN_OR_RETURN(const SchemaProto* schema, GetSchema(), storage_info); storage_info.set_num_schema_types(schema->types_size()); int total_sections = 0; @@ -496,5 +493,17 @@ SchemaStore::GetSectionMetadata(const std::string& schema_type) const { return section_manager_->GetMetadataList(schema_type); } +libtextclassifier3::StatusOr<SchemaDebugInfoProto> SchemaStore::GetDebugInfo() + const { + SchemaDebugInfoProto debug_info; + if (has_schema_successfully_set_) { + ICING_ASSIGN_OR_RETURN(const SchemaProto* schema, GetSchema()); + *debug_info.mutable_schema() = *schema; + } + ICING_ASSIGN_OR_RETURN(Crc32 crc, ComputeChecksum()); + debug_info.set_crc(crc.Get()); + return debug_info; +} + } // namespace lib } // namespace icing diff --git a/icing/schema/schema-store.h b/icing/schema/schema-store.h index 6b6528d..2d3aca7 100644 --- a/icing/schema/schema-store.h +++ b/icing/schema/schema-store.h @@ -26,6 +26,7 @@ #include "icing/text_classifier/lib3/utils/base/statusor.h" #include "icing/file/file-backed-proto.h" #include "icing/file/filesystem.h" +#include "icing/proto/debug.pb.h" #include "icing/proto/document.pb.h" #include "icing/proto/logging.pb.h" #include "icing/proto/schema.pb.h" @@ -137,9 +138,7 @@ class SchemaStore { // Persists and updates checksum of subcomponents. ~SchemaStore(); - // Retrieve the current schema if it exists. 
Caller does not get ownership of - // the schema proto and modifying the returned pointer does not affect the - // underlying schema proto. + // Retrieve the current schema if it exists. // // Returns: // SchemaProto* if exists @@ -258,6 +257,13 @@ class SchemaStore { // that field will be set to -1. SchemaStoreStorageInfoProto GetStorageInfo() const; + // Get debug information for the schema store. + // + // Returns: + // SchemaDebugInfoProto on success + // INTERNAL_ERROR on IO errors, crc compute error + libtextclassifier3::StatusOr<SchemaDebugInfoProto> GetDebugInfo() const; + private: // Use SchemaStore::Create instead. explicit SchemaStore(const Filesystem* filesystem, std::string base_dir, diff --git a/icing/schema/schema-store_test.cc b/icing/schema/schema-store_test.cc index be7170f..113084e 100644 --- a/icing/schema/schema-store_test.cc +++ b/icing/schema/schema-store_test.cc @@ -44,23 +44,24 @@ using ::icing::lib::portable_equals_proto::EqualsProto; using ::testing::ElementsAre; using ::testing::Eq; using ::testing::Ge; +using ::testing::Gt; using ::testing::Not; using ::testing::Pointee; -constexpr PropertyConfigProto_Cardinality_Code CARDINALITY_OPTIONAL = - PropertyConfigProto_Cardinality_Code_OPTIONAL; -constexpr PropertyConfigProto_Cardinality_Code CARDINALITY_REPEATED = - PropertyConfigProto_Cardinality_Code_REPEATED; +constexpr PropertyConfigProto::Cardinality::Code CARDINALITY_OPTIONAL = + PropertyConfigProto::Cardinality::OPTIONAL; +constexpr PropertyConfigProto::Cardinality::Code CARDINALITY_REPEATED = + PropertyConfigProto::Cardinality::REPEATED; -constexpr StringIndexingConfig_TokenizerType_Code TOKENIZER_PLAIN = - StringIndexingConfig_TokenizerType_Code_PLAIN; +constexpr StringIndexingConfig::TokenizerType::Code TOKENIZER_PLAIN = + StringIndexingConfig::TokenizerType::PLAIN; -constexpr TermMatchType_Code MATCH_EXACT = TermMatchType_Code_EXACT_ONLY; +constexpr TermMatchType::Code MATCH_EXACT = TermMatchType::EXACT_ONLY; -constexpr 
PropertyConfigProto_DataType_Code TYPE_STRING = - PropertyConfigProto_DataType_Code_STRING; -constexpr PropertyConfigProto_DataType_Code TYPE_DOUBLE = - PropertyConfigProto_DataType_Code_DOUBLE; +constexpr PropertyConfigProto::DataType::Code TYPE_STRING = + PropertyConfigProto::DataType::STRING; +constexpr PropertyConfigProto::DataType::Code TYPE_DOUBLE = + PropertyConfigProto::DataType::DOUBLE; class SchemaStoreTest : public ::testing::Test { protected: @@ -868,6 +869,38 @@ TEST_F(SchemaStoreTest, SchemaStoreStorageInfoProto) { EXPECT_THAT(storage_info.num_schema_types_sections_exhausted(), Eq(1)); } +TEST_F(SchemaStoreTest, GetDebugInfo) { + ICING_ASSERT_OK_AND_ASSIGN( + std::unique_ptr<SchemaStore> schema_store, + SchemaStore::Create(&filesystem_, test_dir_, &fake_clock_)); + + // Set schema + ASSERT_THAT( + schema_store->SetSchema(schema_), + IsOkAndHolds(EqualsSetSchemaResult(SchemaStore::SetSchemaResult{ + .success = true, + .schema_types_new_by_name = {schema_.types(0).schema_type()}}))); + + // Check debug info + ICING_ASSERT_OK_AND_ASSIGN(SchemaDebugInfoProto out, + schema_store->GetDebugInfo()); + EXPECT_THAT(out.schema(), EqualsProto(schema_)); + EXPECT_THAT(out.crc(), Gt(0)); +} + +TEST_F(SchemaStoreTest, GetDebugInfoForEmptySchemaStore) { + ICING_ASSERT_OK_AND_ASSIGN( + std::unique_ptr<SchemaStore> schema_store, + SchemaStore::Create(&filesystem_, test_dir_, &fake_clock_)); + + // Check debug info before setting a schema + ICING_ASSERT_OK_AND_ASSIGN(SchemaDebugInfoProto out, + schema_store->GetDebugInfo()); + SchemaDebugInfoProto expected_out; + expected_out.set_crc(0); + EXPECT_THAT(out, EqualsProto(expected_out)); +} + } // namespace } // namespace lib diff --git a/icing/schema/schema-util_test.cc b/icing/schema/schema-util_test.cc index 26ef4c7..f28a2f8 100644 --- a/icing/schema/schema-util_test.cc +++ b/icing/schema/schema-util_test.cc @@ -38,32 +38,32 @@ constexpr char kEmailType[] = "EmailMessage"; constexpr char kMessageType[] = "Text"; 
constexpr char kPersonType[] = "Person"; -constexpr PropertyConfigProto_DataType_Code TYPE_DOCUMENT = - PropertyConfigProto_DataType_Code_DOCUMENT; -constexpr PropertyConfigProto_DataType_Code TYPE_STRING = - PropertyConfigProto_DataType_Code_STRING; -constexpr PropertyConfigProto_DataType_Code TYPE_INT = - PropertyConfigProto_DataType_Code_INT64; -constexpr PropertyConfigProto_DataType_Code TYPE_DOUBLE = - PropertyConfigProto_DataType_Code_DOUBLE; - -constexpr PropertyConfigProto_Cardinality_Code CARDINALITY_UNKNOWN = - PropertyConfigProto_Cardinality_Code_UNKNOWN; -constexpr PropertyConfigProto_Cardinality_Code CARDINALITY_REQUIRED = - PropertyConfigProto_Cardinality_Code_REQUIRED; -constexpr PropertyConfigProto_Cardinality_Code CARDINALITY_OPTIONAL = - PropertyConfigProto_Cardinality_Code_OPTIONAL; -constexpr PropertyConfigProto_Cardinality_Code CARDINALITY_REPEATED = - PropertyConfigProto_Cardinality_Code_REPEATED; - -constexpr StringIndexingConfig_TokenizerType_Code TOKENIZER_NONE = - StringIndexingConfig_TokenizerType_Code_NONE; -constexpr StringIndexingConfig_TokenizerType_Code TOKENIZER_PLAIN = - StringIndexingConfig_TokenizerType_Code_PLAIN; - -constexpr TermMatchType_Code MATCH_UNKNOWN = TermMatchType_Code_UNKNOWN; -constexpr TermMatchType_Code MATCH_EXACT = TermMatchType_Code_EXACT_ONLY; -constexpr TermMatchType_Code MATCH_PREFIX = TermMatchType_Code_PREFIX; +constexpr PropertyConfigProto::DataType::Code TYPE_DOCUMENT = + PropertyConfigProto::DataType::DOCUMENT; +constexpr PropertyConfigProto::DataType::Code TYPE_STRING = + PropertyConfigProto::DataType::STRING; +constexpr PropertyConfigProto::DataType::Code TYPE_INT = + PropertyConfigProto::DataType::INT64; +constexpr PropertyConfigProto::DataType::Code TYPE_DOUBLE = + PropertyConfigProto::DataType::DOUBLE; + +constexpr PropertyConfigProto::Cardinality::Code CARDINALITY_UNKNOWN = + PropertyConfigProto::Cardinality::UNKNOWN; +constexpr PropertyConfigProto::Cardinality::Code CARDINALITY_REQUIRED = + 
PropertyConfigProto::Cardinality::REQUIRED; +constexpr PropertyConfigProto::Cardinality::Code CARDINALITY_OPTIONAL = + PropertyConfigProto::Cardinality::OPTIONAL; +constexpr PropertyConfigProto::Cardinality::Code CARDINALITY_REPEATED = + PropertyConfigProto::Cardinality::REPEATED; + +constexpr StringIndexingConfig::TokenizerType::Code TOKENIZER_NONE = + StringIndexingConfig::TokenizerType::NONE; +constexpr StringIndexingConfig::TokenizerType::Code TOKENIZER_PLAIN = + StringIndexingConfig::TokenizerType::PLAIN; + +constexpr TermMatchType::Code MATCH_UNKNOWN = TermMatchType::UNKNOWN; +constexpr TermMatchType::Code MATCH_EXACT = TermMatchType::EXACT_ONLY; +constexpr TermMatchType::Code MATCH_PREFIX = TermMatchType::PREFIX; TEST(SchemaUtilTest, DependencyGraphAlphabeticalOrder) { // Create a schema with the following dependencies: diff --git a/icing/scoring/scorer_test.cc b/icing/scoring/scorer_test.cc index f22a31a..fef612d 100644 --- a/icing/scoring/scorer_test.cc +++ b/icing/scoring/scorer_test.cc @@ -40,11 +40,11 @@ namespace lib { namespace { using ::testing::Eq; -constexpr PropertyConfigProto_DataType_Code TYPE_STRING = - PropertyConfigProto_DataType_Code_STRING; +constexpr PropertyConfigProto::DataType::Code TYPE_STRING = + PropertyConfigProto::DataType::STRING; -constexpr PropertyConfigProto_Cardinality_Code CARDINALITY_REQUIRED = - PropertyConfigProto_Cardinality_Code_REQUIRED; +constexpr PropertyConfigProto::Cardinality::Code CARDINALITY_REQUIRED = + PropertyConfigProto::Cardinality::REQUIRED; class ScorerTest : public testing::Test { protected: diff --git a/icing/scoring/scoring-processor_test.cc b/icing/scoring/scoring-processor_test.cc index 7e5cb0f..f169039 100644 --- a/icing/scoring/scoring-processor_test.cc +++ b/icing/scoring/scoring-processor_test.cc @@ -34,14 +34,16 @@ namespace lib { namespace { using ::testing::ElementsAre; +using ::testing::Eq; +using ::testing::Gt; using ::testing::IsEmpty; using ::testing::SizeIs; -constexpr 
PropertyConfigProto_DataType_Code TYPE_STRING = - PropertyConfigProto_DataType_Code_STRING; +constexpr PropertyConfigProto::DataType::Code TYPE_STRING = + PropertyConfigProto::DataType::STRING; -constexpr PropertyConfigProto_Cardinality_Code CARDINALITY_OPTIONAL = - PropertyConfigProto_Cardinality_Code_OPTIONAL; +constexpr PropertyConfigProto::Cardinality::Code CARDINALITY_OPTIONAL = + PropertyConfigProto::Cardinality::OPTIONAL; class ScoringProcessorTest : public testing::Test { protected: @@ -789,6 +791,77 @@ TEST_F(ScoringProcessorTest, ElementsAre(EqualsScoredDocumentHit(expected_scored_doc_hit))); } +TEST_F(ScoringProcessorTest, + ShouldScoreByRelevanceScore_WithZeroPropertyWeight) { + DocumentProto document1 = + CreateDocument("icing", "email/1", kDefaultScore, + /*creation_timestamp_ms=*/kDefaultCreationTimestampMs); + DocumentProto document2 = + CreateDocument("icing", "email/2", kDefaultScore, + /*creation_timestamp_ms=*/kDefaultCreationTimestampMs); + + ICING_ASSERT_OK_AND_ASSIGN( + DocumentId document_id1, + document_store()->Put(document1, /*num_tokens=*/1)); + ICING_ASSERT_OK_AND_ASSIGN( + DocumentId document_id2, + document_store()->Put(document2, /*num_tokens=*/1)); + + // Document 1 contains the term "foo" 1 time in the "body" property + SectionId body_section_id = 0; + DocHitInfo doc_hit_info1(document_id1); + doc_hit_info1.UpdateSection(body_section_id, /*hit_term_frequency=*/1); + + // Document 2 contains the term "foo" 1 time in the "subject" property + SectionId subject_section_id = 1; + DocHitInfo doc_hit_info2(document_id2); + doc_hit_info2.UpdateSection(subject_section_id, /*hit_term_frequency=*/1); + + // Creates input doc_hit_infos and expected output scored_document_hits + std::vector<DocHitInfo> doc_hit_infos = {doc_hit_info1, doc_hit_info2}; + + // Creates a dummy DocHitInfoIterator with 2 results for the query "foo" + std::unique_ptr<DocHitInfoIterator> doc_hit_info_iterator = + std::make_unique<DocHitInfoIteratorDummy>(doc_hit_infos, 
"foo"); + + ScoringSpecProto spec_proto; + spec_proto.set_rank_by(ScoringSpecProto::RankingStrategy::RELEVANCE_SCORE); + + // Sets property weight for "body" to 0.0. + PropertyWeight body_property_weight = + CreatePropertyWeight(/*path=*/"body", /*weight=*/0.0); + // Sets property weight for "subject" to 1.0. + PropertyWeight subject_property_weight = + CreatePropertyWeight(/*path=*/"subject", /*weight=*/1.0); + *spec_proto.add_type_property_weights() = CreateTypePropertyWeights( + /*schema_type=*/"email", {body_property_weight, subject_property_weight}); + + // Creates a ScoringProcessor + ICING_ASSERT_OK_AND_ASSIGN( + std::unique_ptr<ScoringProcessor> scoring_processor, + ScoringProcessor::Create(spec_proto, document_store(), schema_store())); + + std::unordered_map<std::string, std::unique_ptr<DocHitInfoIterator>> + query_term_iterators; + query_term_iterators["foo"] = + std::make_unique<DocHitInfoIteratorDummy>(doc_hit_infos, "foo"); + + std::vector<ScoredDocumentHit> scored_document_hits = + scoring_processor->Score(std::move(doc_hit_info_iterator), + /*num_to_score=*/2, &query_term_iterators); + + // We expect document1 to have a score of 0.0 as the query term "foo" matches + // in the "body" property which has a weight of 0.0. This is a result of the + // weighted term frequency being scaled down to 0.0 for the hit. We expect + // document2 to have a positive score as the query term "foo" matches in the + // "subject" property which has a weight of 1.0. 
+ EXPECT_THAT(scored_document_hits, SizeIs(2)); + EXPECT_THAT(scored_document_hits.at(0).document_id(), Eq(document_id1)); + EXPECT_THAT(scored_document_hits.at(0).score(), Eq(0.0)); + EXPECT_THAT(scored_document_hits.at(1).document_id(), Eq(document_id2)); + EXPECT_THAT(scored_document_hits.at(1).score(), Gt(0.0)); +} + TEST_F(ScoringProcessorTest, ShouldScoreByCreationTimestamp) { DocumentProto document1 = CreateDocument("icing", "email/1", kDefaultScore, diff --git a/icing/scoring/section-weights.cc b/icing/scoring/section-weights.cc index c4afe7f..ed7cd5e 100644 --- a/icing/scoring/section-weights.cc +++ b/icing/scoring/section-weights.cc @@ -27,10 +27,14 @@ namespace lib { namespace { -// Normalizes all weights in the map to be in range (0.0, 1.0], where the max -// weight is normalized to 1.0. +// Normalizes all weights in the map to be in range [0.0, 1.0], where the max +// weight is normalized to 1.0. In the case that all weights are equal to 0.0, +// the normalized weight for each will be 0.0. inline void NormalizeSectionWeights( double max_weight, std::unordered_map<SectionId, double>& section_weights) { + if (max_weight == 0.0) { + return; + } for (auto& raw_weight : section_weights) { raw_weight.second = raw_weight.second / max_weight; } @@ -70,11 +74,11 @@ SectionWeights::Create(const SchemaStore* schema_store, type_property_weights.property_weights()) { double property_path_weight = property_weight.weight(); - // Return error on negative and zero weights. - if (property_path_weight <= 0.0) { + // Return error on negative weights. + if (property_path_weight < 0.0) { return absl_ports::InvalidArgumentError(IcingStringUtil::StringPrintf( - "Property weight for property path \"%s\" is negative or zero. " - "Negative and zero weights are invalid.", + "Property weight for property path \"%s\" is negative. 
Negative " + "weights are invalid.", property_weight.path().c_str())); } property_paths_weights.insert( @@ -116,7 +120,7 @@ inline SectionWeights::NormalizedSectionWeights SectionWeights::ExtractNormalizedSectionWeights( const std::unordered_map<std::string, double>& raw_weights, const std::vector<SectionMetadata>& metadata_list) { - double max_weight = 0.0; + double max_weight = -std::numeric_limits<double>::infinity(); std::unordered_map<SectionId, double> section_weights; for (const SectionMetadata& section_metadata : metadata_list) { std::string_view metadata_path = section_metadata.path; @@ -132,10 +136,11 @@ SectionWeights::ExtractNormalizedSectionWeights( NormalizeSectionWeights(max_weight, section_weights); // Set normalized default weight to 1.0 in case there is no section - // metadata and max_weight is 0.0 (we should not see this case). - double normalized_default_weight = max_weight == 0.0 - ? kDefaultSectionWeight - : kDefaultSectionWeight / max_weight; + // metadata and max_weight is -INF (we should not see this case). + double normalized_default_weight = + max_weight == -std::numeric_limits<double>::infinity() + ? 
kDefaultSectionWeight + : kDefaultSectionWeight / max_weight; SectionWeights::NormalizedSectionWeights normalized_section_weights = SectionWeights::NormalizedSectionWeights(); normalized_section_weights.section_weights = std::move(section_weights); diff --git a/icing/scoring/section-weights_test.cc b/icing/scoring/section-weights_test.cc index b90c3d5..330faee 100644 --- a/icing/scoring/section-weights_test.cc +++ b/icing/scoring/section-weights_test.cc @@ -48,13 +48,13 @@ class SectionWeightsTest : public testing::Test { SchemaTypeConfigProto sender_schema = SchemaTypeConfigBuilder() .SetType("sender") - .AddProperty(PropertyConfigBuilder() - .SetName("name") - .SetDataTypeString( - TermMatchType::PREFIX, - StringIndexingConfig::TokenizerType::PLAIN) - .SetCardinality( - PropertyConfigProto_Cardinality_Code_OPTIONAL)) + .AddProperty( + PropertyConfigBuilder() + .SetName("name") + .SetDataTypeString( + TermMatchType::PREFIX, + StringIndexingConfig::TokenizerType::PLAIN) + .SetCardinality(PropertyConfigProto::Cardinality::OPTIONAL)) .Build(); SchemaTypeConfigProto email_schema = SchemaTypeConfigBuilder() @@ -65,24 +65,22 @@ class SectionWeightsTest : public testing::Test { .SetDataTypeString( TermMatchType::PREFIX, StringIndexingConfig::TokenizerType::PLAIN) - .SetDataType(PropertyConfigProto_DataType_Code_STRING) - .SetCardinality( - PropertyConfigProto_Cardinality_Code_OPTIONAL)) + .SetDataType(PropertyConfigProto::DataType::STRING) + .SetCardinality(PropertyConfigProto::Cardinality::OPTIONAL)) .AddProperty( PropertyConfigBuilder() .SetName("body") .SetDataTypeString( TermMatchType::PREFIX, StringIndexingConfig::TokenizerType::PLAIN) - .SetDataType(PropertyConfigProto_DataType_Code_STRING) - .SetCardinality( - PropertyConfigProto_Cardinality_Code_OPTIONAL)) - .AddProperty(PropertyConfigBuilder() - .SetName("sender") - .SetDataTypeDocument( - "sender", /*index_nested_properties=*/true) - .SetCardinality( - PropertyConfigProto_Cardinality_Code_OPTIONAL)) + 
.SetDataType(PropertyConfigProto::DataType::STRING) + .SetCardinality(PropertyConfigProto::Cardinality::OPTIONAL)) + .AddProperty( + PropertyConfigBuilder() + .SetName("sender") + .SetDataTypeDocument("sender", + /*index_nested_properties=*/true) + .SetCardinality(PropertyConfigProto::Cardinality::OPTIONAL)) .Build(); SchemaProto schema = SchemaBuilder().AddType(sender_schema).AddType(email_schema).Build(); @@ -171,20 +169,79 @@ TEST_F(SectionWeightsTest, ShouldFailWithNegativeWeights) { StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT)); } -TEST_F(SectionWeightsTest, ShouldFailWithZeroWeight) { +TEST_F(SectionWeightsTest, ShouldAcceptZeroWeight) { ScoringSpecProto spec_proto; TypePropertyWeights *type_property_weights = spec_proto.add_type_property_weights(); - type_property_weights->set_schema_type("sender"); + type_property_weights->set_schema_type("email"); - PropertyWeight *property_weight = + PropertyWeight *body_property_weight = type_property_weights->add_property_weights(); - property_weight->set_weight(0.0); - property_weight->set_path("name"); + body_property_weight->set_weight(2.0); + body_property_weight->set_path("body"); - EXPECT_THAT(SectionWeights::Create(schema_store(), spec_proto).status(), - StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT)); + PropertyWeight *subject_property_weight = + type_property_weights->add_property_weights(); + subject_property_weight->set_weight(0.0); + subject_property_weight->set_path("subject"); + + ICING_ASSERT_OK_AND_ASSIGN( + std::unique_ptr<SectionWeights> section_weights, + SectionWeights::Create(schema_store(), spec_proto)); + ICING_ASSERT_OK_AND_ASSIGN(SchemaTypeId email_schema_type_id, + schema_store()->GetSchemaTypeId("email")); + + // Normalized weight for "body" property. + EXPECT_THAT(section_weights->GetNormalizedSectionWeight(email_schema_type_id, + /*section_id=*/0), + Eq(1.0)); + // Normalized weight for "subject" property. 
+ EXPECT_THAT(section_weights->GetNormalizedSectionWeight(email_schema_type_id, + /*section_id=*/2), + Eq(0.0)); +} + +TEST_F(SectionWeightsTest, ShouldNormalizeToZeroWhenAllWeightsZero) { + ScoringSpecProto spec_proto; + + TypePropertyWeights *type_property_weights = + spec_proto.add_type_property_weights(); + type_property_weights->set_schema_type("email"); + + PropertyWeight *body_property_weight = + type_property_weights->add_property_weights(); + body_property_weight->set_weight(0.0); + body_property_weight->set_path("body"); + + PropertyWeight *sender_property_weight = + type_property_weights->add_property_weights(); + sender_property_weight->set_weight(0.0); + sender_property_weight->set_path("sender.name"); + + PropertyWeight *subject_property_weight = + type_property_weights->add_property_weights(); + subject_property_weight->set_weight(0.0); + subject_property_weight->set_path("subject"); + + ICING_ASSERT_OK_AND_ASSIGN( + std::unique_ptr<SectionWeights> section_weights, + SectionWeights::Create(schema_store(), spec_proto)); + ICING_ASSERT_OK_AND_ASSIGN(SchemaTypeId email_schema_type_id, + schema_store()->GetSchemaTypeId("email")); + + // Normalized weight for "body" property. + EXPECT_THAT(section_weights->GetNormalizedSectionWeight(email_schema_type_id, + /*section_id=*/0), + Eq(0.0)); + // Normalized weight for "sender.name" property (the nested property). + EXPECT_THAT(section_weights->GetNormalizedSectionWeight(email_schema_type_id, + /*section_id=*/1), + Eq(0.0)); + // Normalized weight for "subject" property. 
+ EXPECT_THAT(section_weights->GetNormalizedSectionWeight(email_schema_type_id, + /*section_id=*/2), + Eq(0.0)); } TEST_F(SectionWeightsTest, ShouldReturnDefaultIfTypePropertyWeightsNotSet) { diff --git a/icing/store/document-log-creator.cc b/icing/store/document-log-creator.cc index 5e0426e..5e23a8e 100644 --- a/icing/store/document-log-creator.cc +++ b/icing/store/document-log-creator.cc @@ -72,19 +72,20 @@ DocumentLogCreator::Create(const Filesystem* filesystem, bool v1_exists = filesystem->FileExists(MakeDocumentLogFilenameV1(base_dir).c_str()); - bool regen_derived_files = false; + bool new_file = false; + int preexisting_file_version = kCurrentVersion; if (v0_exists && !v1_exists) { ICING_RETURN_IF_ERROR(MigrateFromV0ToV1(filesystem, base_dir)); // Need to regenerate derived files since documents may be written to a // different file offset in the log. - regen_derived_files = true; + preexisting_file_version = 0; } else if (!v1_exists) { // First time initializing a v1 log. There are no existing derived files at // this point, so we should generate some. "regenerate" here also means // "generate for the first time", i.e. we shouldn't expect there to be any // existing derived files. - regen_derived_files = true; + new_file = true; } ICING_ASSIGN_OR_RETURN( @@ -96,7 +97,7 @@ DocumentLogCreator::Create(const Filesystem* filesystem, /*compress_in=*/true))); CreateResult create_result = {std::move(log_create_result), - regen_derived_files}; + preexisting_file_version, new_file}; return create_result; } diff --git a/icing/store/document-log-creator.h b/icing/store/document-log-creator.h index 51cf497..be8feed 100644 --- a/icing/store/document-log-creator.h +++ b/icing/store/document-log-creator.h @@ -30,14 +30,20 @@ namespace lib { // be necessary. 
class DocumentLogCreator { public: + // Version 0 refers to FileBackedProtoLog + // Version 1 refers to PortableFileBackedProtoLog with kFileFormatVersion = 0 + static constexpr int32_t kCurrentVersion = 1; struct CreateResult { // The create result passed up from the PortableFileBackedProtoLog::Create. // Contains the document log. PortableFileBackedProtoLog<DocumentWrapper>::CreateResult log_create_result; - // Whether the caller needs to also regenerate/generate any derived files - // based off of the initialized document log. - bool regen_derived_files; + // The version number of the pre-existing document log file. + // If there is no document log file, it will be set to kCurrentVersion. + int preexisting_file_version; + + // Whether the created file is new. + bool new_file; }; // Creates the document log in the base_dir. Will create one if it doesn't diff --git a/icing/store/document-store.cc b/icing/store/document-store.cc index 226a96b..8c8369c 100644 --- a/icing/store/document-store.cc +++ b/icing/store/document-store.cc @@ -164,6 +164,32 @@ int64_t CalculateExpirationTimestampMs(int64_t creation_timestamp_ms, return expiration_timestamp_ms; } +InitializeStatsProto::RecoveryCause GetRecoveryCause( + const DocumentLogCreator::CreateResult& create_result, + bool force_recovery_and_revalidate_documents) { + if (force_recovery_and_revalidate_documents) { + return InitializeStatsProto::SCHEMA_CHANGES_OUT_OF_SYNC; + } else if (create_result.log_create_result.has_data_loss()) { + return InitializeStatsProto::DATA_LOSS; + } else if (create_result.preexisting_file_version != + DocumentLogCreator::kCurrentVersion) { + return InitializeStatsProto::LEGACY_DOCUMENT_LOG_FORMAT; + } + return InitializeStatsProto::NONE; +} + +InitializeStatsProto::DocumentStoreDataStatus GetDataStatus( + DataLoss data_loss) { + switch (data_loss) { + case DataLoss::PARTIAL: + return InitializeStatsProto::PARTIAL_LOSS; + case DataLoss::COMPLETE: + return InitializeStatsProto::COMPLETE_LOSS; 
+      case DataLoss::NONE: +      return InitializeStatsProto::NO_DATA_LOSS; +  } +} + } // namespace DocumentStore::DocumentStore(const Filesystem* filesystem, @@ -236,44 +262,34 @@ libtextclassifier3::StatusOr<DataLoss> DocumentStore::Initialize( std::move(create_result_or).ValueOrDie(); document_log_ = std::move(create_result.log_create_result.proto_log); - - if (create_result.regen_derived_files || - force_recovery_and_revalidate_documents || - create_result.log_create_result.has_data_loss()) { + InitializeStatsProto::RecoveryCause recovery_cause = + GetRecoveryCause(create_result, force_recovery_and_revalidate_documents); + + if (recovery_cause != InitializeStatsProto::NONE || create_result.new_file) { + ICING_LOG(WARNING) << "Starting Document Store Recovery with cause=" + << recovery_cause << ", and create result { new_file=" + << create_result.new_file << ", preexisting_file_version=" + << create_result.preexisting_file_version << ", data_loss=" + << create_result.log_create_result.data_loss << "} and kCurrentVersion=" + << DocumentLogCreator::kCurrentVersion; // We can't rely on any existing derived files. Recreate them from scratch. // Currently happens if: // 1) This is a new log and we don't have derived files yet // 2) Client wanted us to force a regeneration. // 3) Log has some data loss, can't rely on existing derived data. - if (create_result.log_create_result.has_data_loss() && - initialize_stats != nullptr) { - ICING_LOG(WARNING) - << "Data loss in document log, regenerating derived files."; - initialize_stats->set_document_store_recovery_cause( - InitializeStatsProto::DATA_LOSS); - - if (create_result.log_create_result.data_loss == DataLoss::PARTIAL) { - // Ground truth is partially lost. - initialize_stats->set_document_store_data_status( - InitializeStatsProto::PARTIAL_LOSS); - } else { - // Ground truth is completely lost. 
- initialize_stats->set_document_store_data_status( - InitializeStatsProto::COMPLETE_LOSS); - } - } - std::unique_ptr<Timer> document_recovery_timer = clock_.GetNewTimer(); libtextclassifier3::Status status = RegenerateDerivedFiles(force_recovery_and_revalidate_documents); if (initialize_stats != nullptr && - (force_recovery_and_revalidate_documents || - create_result.log_create_result.has_data_loss())) { + recovery_cause != InitializeStatsProto::NONE) { // Only consider it a recovery if the client forced a recovery or there // was data loss. Otherwise, this could just be the first time we're // initializing and generating derived files. initialize_stats->set_document_store_recovery_latency_ms( document_recovery_timer->GetElapsedMilliseconds()); + initialize_stats->set_document_store_recovery_cause(recovery_cause); + initialize_stats->set_document_store_data_status( + GetDataStatus(create_result.log_create_result.data_loss)); } if (!status.ok()) { ICING_LOG(ERROR) @@ -282,13 +298,13 @@ libtextclassifier3::StatusOr<DataLoss> DocumentStore::Initialize( } } else { if (!InitializeExistingDerivedFiles().ok()) { - ICING_VLOG(1) + ICING_LOG(WARNING) << "Couldn't find derived files or failed to initialize them, " "regenerating derived files for DocumentStore."; std::unique_ptr<Timer> document_recovery_timer = clock_.GetNewTimer(); libtextclassifier3::Status status = RegenerateDerivedFiles( - /*force_recovery_and_revalidate_documents*/ false); - if (initialize_stats != nullptr && num_documents() > 0) { + /*force_recovery_and_revalidate_documents=*/false); + if (initialize_stats != nullptr) { initialize_stats->set_document_store_recovery_cause( InitializeStatsProto::IO_ERROR); initialize_stats->set_document_store_recovery_latency_ms( @@ -415,7 +431,19 @@ libtextclassifier3::Status DocumentStore::RegenerateDerivedFiles( // Iterates through document log auto iterator = document_log_->GetIterator(); auto iterator_status = iterator.Advance(); + 
libtextclassifier3::StatusOr<int64_t> element_size = + document_log_->GetElementsFileSize(); + libtextclassifier3::StatusOr<int64_t> disk_usage = + document_log_->GetDiskUsage(); + if (element_size.ok() && disk_usage.ok()) { + ICING_VLOG(1) << "Starting recovery of document store. Document store " + "elements file size:" + << element_size.ValueOrDie() + << ", disk usage=" << disk_usage.ValueOrDie(); + } while (iterator_status.ok()) { + ICING_VLOG(2) << "Attempting to read document at offset=" + << iterator.GetOffset(); libtextclassifier3::StatusOr<DocumentWrapper> document_wrapper_or = document_log_->ReadProto(iterator.GetOffset()); @@ -530,7 +558,7 @@ libtextclassifier3::Status DocumentStore::RegenerateDerivedFiles( libtextclassifier3::Status DocumentStore::ResetDocumentKeyMapper() { // TODO(b/139734457): Replace ptr.reset()->Delete->Create flow with Reset(). document_key_mapper_.reset(); - // TODO(b/144458732): Implement a more robust version of TC_RETURN_IF_ERROR + // TODO(b/216487496): Implement a more robust version of TC_RETURN_IF_ERROR // that can support error logging. libtextclassifier3::Status status = KeyMapper<DocumentId>::Delete(*filesystem_, base_dir_); @@ -540,7 +568,7 @@ libtextclassifier3::Status DocumentStore::ResetDocumentKeyMapper() { return status; } - // TODO(b/144458732): Implement a more robust version of TC_ASSIGN_OR_RETURN + // TODO(b/216487496): Implement a more robust version of TC_ASSIGN_OR_RETURN // that can support error logging. auto document_key_mapper_or = KeyMapper<DocumentId>::Create(*filesystem_, base_dir_, kUriMapperMaxSize); @@ -556,7 +584,7 @@ libtextclassifier3::Status DocumentStore::ResetDocumentKeyMapper() { libtextclassifier3::Status DocumentStore::ResetDocumentIdMapper() { // TODO(b/139734457): Replace ptr.reset()->Delete->Create flow with Reset(). 
document_id_mapper_.reset(); - // TODO(b/144458732): Implement a more robust version of TC_RETURN_IF_ERROR + // TODO(b/216487496): Implement a more robust version of TC_RETURN_IF_ERROR // that can support error logging. libtextclassifier3::Status status = FileBackedVector<int64_t>::Delete( *filesystem_, MakeDocumentIdMapperFilename(base_dir_)); @@ -565,7 +593,7 @@ libtextclassifier3::Status DocumentStore::ResetDocumentIdMapper() { << "Failed to delete old document_id mapper"; return status; } - // TODO(b/144458732): Implement a more robust version of TC_ASSIGN_OR_RETURN + // TODO(b/216487496): Implement a more robust version of TC_ASSIGN_OR_RETURN // that can support error logging. auto document_id_mapper_or = FileBackedVector<int64_t>::Create( *filesystem_, MakeDocumentIdMapperFilename(base_dir_), @@ -618,7 +646,7 @@ libtextclassifier3::Status DocumentStore::ResetFilterCache() { libtextclassifier3::Status DocumentStore::ResetNamespaceMapper() { // TODO(b/139734457): Replace ptr.reset()->Delete->Create flow with Reset(). namespace_mapper_.reset(); - // TODO(b/144458732): Implement a more robust version of TC_RETURN_IF_ERROR + // TODO(b/216487496): Implement a more robust version of TC_RETURN_IF_ERROR // that can support error logging. libtextclassifier3::Status status = KeyMapper<NamespaceId>::Delete( *filesystem_, MakeNamespaceMapperFilename(base_dir_)); @@ -638,7 +666,7 @@ libtextclassifier3::Status DocumentStore::ResetNamespaceMapper() { libtextclassifier3::Status DocumentStore::ResetCorpusMapper() { // TODO(b/139734457): Replace ptr.reset()->Delete->Create flow with Reset(). corpus_mapper_.reset(); - // TODO(b/144458732): Implement a more robust version of TC_RETURN_IF_ERROR + // TODO(b/216487496): Implement a more robust version of TC_RETURN_IF_ERROR // that can support error logging. 
libtextclassifier3::Status status = KeyMapper<CorpusId>::Delete( *filesystem_, MakeCorpusMapperFilename(base_dir_)); @@ -1749,5 +1777,63 @@ libtextclassifier3::Status DocumentStore::SetUsageScores( return usage_store_->SetUsageScores(document_id, usage_scores); } +libtextclassifier3::StatusOr< + google::protobuf::RepeatedPtrField<DocumentDebugInfoProto::CorpusInfo>> +DocumentStore::CollectCorpusInfo() const { + google::protobuf::RepeatedPtrField<DocumentDebugInfoProto::CorpusInfo> + corpus_info; + libtextclassifier3::StatusOr<const SchemaProto*> schema_proto_or = + schema_store_->GetSchema(); + if (!schema_proto_or.ok()) { + return corpus_info; + } + // Maps from CorpusId to the corresponding protocol buffer in the result. + std::unordered_map<CorpusId, DocumentDebugInfoProto::CorpusInfo*> info_map; + std::unordered_map<NamespaceId, std::string> namespace_id_to_namespace = + namespace_mapper_->GetValuesToKeys(); + const SchemaProto* schema_proto = schema_proto_or.ValueOrDie(); + for (DocumentId document_id = 0; document_id < filter_cache_->num_elements(); + ++document_id) { + if (!InternalDoesDocumentExist(document_id)) { + continue; + } + ICING_ASSIGN_OR_RETURN(const DocumentFilterData* filter_data, + filter_cache_->Get(document_id)); + ICING_ASSIGN_OR_RETURN(const DocumentAssociatedScoreData* score_data, + score_cache_->Get(document_id)); + const std::string& name_space = + namespace_id_to_namespace[filter_data->namespace_id()]; + const std::string& schema = + schema_proto->types()[filter_data->schema_type_id()].schema_type(); + auto iter = info_map.find(score_data->corpus_id()); + if (iter == info_map.end()) { + DocumentDebugInfoProto::CorpusInfo* entry = corpus_info.Add(); + entry->set_namespace_(name_space); + entry->set_schema(schema); + iter = info_map.insert({score_data->corpus_id(), entry}).first; + } + iter->second->set_total_documents(iter->second->total_documents() + 1); + iter->second->set_total_token(iter->second->total_token() + + 
score_data->length_in_tokens()); + } + return corpus_info; +} + +libtextclassifier3::StatusOr<DocumentDebugInfoProto> +DocumentStore::GetDebugInfo(int verbosity) const { + DocumentDebugInfoProto debug_info; + *debug_info.mutable_document_storage_info() = GetStorageInfo(); + ICING_ASSIGN_OR_RETURN(Crc32 crc, ComputeChecksum()); + debug_info.set_crc(crc.Get()); + if (verbosity > 0) { + ICING_ASSIGN_OR_RETURN(google::protobuf::RepeatedPtrField< + DocumentDebugInfoProto::CorpusInfo> + corpus_info, + CollectCorpusInfo()); + *debug_info.mutable_corpus_info() = std::move(corpus_info); + } + return debug_info; +} + } // namespace lib } // namespace icing diff --git a/icing/store/document-store.h b/icing/store/document-store.h index c85c989..e6d2e5c 100644 --- a/icing/store/document-store.h +++ b/icing/store/document-store.h @@ -27,6 +27,7 @@ #include "icing/file/file-backed-vector.h" #include "icing/file/filesystem.h" #include "icing/file/portable-file-backed-proto-log.h" +#include "icing/proto/debug.pb.h" #include "icing/proto/document.pb.h" #include "icing/proto/document_wrapper.pb.h" #include "icing/proto/logging.pb.h" @@ -422,6 +423,17 @@ class DocumentStore { // INTERNAL_ERROR on compute error libtextclassifier3::StatusOr<Crc32> ComputeChecksum() const; + // Get debug information for the document store. + // verbosity <= 0, simplest debug information + // verbosity > 0, also return the total number of documents and tokens in each + // (namespace, schema type) pair. + // + // Returns: + // DocumentDebugInfoProto on success + // INTERNAL_ERROR on IO errors, crc compute error + libtextclassifier3::StatusOr<DocumentDebugInfoProto> GetDebugInfo( + int verbosity) const; + private: // Use DocumentStore::Create() to instantiate. DocumentStore(const Filesystem* filesystem, std::string_view base_dir, @@ -696,6 +708,13 @@ class DocumentStore { // the document_id_mapper somehow became larger than the filter cache. 
DocumentStorageInfoProto CalculateDocumentStatusCounts( DocumentStorageInfoProto storage_info) const; + + // Returns: + // - on success, a RepeatedPtrField for CorpusInfo collected. + // - OUT_OF_RANGE, this should never happen. + libtextclassifier3::StatusOr<google::protobuf::RepeatedPtrField< + DocumentDebugInfoProto::CorpusInfo>> + CollectCorpusInfo() const; }; } // namespace lib diff --git a/icing/store/document-store_benchmark.cc b/icing/store/document-store_benchmark.cc index 77da928..fc3fd9d 100644 --- a/icing/store/document-store_benchmark.cc +++ b/icing/store/document-store_benchmark.cc @@ -64,13 +64,13 @@ namespace lib { namespace { -constexpr PropertyConfigProto_Cardinality_Code CARDINALITY_OPTIONAL = - PropertyConfigProto_Cardinality_Code_OPTIONAL; +constexpr PropertyConfigProto::Cardinality::Code CARDINALITY_OPTIONAL = + PropertyConfigProto::Cardinality::OPTIONAL; -constexpr StringIndexingConfig_TokenizerType_Code TOKENIZER_PLAIN = - StringIndexingConfig_TokenizerType_Code_PLAIN; +constexpr StringIndexingConfig::TokenizerType::Code TOKENIZER_PLAIN = + StringIndexingConfig::TokenizerType::PLAIN; -constexpr TermMatchType_Code MATCH_EXACT = TermMatchType_Code_EXACT_ONLY; +constexpr TermMatchType::Code MATCH_EXACT = TermMatchType::EXACT_ONLY; class DestructibleDirectory { public: diff --git a/icing/store/document-store_test.cc b/icing/store/document-store_test.cc index a506eea..a30b4e4 100644 --- a/icing/store/document-store_test.cc +++ b/icing/store/document-store_test.cc @@ -29,7 +29,6 @@ #include "icing/file/filesystem.h" #include "icing/file/memory-mapped-file.h" #include "icing/file/mock-filesystem.h" -#include "icing/helpers/icu/icu-data-file-helper.h" #include "icing/portable/equals-proto.h" #include "icing/portable/platform.h" #include "icing/proto/document.pb.h" @@ -45,6 +44,7 @@ #include "icing/store/namespace-id.h" #include "icing/testing/common-matchers.h" #include "icing/testing/fake-clock.h" +#include "icing/testing/icu-data-file-helper.h" 
#include "icing/testing/test-data.h" #include "icing/testing/tmp-directory.h" #include "icing/tokenization/language-segmenter-factory.h" @@ -85,16 +85,16 @@ const NamespaceStorageInfoProto& GetNamespaceStorageInfo( return std::move(NamespaceStorageInfoProto()); } -constexpr PropertyConfigProto_Cardinality_Code CARDINALITY_OPTIONAL = - PropertyConfigProto_Cardinality_Code_OPTIONAL; +constexpr PropertyConfigProto::Cardinality::Code CARDINALITY_OPTIONAL = + PropertyConfigProto::Cardinality::OPTIONAL; -constexpr StringIndexingConfig_TokenizerType_Code TOKENIZER_PLAIN = - StringIndexingConfig_TokenizerType_Code_PLAIN; +constexpr StringIndexingConfig::TokenizerType::Code TOKENIZER_PLAIN = + StringIndexingConfig::TokenizerType::PLAIN; -constexpr TermMatchType_Code MATCH_EXACT = TermMatchType_Code_EXACT_ONLY; +constexpr TermMatchType::Code MATCH_EXACT = TermMatchType::EXACT_ONLY; -constexpr PropertyConfigProto_DataType_Code TYPE_INT = - PropertyConfigProto_DataType_Code_INT64; +constexpr PropertyConfigProto::DataType::Code TYPE_INT = + PropertyConfigProto::DataType::INT64; UsageReport CreateUsageReport(std::string name_space, std::string uri, int64 timestamp_ms, @@ -3170,15 +3170,6 @@ TEST_F(DocumentStoreTest, DetectCompleteDataLoss) { ASSERT_THAT(create_result.data_loss, Eq(DataLoss::COMPLETE)); } -// TODO(b/185845269) Re-enable this test by copying over a full valid set of -// document store files. Right now this test only includes the score_cache and -// the document store header. -// -// This causes a problem now because this cl changes behavior to not consider an -// InitializeExistingDerivedFiles failure to be a recovery if there is nothing -// to recover because the doocument store is empty. 
-#define DISABLE_BACKWARDS_COMPAT_TEST -#ifndef DISABLE_BACKWARDS_COMPAT_TEST TEST_F(DocumentStoreTest, LoadScoreCacheAndInitializeSuccessfully) { // The directory testdata/score_cache_without_length_in_tokens/document_store // contains only the scoring_cache and the document_store_header (holding the @@ -3194,29 +3185,26 @@ TEST_F(DocumentStoreTest, LoadScoreCacheAndInitializeSuccessfully) { // Get src files std::string document_store_without_length_in_tokens; - if (IsAndroidPlatform() || IsIosPlatform()) { + if (IsAndroidArm() || IsIosPlatform()) { document_store_without_length_in_tokens = GetTestFilePath( "icing/testdata/score_cache_without_length_in_tokens/" "document_store_android_ios_compatible"); + } else if (IsAndroidX86()) { + document_store_without_length_in_tokens = GetTestFilePath( + "icing/testdata/score_cache_without_length_in_tokens/" + "document_store_android_x86"); } else { document_store_without_length_in_tokens = GetTestFilePath( "icing/testdata/score_cache_without_length_in_tokens/" "document_store"); } - std::vector<std::string> document_store_files; Filesystem filesystem; - filesystem.ListDirectory(document_store_without_length_in_tokens.c_str(), - &document_store_files); - - ICING_LOG(INFO) << "Copying files " << document_store_without_length_in_tokens - << ' ' << document_store_files.size(); - for (size_t i = 0; i != document_store_files.size(); i++) { - std::string src = absl_ports::StrCat( - document_store_without_length_in_tokens, "/", document_store_files[i]); - std::string dst = - absl_ports::StrCat(document_store_dir_, "/", document_store_files[i]); - ASSERT_THAT(filesystem_.CopyFile(src.c_str(), dst.c_str()), true); - } + ICING_LOG(INFO) << "Copying files " + << document_store_without_length_in_tokens; + ASSERT_THAT( + filesystem.CopyDirectory(document_store_without_length_in_tokens.c_str(), + document_store_dir_.c_str(), /*recursive=*/true), + true); InitializeStatsProto initialize_stats; ICING_ASSERT_OK_AND_ASSIGN( @@ -3227,12 
+3215,11 @@ TEST_F(DocumentStoreTest, LoadScoreCacheAndInitializeSuccessfully) { &initialize_stats)); std::unique_ptr<DocumentStore> doc_store = std::move(create_result.document_store); - // The store_cache trigger regeneration because its element size is - // inconsistent: expected 20 (current new size), actual 12 (as per the v0 - // score_cache). - EXPECT_TRUE(initialize_stats.has_document_store_recovery_cause()); + // The document log is using the legacy v0 format so that a migration is + // needed, which will also trigger regeneration. + EXPECT_EQ(initialize_stats.document_store_recovery_cause(), + InitializeStatsProto::LEGACY_DOCUMENT_LOG_FORMAT); } -#endif // DISABLE_BACKWARDS_COMPAT_TEST TEST_F(DocumentStoreTest, DocumentStoreStorageInfo) { ICING_ASSERT_OK_AND_ASSIGN( @@ -3422,18 +3409,22 @@ TEST_F(DocumentStoreTest, InitializeForceRecoveryUpdatesTypeIds) { { // Create the document store the second time and force recovery + InitializeStatsProto initialize_stats; ICING_ASSERT_OK_AND_ASSIGN( DocumentStore::CreateResult create_result, - DocumentStore::Create( - &filesystem_, document_store_dir_, &fake_clock_, schema_store.get(), - /*force_recovery_and_revalidate_documents=*/true)); + DocumentStore::Create(&filesystem_, document_store_dir_, &fake_clock_, + schema_store.get(), + /*force_recovery_and_revalidate_documents=*/true, + &initialize_stats)); std::unique_ptr<DocumentStore> doc_store = std::move(create_result.document_store); // Ensure that the type id of the email document has been correctly updated. ICING_ASSERT_OK_AND_ASSIGN(DocumentFilterData filter_data, doc_store->GetDocumentFilterData(docid)); - ASSERT_THAT(filter_data.schema_type_id(), Eq(1)); + EXPECT_THAT(filter_data.schema_type_id(), Eq(1)); + EXPECT_THAT(initialize_stats.document_store_recovery_cause(), + Eq(InitializeStatsProto::SCHEMA_CHANGES_OUT_OF_SYNC)); } } @@ -3841,7 +3832,8 @@ TEST_F(DocumentStoreTest, MigrateToPortableFileBackedProtoLog) { // Check that we didn't lose anything. 
A migration also doesn't technically // count as a recovery. EXPECT_THAT(create_result.data_loss, Eq(DataLoss::NONE)); - EXPECT_FALSE(initialize_stats.has_document_store_recovery_cause()); + EXPECT_EQ(initialize_stats.document_store_recovery_cause(), + InitializeStatsProto::LEGACY_DOCUMENT_LOG_FORMAT); // Document 1 and 3 were put normally, and document 2 was deleted in our // testdata files. @@ -3864,6 +3856,164 @@ TEST_F(DocumentStoreTest, MigrateToPortableFileBackedProtoLog) { } #endif // DISABLE_BACKWARDS_COMPAT_TEST +TEST_F(DocumentStoreTest, GetDebugInfo) { + SchemaProto schema = + SchemaBuilder() + .AddType(SchemaTypeConfigBuilder() + .SetType("email") + .AddProperty( + PropertyConfigBuilder() + .SetName("subject") + .SetDataTypeString(MATCH_EXACT, TOKENIZER_PLAIN) + .SetCardinality(CARDINALITY_OPTIONAL)) + .AddProperty( + PropertyConfigBuilder() + .SetName("body") + .SetDataTypeString(MATCH_EXACT, TOKENIZER_PLAIN) + .SetCardinality(CARDINALITY_OPTIONAL))) + .AddType(SchemaTypeConfigBuilder().SetType("person").AddProperty( + PropertyConfigBuilder() + .SetName("name") + .SetDataTypeString(MATCH_EXACT, TOKENIZER_PLAIN) + .SetCardinality(CARDINALITY_OPTIONAL))) + .Build(); + std::string schema_store_dir = schema_store_dir_ + "_custom"; + filesystem_.DeleteDirectoryRecursively(schema_store_dir.c_str()); + filesystem_.CreateDirectoryRecursively(schema_store_dir.c_str()); + ICING_ASSERT_OK_AND_ASSIGN( + std::unique_ptr<SchemaStore> schema_store, + SchemaStore::Create(&filesystem_, schema_store_dir, &fake_clock_)); + + ICING_ASSERT_OK(schema_store->SetSchema(schema)); + + ICING_ASSERT_OK_AND_ASSIGN( + DocumentStore::CreateResult create_result, + DocumentStore::Create(&filesystem_, document_store_dir_, &fake_clock_, + schema_store.get())); + std::unique_ptr<DocumentStore> document_store = + std::move(create_result.document_store); + + DocumentProto document1 = DocumentBuilder() + .SetKey("namespace1", "email/1") + .SetSchema("email") + .AddStringProperty("subject", 
"aa bb cc") + .AddStringProperty("body", "dd ee") + .SetCreationTimestampMs(1) + .Build(); + ICING_ASSERT_OK(document_store->Put(document1, 5)); + + DocumentProto document2 = DocumentBuilder() + .SetKey("namespace2", "email/2") + .SetSchema("email") + .AddStringProperty("subject", "aa bb") + .AddStringProperty("body", "cc") + .SetCreationTimestampMs(1) + .Build(); + ICING_ASSERT_OK(document_store->Put(document2, 3)); + + DocumentProto document3 = DocumentBuilder() + .SetKey("namespace2", "email/3") + .SetSchema("email") + .AddStringProperty("subject", "aa") + .AddStringProperty("body", "") + .SetCreationTimestampMs(1) + .Build(); + ICING_ASSERT_OK(document_store->Put(document3, 1)); + + DocumentProto document4 = DocumentBuilder() + .SetKey("namespace1", "person/1") + .SetSchema("person") + .AddStringProperty("name", "test test") + .SetCreationTimestampMs(1) + .Build(); + ICING_ASSERT_OK(document_store->Put(document4, 2)); + + ICING_ASSERT_OK_AND_ASSIGN(DocumentDebugInfoProto out1, + document_store->GetDebugInfo(/*verbosity=*/1)); + EXPECT_THAT(out1.crc(), Gt(0)); + EXPECT_THAT(out1.document_storage_info().num_alive_documents(), Eq(4)); + EXPECT_THAT(out1.document_storage_info().num_deleted_documents(), Eq(0)); + EXPECT_THAT(out1.document_storage_info().num_expired_documents(), Eq(0)); + + DocumentDebugInfoProto::CorpusInfo info1, info2, info3; + info1.set_namespace_("namespace1"); + info1.set_schema("email"); + info1.set_total_documents(1); // document1 + info1.set_total_token(5); + + info2.set_namespace_("namespace2"); + info2.set_schema("email"); + info2.set_total_documents(2); // document2 and document3 + info2.set_total_token(4); // 3 + 1 + + info3.set_namespace_("namespace1"); + info3.set_schema("person"); + info3.set_total_documents(1); // document4 + info3.set_total_token(2); + + EXPECT_THAT(out1.corpus_info(), + UnorderedElementsAre(EqualsProto(info1), EqualsProto(info2), + EqualsProto(info3))); + + // Delete document3. 
+ ICING_ASSERT_OK(document_store->Delete("namespace2", "email/3")); + ICING_ASSERT_OK_AND_ASSIGN(DocumentDebugInfoProto out2, + document_store->GetDebugInfo(/*verbosity=*/1)); + EXPECT_THAT(out2.crc(), Gt(0)); + EXPECT_THAT(out2.crc(), Not(Eq(out1.crc()))); + EXPECT_THAT(out2.document_storage_info().num_alive_documents(), Eq(3)); + EXPECT_THAT(out2.document_storage_info().num_deleted_documents(), Eq(1)); + EXPECT_THAT(out2.document_storage_info().num_expired_documents(), Eq(0)); + info2.set_total_documents(1); // document2 + info2.set_total_token(3); + EXPECT_THAT(out2.corpus_info(), + UnorderedElementsAre(EqualsProto(info1), EqualsProto(info2), + EqualsProto(info3))); + + ICING_ASSERT_OK_AND_ASSIGN(DocumentDebugInfoProto out3, + document_store->GetDebugInfo(/*verbosity=*/0)); + EXPECT_THAT(out3.corpus_info(), IsEmpty()); +} + +TEST_F(DocumentStoreTest, GetDebugInfoWithoutSchema) { + std::string schema_store_dir = schema_store_dir_ + "_custom"; + filesystem_.DeleteDirectoryRecursively(schema_store_dir.c_str()); + filesystem_.CreateDirectoryRecursively(schema_store_dir.c_str()); + ICING_ASSERT_OK_AND_ASSIGN( + std::unique_ptr<SchemaStore> schema_store, + SchemaStore::Create(&filesystem_, schema_store_dir, &fake_clock_)); + + ICING_ASSERT_OK_AND_ASSIGN( + DocumentStore::CreateResult create_result, + DocumentStore::Create(&filesystem_, document_store_dir_, &fake_clock_, + schema_store.get())); + std::unique_ptr<DocumentStore> document_store = + std::move(create_result.document_store); + ICING_ASSERT_OK_AND_ASSIGN(DocumentDebugInfoProto out, + document_store->GetDebugInfo(/*verbosity=*/1)); + EXPECT_THAT(out.crc(), Gt(0)); + EXPECT_THAT(out.document_storage_info().num_alive_documents(), Eq(0)); + EXPECT_THAT(out.document_storage_info().num_deleted_documents(), Eq(0)); + EXPECT_THAT(out.document_storage_info().num_expired_documents(), Eq(0)); + EXPECT_THAT(out.corpus_info(), IsEmpty()); +} + +TEST_F(DocumentStoreTest, GetDebugInfoForEmptyDocumentStore) { + 
ICING_ASSERT_OK_AND_ASSIGN( + DocumentStore::CreateResult create_result, + DocumentStore::Create(&filesystem_, document_store_dir_, &fake_clock_, + schema_store_.get())); + std::unique_ptr<DocumentStore> document_store = + std::move(create_result.document_store); + ICING_ASSERT_OK_AND_ASSIGN(DocumentDebugInfoProto out, + document_store->GetDebugInfo(/*verbosity=*/1)); + EXPECT_THAT(out.crc(), Gt(0)); + EXPECT_THAT(out.document_storage_info().num_alive_documents(), Eq(0)); + EXPECT_THAT(out.document_storage_info().num_deleted_documents(), Eq(0)); + EXPECT_THAT(out.document_storage_info().num_expired_documents(), Eq(0)); + EXPECT_THAT(out.corpus_info(), IsEmpty()); +} + } // namespace } // namespace lib diff --git a/icing/store/namespace-checker-impl.h b/icing/store/namespace-checker-impl.h new file mode 100644 index 0000000..bcd0643 --- /dev/null +++ b/icing/store/namespace-checker-impl.h @@ -0,0 +1,51 @@ +// Copyright (C) 2021 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#ifndef ICING_STORE_NAMESPACE_CHECKER_IMPL_H_ +#define ICING_STORE_NAMESPACE_CHECKER_IMPL_H_ + +#include "icing/store/document-id.h" +#include "icing/store/document-store.h" +#include "icing/store/namespace-checker.h" +#include "icing/store/namespace-id.h" + +namespace icing { +namespace lib { + +class NamespaceCheckerImpl : public NamespaceChecker { + public: + explicit NamespaceCheckerImpl( + const DocumentStore* document_store, + std::unordered_set<NamespaceId> target_namespace_ids) + : document_store_(*document_store), + target_namespace_ids_(std::move(target_namespace_ids)) {} + + bool BelongsToTargetNamespaces(DocumentId document_id) const override { + if (target_namespace_ids_.empty()) { + return true; + } + auto document_filter_data_or_ = + document_store_.GetDocumentFilterData(document_id); + return document_filter_data_or_.ok() && + target_namespace_ids_.count( + document_filter_data_or_.ValueOrDie().namespace_id())> 0; + } + const DocumentStore& document_store_; + std::unordered_set<NamespaceId> target_namespace_ids_; +}; + +} // namespace lib +} // namespace icing + +#endif // ICING_STORE_NAMESPACE_CHECKER_IMPL_H_
\ No newline at end of file diff --git a/icing/store/namespace-checker.h b/icing/store/namespace-checker.h new file mode 100644 index 0000000..8812ab1 --- /dev/null +++ b/icing/store/namespace-checker.h @@ -0,0 +1,42 @@ +// Copyright (C) 2021 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef ICING_STORE_NAMESPACE_CHECKER_H_ +#define ICING_STORE_NAMESPACE_CHECKER_H_ + +#include "icing/store/document-id.h" + +namespace icing { +namespace lib { + +class NamespaceChecker { + public: + virtual ~NamespaceChecker() = default; + + // Check whether the given document id belongs to the target namespaces. 
+ // Returns: + // On success, + // - true: the given document id belongs to the target namespaces + // - false: the given document id doesn't belong to the target namespaces + // OUT_OF_RANGE if document_id is negative or exceeds previously seen + // DocumentIds + // NOT_FOUND if the document or the filter data is not found + // INTERNAL_ERROR on all other errors + virtual bool BelongsToTargetNamespaces(DocumentId document_id) const = 0; +}; + +} // namespace lib +} // namespace icing + +#endif // ICING_STORE_NAMESPACE_CHECKER_H_ diff --git a/icing/testing/always-true-namespace-checker-impl.h b/icing/testing/always-true-namespace-checker-impl.h new file mode 100644 index 0000000..f7744b6 --- /dev/null +++ b/icing/testing/always-true-namespace-checker-impl.h @@ -0,0 +1,34 @@ +// Copyright (C) 2021 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef ICING_TESTING_ALWAYS_TRUE_NAMESPACE_CHECKER_IMPL_H_ +#define ICING_TESTING_ALWAYS_TRUE_NAMESPACE_CHECKER_IMPL_H_ + +#include "icing/store/document-id.h" +#include "icing/store/namespace-checker.h" + +namespace icing { +namespace lib { + +class AlwaysTrueNamespaceCheckerImpl : public NamespaceChecker { + public: + bool BelongsToTargetNamespaces(DocumentId document_id) const override { + return true; + } +}; + +} // namespace lib +} // namespace icing + +#endif // ICING_TESTING_ALWAYS_TRUE_NAMESPACE_CHECKER_IMPL_H_
\ No newline at end of file diff --git a/icing/helpers/icu/icu-data-file-helper.cc b/icing/testing/icu-data-file-helper.cc index 6607c40..aaeb738 100644 --- a/icing/helpers/icu/icu-data-file-helper.cc +++ b/icing/testing/icu-data-file-helper.cc @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "icing/helpers/icu/icu-data-file-helper.h" +#include "icing/testing/icu-data-file-helper.h" #include <sys/mman.h> diff --git a/icing/helpers/icu/icu-data-file-helper.h b/icing/testing/icu-data-file-helper.h index 90f5bc7..d0276e7 100644 --- a/icing/helpers/icu/icu-data-file-helper.h +++ b/icing/testing/icu-data-file-helper.h @@ -12,8 +12,8 @@ // See the License for the specific language governing permissions and // limitations under the License. -#ifndef ICING_HELPERS_ICU_ICU_DATA_FILE_HELPER -#define ICING_HELPERS_ICU_ICU_DATA_FILE_HELPER +#ifndef ICING_TESTING_ICU_DATA_FILE_HELPER +#define ICING_TESTING_ICU_DATA_FILE_HELPER #include "icing/text_classifier/lib3/utils/base/status.h" @@ -40,4 +40,4 @@ libtextclassifier3::Status SetUpICUDataFile( } // namespace lib } // namespace icing -#endif // ICING_HELPERS_ICU_ICU_DATA_FILE_HELPER +#endif // ICING_TESTING_ICU_DATA_FILE_HELPER diff --git a/icing/testing/random-string.h b/icing/testing/random-string.h index 3165bf6..fd8d87b 100644 --- a/icing/testing/random-string.h +++ b/icing/testing/random-string.h @@ -15,6 +15,7 @@ #ifndef ICING_TESTING_RANDOM_STRING_H_ #define ICING_TESTING_RANDOM_STRING_H_ +#include <algorithm> #include <random> #include <string> diff --git a/icing/tokenization/icu/icu-language-segmenter.cc b/icing/tokenization/icu/icu-language-segmenter.cc index 598ede7..8e0f789 100644 --- a/icing/tokenization/icu/icu-language-segmenter.cc +++ b/icing/tokenization/icu/icu-language-segmenter.cc @@ -59,34 +59,35 @@ class IcuLanguageSegmenterIterator : public LanguageSegmenter::Iterator { ~IcuLanguageSegmenterIterator() { 
ubrk_close(break_iterator_); - utext_close(&u_text_); + utext_close(u_text_); } // Advances to the next term. Returns false if it has reached the end. bool Advance() override { - // Prerequisite check - if (term_end_index_exclusive_ == UBRK_DONE) { - return false; - } + while (true) { + // Prerequisite check + if (term_end_index_exclusive_ == UBRK_DONE) { + return false; + } - if (term_end_index_exclusive_ == 0) { - // First Advance() call - term_start_index_ = ubrk_first(break_iterator_); - } else { - term_start_index_ = term_end_index_exclusive_; - } - term_end_index_exclusive_ = ubrk_next(break_iterator_); + if (term_end_index_exclusive_ == 0) { + // First Advance() call + term_start_index_ = ubrk_first(break_iterator_); + } else { + term_start_index_ = term_end_index_exclusive_; + } + term_end_index_exclusive_ = ubrk_next(break_iterator_); - // Reached the end - if (term_end_index_exclusive_ == UBRK_DONE) { - MarkAsDone(); - return false; - } + // Reached the end + if (term_end_index_exclusive_ == UBRK_DONE) { + MarkAsDone(); + return false; + } - if (!IsValidSegment()) { - return Advance(); + if (IsValidSegment()) { + return true; + } } - return true; } // Returns the current term. 
It can be called only when Advance() returns @@ -253,7 +254,7 @@ class IcuLanguageSegmenterIterator : public LanguageSegmenter::Iterator { : break_iterator_(nullptr), text_(text), locale_(locale), - u_text_(UTEXT_INITIALIZER), + u_text_(nullptr), offset_iterator_(text), term_start_index_(0), term_end_index_exclusive_(0) {} @@ -261,10 +262,13 @@ class IcuLanguageSegmenterIterator : public LanguageSegmenter::Iterator { // Returns true on success bool Initialize() { UErrorCode status = U_ZERO_ERROR; - utext_openUTF8(&u_text_, text_.data(), text_.length(), &status); + u_text_ = utext_openUTF8(nullptr, text_.data(), text_.length(), &status); + if (u_text_ == nullptr) { + return false; + } break_iterator_ = ubrk_open(UBRK_WORD, locale_.data(), /*text=*/nullptr, /*textLength=*/0, &status); - ubrk_setUText(break_iterator_, &u_text_, &status); + ubrk_setUText(break_iterator_, u_text_, &status); return !U_FAILURE(status); } @@ -322,8 +326,8 @@ class IcuLanguageSegmenterIterator : public LanguageSegmenter::Iterator { std::string_view locale_; // A thin wrapper around the input UTF8 text, needed by break_iterator_. - // utext_close() must be called after using. - UText u_text_; + // Allocated by calling utext_openUtf8() and freed by calling utext_close(). + UText* u_text_; // Offset iterator. This iterator is not guaranteed to point to any particular // character, but is guaranteed to point to a valid UTF character sequence. 
diff --git a/icing/tokenization/icu/icu-language-segmenter_test.cc b/icing/tokenization/icu/icu-language-segmenter_test.cc index 3090087..fe0b96e 100644 --- a/icing/tokenization/icu/icu-language-segmenter_test.cc +++ b/icing/tokenization/icu/icu-language-segmenter_test.cc @@ -21,8 +21,8 @@ #include "gmock/gmock.h" #include "gtest/gtest.h" #include "icing/absl_ports/str_cat.h" -#include "icing/helpers/icu/icu-data-file-helper.h" #include "icing/testing/common-matchers.h" +#include "icing/testing/icu-data-file-helper.h" #include "icing/testing/icu-i18n-test-utils.h" #include "icing/testing/jni-test-helpers.h" #include "icing/testing/test-data.h" diff --git a/icing/tokenization/language-segmenter-iterator_test.cc b/icing/tokenization/language-segmenter-iterator_test.cc index d293581..3aff45c 100644 --- a/icing/tokenization/language-segmenter-iterator_test.cc +++ b/icing/tokenization/language-segmenter-iterator_test.cc @@ -15,9 +15,9 @@ #include "gmock/gmock.h" #include "gtest/gtest.h" #include "icing/absl_ports/str_cat.h" -#include "icing/helpers/icu/icu-data-file-helper.h" #include "icing/portable/platform.h" #include "icing/testing/common-matchers.h" +#include "icing/testing/icu-data-file-helper.h" #include "icing/testing/jni-test-helpers.h" #include "icing/testing/test-data.h" #include "icing/tokenization/language-segmenter-factory.h" diff --git a/icing/tokenization/language-segmenter_benchmark.cc b/icing/tokenization/language-segmenter_benchmark.cc index bd86169..6f7d4df 100644 --- a/icing/tokenization/language-segmenter_benchmark.cc +++ b/icing/tokenization/language-segmenter_benchmark.cc @@ -14,8 +14,8 @@ #include "testing/base/public/benchmark.h" #include "gmock/gmock.h" -#include "icing/helpers/icu/icu-data-file-helper.h" #include "icing/testing/common-matchers.h" +#include "icing/testing/icu-data-file-helper.h" #include "icing/testing/test-data.h" #include "icing/tokenization/language-segmenter-factory.h" #include "icing/tokenization/language-segmenter.h" 
diff --git a/icing/tokenization/plain-tokenizer.cc b/icing/tokenization/plain-tokenizer.cc index 13fe550..7a1949f 100644 --- a/icing/tokenization/plain-tokenizer.cc +++ b/icing/tokenization/plain-tokenizer.cc @@ -66,9 +66,9 @@ class PlainTokenIterator : public Tokenizer::Iterator { Token GetToken() const override { if (current_term_.empty()) { - return Token(Token::INVALID); + return Token(Token::Type::INVALID); } - return Token(Token::REGULAR, current_term_); + return Token(Token::Type::REGULAR, current_term_); } libtextclassifier3::StatusOr<CharacterIterator> CalculateTokenStart() @@ -81,8 +81,8 @@ class PlainTokenIterator : public Tokenizer::Iterator { return base_iterator_->CalculateTermEndExclusive(); } - bool ResetToTokenAfter(int32_t offset) override { - if (!base_iterator_->ResetToTermStartingAfterUtf32(offset).ok()) { + bool ResetToTokenStartingAfter(int32_t utf32_offset) override { + if (!base_iterator_->ResetToTermStartingAfterUtf32(utf32_offset).ok()) { return false; } current_term_ = base_iterator_->GetTerm(); @@ -93,15 +93,17 @@ class PlainTokenIterator : public Tokenizer::Iterator { return true; } - bool ResetToTokenBefore(int32_t offset) override { + bool ResetToTokenEndingBefore(int32_t utf32_offset) override { ICING_ASSIGN_OR_RETURN( - offset, base_iterator_->ResetToTermEndingBeforeUtf32(offset), false); + utf32_offset, + base_iterator_->ResetToTermEndingBeforeUtf32(utf32_offset), false); current_term_ = base_iterator_->GetTerm(); while (!IsValidTerm(current_term_)) { // Haven't found a valid term yet. Retrieve the term prior to this one // from the segmenter. 
ICING_ASSIGN_OR_RETURN( - offset, base_iterator_->ResetToTermEndingBeforeUtf32(offset), false); + utf32_offset, + base_iterator_->ResetToTermEndingBeforeUtf32(utf32_offset), false); current_term_ = base_iterator_->GetTerm(); } return true; diff --git a/icing/tokenization/plain-tokenizer_test.cc b/icing/tokenization/plain-tokenizer_test.cc index 7490bfa..c48b51e 100644 --- a/icing/tokenization/plain-tokenizer_test.cc +++ b/icing/tokenization/plain-tokenizer_test.cc @@ -18,9 +18,9 @@ #include "gmock/gmock.h" #include "icing/absl_ports/str_cat.h" -#include "icing/helpers/icu/icu-data-file-helper.h" #include "icing/portable/platform.h" #include "icing/testing/common-matchers.h" +#include "icing/testing/icu-data-file-helper.h" #include "icing/testing/icu-i18n-test-utils.h" #include "icing/testing/jni-test-helpers.h" #include "icing/testing/test-data.h" @@ -68,26 +68,27 @@ TEST_F(PlainTokenizerTest, Simple) { EXPECT_THAT(plain_tokenizer->TokenizeAll(""), IsOkAndHolds(IsEmpty())); - EXPECT_THAT(plain_tokenizer->TokenizeAll("Hello World"), - IsOkAndHolds(ElementsAre(EqualsToken(Token::REGULAR, "Hello"), - EqualsToken(Token::REGULAR, "World")))); + EXPECT_THAT( + plain_tokenizer->TokenizeAll("Hello World"), + IsOkAndHolds(ElementsAre(EqualsToken(Token::Type::REGULAR, "Hello"), + EqualsToken(Token::Type::REGULAR, "World")))); EXPECT_THAT( plain_tokenizer->TokenizeAll( "Lorem ipsum dolor sit amet, consectetur adipiscing elit. 
" "Duis efficitur iaculis auctor."), - IsOkAndHolds(ElementsAre(EqualsToken(Token::REGULAR, "Lorem"), - EqualsToken(Token::REGULAR, "ipsum"), - EqualsToken(Token::REGULAR, "dolor"), - EqualsToken(Token::REGULAR, "sit"), - EqualsToken(Token::REGULAR, "amet"), - EqualsToken(Token::REGULAR, "consectetur"), - EqualsToken(Token::REGULAR, "adipiscing"), - EqualsToken(Token::REGULAR, "elit"), - EqualsToken(Token::REGULAR, "Duis"), - EqualsToken(Token::REGULAR, "efficitur"), - EqualsToken(Token::REGULAR, "iaculis"), - EqualsToken(Token::REGULAR, "auctor")))); + IsOkAndHolds(ElementsAre(EqualsToken(Token::Type::REGULAR, "Lorem"), + EqualsToken(Token::Type::REGULAR, "ipsum"), + EqualsToken(Token::Type::REGULAR, "dolor"), + EqualsToken(Token::Type::REGULAR, "sit"), + EqualsToken(Token::Type::REGULAR, "amet"), + EqualsToken(Token::Type::REGULAR, "consectetur"), + EqualsToken(Token::Type::REGULAR, "adipiscing"), + EqualsToken(Token::Type::REGULAR, "elit"), + EqualsToken(Token::Type::REGULAR, "Duis"), + EqualsToken(Token::Type::REGULAR, "efficitur"), + EqualsToken(Token::Type::REGULAR, "iaculis"), + EqualsToken(Token::Type::REGULAR, "auctor")))); } TEST_F(PlainTokenizerTest, Whitespace) { @@ -107,16 +108,18 @@ TEST_F(PlainTokenizerTest, Whitespace) { // 0x0009 is horizontal tab, considered as a whitespace std::string text_with_horizontal_tab = absl_ports::StrCat("Hello", UCharToString(0x0009), "World"); - EXPECT_THAT(plain_tokenizer->TokenizeAll(text_with_horizontal_tab), - IsOkAndHolds(ElementsAre(EqualsToken(Token::REGULAR, "Hello"), - EqualsToken(Token::REGULAR, "World")))); + EXPECT_THAT( + plain_tokenizer->TokenizeAll(text_with_horizontal_tab), + IsOkAndHolds(ElementsAre(EqualsToken(Token::Type::REGULAR, "Hello"), + EqualsToken(Token::Type::REGULAR, "World")))); // 0x000B is vertical tab, considered as a whitespace std::string text_with_vertical_tab = absl_ports::StrCat("Hello", UCharToString(0x000B), "World"); - 
EXPECT_THAT(plain_tokenizer->TokenizeAll(text_with_vertical_tab), - IsOkAndHolds(ElementsAre(EqualsToken(Token::REGULAR, "Hello"), - EqualsToken(Token::REGULAR, "World")))); + EXPECT_THAT( + plain_tokenizer->TokenizeAll(text_with_vertical_tab), + IsOkAndHolds(ElementsAre(EqualsToken(Token::Type::REGULAR, "Hello"), + EqualsToken(Token::Type::REGULAR, "World")))); } TEST_F(PlainTokenizerTest, Punctuation) { @@ -131,38 +134,39 @@ TEST_F(PlainTokenizerTest, Punctuation) { language_segmenter.get())); // Half-width punctuation marks are filtered out. - EXPECT_THAT(plain_tokenizer->TokenizeAll( - "Hello, World! Hello: World. \"Hello\" World?"), - IsOkAndHolds(ElementsAre(EqualsToken(Token::REGULAR, "Hello"), - EqualsToken(Token::REGULAR, "World"), - EqualsToken(Token::REGULAR, "Hello"), - EqualsToken(Token::REGULAR, "World"), - EqualsToken(Token::REGULAR, "Hello"), - EqualsToken(Token::REGULAR, "World")))); + EXPECT_THAT( + plain_tokenizer->TokenizeAll( + "Hello, World! Hello: World. \"Hello\" World?"), + IsOkAndHolds(ElementsAre(EqualsToken(Token::Type::REGULAR, "Hello"), + EqualsToken(Token::Type::REGULAR, "World"), + EqualsToken(Token::Type::REGULAR, "Hello"), + EqualsToken(Token::Type::REGULAR, "World"), + EqualsToken(Token::Type::REGULAR, "Hello"), + EqualsToken(Token::Type::REGULAR, "World")))); // Full-width punctuation marks are filtered out. 
std::vector<std::string_view> exp_tokens; if (IsCfStringTokenization()) { EXPECT_THAT( plain_tokenizer->TokenizeAll("你好,世界!你好:世界。“你好”世界?"), - IsOkAndHolds(ElementsAre(EqualsToken(Token::REGULAR, "你"), - EqualsToken(Token::REGULAR, "好"), - EqualsToken(Token::REGULAR, "世界"), - EqualsToken(Token::REGULAR, "你"), - EqualsToken(Token::REGULAR, "好"), - EqualsToken(Token::REGULAR, "世界"), - EqualsToken(Token::REGULAR, "你"), - EqualsToken(Token::REGULAR, "好"), - EqualsToken(Token::REGULAR, "世界")))); + IsOkAndHolds(ElementsAre(EqualsToken(Token::Type::REGULAR, "你"), + EqualsToken(Token::Type::REGULAR, "好"), + EqualsToken(Token::Type::REGULAR, "世界"), + EqualsToken(Token::Type::REGULAR, "你"), + EqualsToken(Token::Type::REGULAR, "好"), + EqualsToken(Token::Type::REGULAR, "世界"), + EqualsToken(Token::Type::REGULAR, "你"), + EqualsToken(Token::Type::REGULAR, "好"), + EqualsToken(Token::Type::REGULAR, "世界")))); } else { EXPECT_THAT( plain_tokenizer->TokenizeAll("你好,世界!你好:世界。“你好”世界?"), - IsOkAndHolds(ElementsAre(EqualsToken(Token::REGULAR, "你好"), - EqualsToken(Token::REGULAR, "世界"), - EqualsToken(Token::REGULAR, "你好"), - EqualsToken(Token::REGULAR, "世界"), - EqualsToken(Token::REGULAR, "你好"), - EqualsToken(Token::REGULAR, "世界")))); + IsOkAndHolds(ElementsAre(EqualsToken(Token::Type::REGULAR, "你好"), + EqualsToken(Token::Type::REGULAR, "世界"), + EqualsToken(Token::Type::REGULAR, "你好"), + EqualsToken(Token::Type::REGULAR, "世界"), + EqualsToken(Token::Type::REGULAR, "你好"), + EqualsToken(Token::Type::REGULAR, "世界")))); } } @@ -180,14 +184,16 @@ TEST_F(PlainTokenizerTest, SpecialCharacters) { // Right now we don't have special logic for these characters, just output // them as tokens. 
- EXPECT_THAT(plain_tokenizer->TokenizeAll("1+1"), - IsOkAndHolds(ElementsAre(EqualsToken(Token::REGULAR, "1"), - EqualsToken(Token::REGULAR, "+"), - EqualsToken(Token::REGULAR, "1")))); + EXPECT_THAT( + plain_tokenizer->TokenizeAll("1+1"), + IsOkAndHolds(ElementsAre(EqualsToken(Token::Type::REGULAR, "1"), + EqualsToken(Token::Type::REGULAR, "+"), + EqualsToken(Token::Type::REGULAR, "1")))); - EXPECT_THAT(plain_tokenizer->TokenizeAll("$50"), - IsOkAndHolds(ElementsAre(EqualsToken(Token::REGULAR, "$"), - EqualsToken(Token::REGULAR, "50")))); + EXPECT_THAT( + plain_tokenizer->TokenizeAll("$50"), + IsOkAndHolds(ElementsAre(EqualsToken(Token::Type::REGULAR, "$"), + EqualsToken(Token::Type::REGULAR, "50")))); } TEST_F(PlainTokenizerTest, CJKT) { @@ -203,12 +209,13 @@ TEST_F(PlainTokenizerTest, CJKT) { tokenizer_factory::CreateIndexingTokenizer( StringIndexingConfig::TokenizerType::PLAIN, language_segmenter.get())); - EXPECT_THAT(plain_tokenizer->TokenizeAll("我每天走路去上班。"), - IsOkAndHolds(ElementsAre(EqualsToken(Token::REGULAR, "我"), - EqualsToken(Token::REGULAR, "每天"), - EqualsToken(Token::REGULAR, "走路"), - EqualsToken(Token::REGULAR, "去"), - EqualsToken(Token::REGULAR, "上班")))); + EXPECT_THAT( + plain_tokenizer->TokenizeAll("我每天走路去上班。"), + IsOkAndHolds(ElementsAre(EqualsToken(Token::Type::REGULAR, "我"), + EqualsToken(Token::Type::REGULAR, "每天"), + EqualsToken(Token::Type::REGULAR, "走路"), + EqualsToken(Token::Type::REGULAR, "去"), + EqualsToken(Token::Type::REGULAR, "上班")))); // Japanese options = language_segmenter_factory::SegmenterOptions(ULOC_JAPANESE, jni_cache_.get()); @@ -220,41 +227,44 @@ TEST_F(PlainTokenizerTest, CJKT) { StringIndexingConfig::TokenizerType::PLAIN, language_segmenter.get())); if (IsCfStringTokenization()) { - EXPECT_THAT(plain_tokenizer->TokenizeAll("私は毎日仕事に歩いています。"), - IsOkAndHolds(ElementsAre(EqualsToken(Token::REGULAR, "私"), - EqualsToken(Token::REGULAR, "は"), - EqualsToken(Token::REGULAR, "毎日"), - EqualsToken(Token::REGULAR, "仕事"), - 
EqualsToken(Token::REGULAR, "に"), - EqualsToken(Token::REGULAR, "歩い"), - EqualsToken(Token::REGULAR, "て"), - EqualsToken(Token::REGULAR, "い"), - EqualsToken(Token::REGULAR, "ます")))); + EXPECT_THAT( + plain_tokenizer->TokenizeAll("私は毎日仕事に歩いています。"), + IsOkAndHolds(ElementsAre(EqualsToken(Token::Type::REGULAR, "私"), + EqualsToken(Token::Type::REGULAR, "は"), + EqualsToken(Token::Type::REGULAR, "毎日"), + EqualsToken(Token::Type::REGULAR, "仕事"), + EqualsToken(Token::Type::REGULAR, "に"), + EqualsToken(Token::Type::REGULAR, "歩い"), + EqualsToken(Token::Type::REGULAR, "て"), + EqualsToken(Token::Type::REGULAR, "い"), + EqualsToken(Token::Type::REGULAR, "ます")))); } else { - EXPECT_THAT(plain_tokenizer->TokenizeAll("私は毎日仕事に歩いています。"), - IsOkAndHolds(ElementsAre(EqualsToken(Token::REGULAR, "私"), - EqualsToken(Token::REGULAR, "は"), - EqualsToken(Token::REGULAR, "毎日"), - EqualsToken(Token::REGULAR, "仕事"), - EqualsToken(Token::REGULAR, "に"), - EqualsToken(Token::REGULAR, "歩"), - EqualsToken(Token::REGULAR, "い"), - EqualsToken(Token::REGULAR, "てい"), - EqualsToken(Token::REGULAR, "ます")))); + EXPECT_THAT( + plain_tokenizer->TokenizeAll("私は毎日仕事に歩いています。"), + IsOkAndHolds(ElementsAre(EqualsToken(Token::Type::REGULAR, "私"), + EqualsToken(Token::Type::REGULAR, "は"), + EqualsToken(Token::Type::REGULAR, "毎日"), + EqualsToken(Token::Type::REGULAR, "仕事"), + EqualsToken(Token::Type::REGULAR, "に"), + EqualsToken(Token::Type::REGULAR, "歩"), + EqualsToken(Token::Type::REGULAR, "い"), + EqualsToken(Token::Type::REGULAR, "てい"), + EqualsToken(Token::Type::REGULAR, "ます")))); } // Khmer - EXPECT_THAT(plain_tokenizer->TokenizeAll("ញុំដើរទៅធ្វើការរាល់ថ្ងៃ។"), - IsOkAndHolds(ElementsAre(EqualsToken(Token::REGULAR, "ញុំ"), - EqualsToken(Token::REGULAR, "ដើរទៅ"), - EqualsToken(Token::REGULAR, "ធ្វើការ"), - EqualsToken(Token::REGULAR, "រាល់ថ្ងៃ")))); - // Korean EXPECT_THAT( - plain_tokenizer->TokenizeAll("나는 매일 출근합니다."), - IsOkAndHolds(ElementsAre(EqualsToken(Token::REGULAR, "나는"), - EqualsToken(Token::REGULAR, 
"매일"), - EqualsToken(Token::REGULAR, "출근합니다")))); + plain_tokenizer->TokenizeAll("ញុំដើរទៅធ្វើការរាល់ថ្ងៃ។"), + IsOkAndHolds(ElementsAre(EqualsToken(Token::Type::REGULAR, "ញុំ"), + EqualsToken(Token::Type::REGULAR, "ដើរទៅ"), + EqualsToken(Token::Type::REGULAR, "ធ្វើការ"), + EqualsToken(Token::Type::REGULAR, "រាល់ថ្ងៃ")))); + // Korean + EXPECT_THAT(plain_tokenizer->TokenizeAll("나는 매일 출근합니다."), + IsOkAndHolds(ElementsAre( + EqualsToken(Token::Type::REGULAR, "나는"), + EqualsToken(Token::Type::REGULAR, "매일"), + EqualsToken(Token::Type::REGULAR, "출근합니다")))); // Thai // DIFFERENCE!! Disagreement over how to segment "ทุกวัน" (iOS groups). @@ -264,23 +274,24 @@ TEST_F(PlainTokenizerTest, CJKT) { std::vector<Token> tokens, plain_tokenizer->TokenizeAll("ฉันเดินไปทำงานทุกวัน")); - EXPECT_THAT(tokens, ElementsAre(EqualsToken(Token::REGULAR, "ฉัน"), - EqualsToken(Token::REGULAR, "เดิน"), - EqualsToken(Token::REGULAR, "ไป"), - EqualsToken(Token::REGULAR, "ทำงาน"), - EqualsToken(Token::REGULAR, "ทุกวัน"))); + EXPECT_THAT(tokens, ElementsAre(EqualsToken(Token::Type::REGULAR, "ฉัน"), + EqualsToken(Token::Type::REGULAR, "เดิน"), + EqualsToken(Token::Type::REGULAR, "ไป"), + EqualsToken(Token::Type::REGULAR, "ทำงาน"), + EqualsToken(Token::Type::REGULAR, "ทุกวัน"))); } else { - EXPECT_THAT(plain_tokenizer->TokenizeAll("ฉันเดินไปทำงานทุกวัน"), - IsOkAndHolds(ElementsAre(EqualsToken(Token::REGULAR, "ฉัน"), - EqualsToken(Token::REGULAR, "เดิน"), - EqualsToken(Token::REGULAR, "ไป"), - EqualsToken(Token::REGULAR, "ทำงาน"), - EqualsToken(Token::REGULAR, "ทุก"), - EqualsToken(Token::REGULAR, "วัน")))); + EXPECT_THAT( + plain_tokenizer->TokenizeAll("ฉันเดินไปทำงานทุกวัน"), + IsOkAndHolds(ElementsAre(EqualsToken(Token::Type::REGULAR, "ฉัน"), + EqualsToken(Token::Type::REGULAR, "เดิน"), + EqualsToken(Token::Type::REGULAR, "ไป"), + EqualsToken(Token::Type::REGULAR, "ทำงาน"), + EqualsToken(Token::Type::REGULAR, "ทุก"), + EqualsToken(Token::Type::REGULAR, "วัน")))); } } -TEST_F(PlainTokenizerTest, 
ResetToTokenAfterSimple) { +TEST_F(PlainTokenizerTest, ResetToTokenStartingAfterSimple) { language_segmenter_factory::SegmenterOptions options(ULOC_US, jni_cache_.get()); ICING_ASSERT_OK_AND_ASSIGN( @@ -294,13 +305,13 @@ TEST_F(PlainTokenizerTest, ResetToTokenAfterSimple) { constexpr std::string_view kText = "f b"; auto iterator = plain_tokenizer->Tokenize(kText).ValueOrDie(); - EXPECT_TRUE(iterator->ResetToTokenAfter(0)); - EXPECT_THAT(iterator->GetToken(), EqualsToken(Token::REGULAR, "b")); + EXPECT_TRUE(iterator->ResetToTokenStartingAfter(0)); + EXPECT_THAT(iterator->GetToken(), EqualsToken(Token::Type::REGULAR, "b")); - EXPECT_FALSE(iterator->ResetToTokenAfter(2)); + EXPECT_FALSE(iterator->ResetToTokenStartingAfter(2)); } -TEST_F(PlainTokenizerTest, ResetToTokenBeforeSimple) { +TEST_F(PlainTokenizerTest, ResetToTokenEndingBeforeSimple) { language_segmenter_factory::SegmenterOptions options(ULOC_US, jni_cache_.get()); ICING_ASSERT_OK_AND_ASSIGN( @@ -314,13 +325,13 @@ TEST_F(PlainTokenizerTest, ResetToTokenBeforeSimple) { constexpr std::string_view kText = "f b"; auto iterator = plain_tokenizer->Tokenize(kText).ValueOrDie(); - EXPECT_TRUE(iterator->ResetToTokenBefore(2)); - EXPECT_THAT(iterator->GetToken(), EqualsToken(Token::REGULAR, "f")); + EXPECT_TRUE(iterator->ResetToTokenEndingBefore(2)); + EXPECT_THAT(iterator->GetToken(), EqualsToken(Token::Type::REGULAR, "f")); - EXPECT_FALSE(iterator->ResetToTokenBefore(0)); + EXPECT_FALSE(iterator->ResetToTokenEndingBefore(0)); } -TEST_F(PlainTokenizerTest, ResetToTokenAfter) { +TEST_F(PlainTokenizerTest, ResetToTokenStartingAfter) { language_segmenter_factory::SegmenterOptions options(ULOC_US, jni_cache_.get()); ICING_ASSERT_OK_AND_ASSIGN( @@ -332,11 +343,12 @@ TEST_F(PlainTokenizerTest, ResetToTokenAfter) { language_segmenter.get())); constexpr std::string_view kText = " foo . bar baz.. 
bat "; - EXPECT_THAT(plain_tokenizer->TokenizeAll(kText), - IsOkAndHolds(ElementsAre(EqualsToken(Token::REGULAR, "foo"), - EqualsToken(Token::REGULAR, "bar"), - EqualsToken(Token::REGULAR, "baz"), - EqualsToken(Token::REGULAR, "bat")))); + EXPECT_THAT( + plain_tokenizer->TokenizeAll(kText), + IsOkAndHolds(ElementsAre(EqualsToken(Token::Type::REGULAR, "foo"), + EqualsToken(Token::Type::REGULAR, "bar"), + EqualsToken(Token::Type::REGULAR, "baz"), + EqualsToken(Token::Type::REGULAR, "bat")))); std::vector<std::string> expected_text = { "foo", // 0: " foo . bar" "bar", // 1: "foo . bar " @@ -359,19 +371,19 @@ TEST_F(PlainTokenizerTest, ResetToTokenAfter) { auto iterator = plain_tokenizer->Tokenize(kText).ValueOrDie(); EXPECT_TRUE(iterator->Advance()); - EXPECT_THAT(iterator->GetToken(), EqualsToken(Token::REGULAR, "foo")); + EXPECT_THAT(iterator->GetToken(), EqualsToken(Token::Type::REGULAR, "foo")); for (int i = 0; i < kText.length(); ++i) { if (i < expected_text.size()) { - EXPECT_TRUE(iterator->ResetToTokenAfter(i)); + EXPECT_TRUE(iterator->ResetToTokenStartingAfter(i)); EXPECT_THAT(iterator->GetToken(), - EqualsToken(Token::REGULAR, expected_text[i])); + EqualsToken(Token::Type::REGULAR, expected_text[i])); } else { - EXPECT_FALSE(iterator->ResetToTokenAfter(i)); + EXPECT_FALSE(iterator->ResetToTokenStartingAfter(i)); } } } -TEST_F(PlainTokenizerTest, ResetToTokenBefore) { +TEST_F(PlainTokenizerTest, ResetToTokenEndingBefore) { language_segmenter_factory::SegmenterOptions options(ULOC_US, jni_cache_.get()); ICING_ASSERT_OK_AND_ASSIGN( @@ -383,11 +395,12 @@ TEST_F(PlainTokenizerTest, ResetToTokenBefore) { language_segmenter.get())); constexpr std::string_view kText = " foo . bar baz.. 
bat "; - EXPECT_THAT(plain_tokenizer->TokenizeAll(kText), - IsOkAndHolds(ElementsAre(EqualsToken(Token::REGULAR, "foo"), - EqualsToken(Token::REGULAR, "bar"), - EqualsToken(Token::REGULAR, "baz"), - EqualsToken(Token::REGULAR, "bat")))); + EXPECT_THAT( + plain_tokenizer->TokenizeAll(kText), + IsOkAndHolds(ElementsAre(EqualsToken(Token::Type::REGULAR, "foo"), + EqualsToken(Token::Type::REGULAR, "bar"), + EqualsToken(Token::Type::REGULAR, "baz"), + EqualsToken(Token::Type::REGULAR, "bat")))); std::vector<std::string> expected_text = { "bat", // 20: "baz.. bat " "baz", // 19: " baz.. bat" @@ -410,15 +423,16 @@ TEST_F(PlainTokenizerTest, ResetToTokenBefore) { auto iterator = plain_tokenizer->Tokenize(kText).ValueOrDie(); EXPECT_TRUE(iterator->Advance()); - EXPECT_THAT(iterator->GetToken(), EqualsToken(Token::REGULAR, "foo")); + EXPECT_THAT(iterator->GetToken(), EqualsToken(Token::Type::REGULAR, "foo")); for (int i = kText.length() - 1; i >= 0; --i) { int expected_index = kText.length() - 1 - i; if (expected_index < expected_text.size()) { - EXPECT_TRUE(iterator->ResetToTokenBefore(i)); - EXPECT_THAT(iterator->GetToken(), - EqualsToken(Token::REGULAR, expected_text[expected_index])); + EXPECT_TRUE(iterator->ResetToTokenEndingBefore(i)); + EXPECT_THAT( + iterator->GetToken(), + EqualsToken(Token::Type::REGULAR, expected_text[expected_index])); } else { - EXPECT_FALSE(iterator->ResetToTokenBefore(i)); + EXPECT_FALSE(iterator->ResetToTokenEndingBefore(i)); } } } diff --git a/icing/tokenization/raw-query-tokenizer.cc b/icing/tokenization/raw-query-tokenizer.cc index 2d461ee..8a27103 100644 --- a/icing/tokenization/raw-query-tokenizer.cc +++ b/icing/tokenization/raw-query-tokenizer.cc @@ -422,7 +422,7 @@ std::pair<TermType, std::string_view> GetTerm(std::string_view text, // and [(cat OR)]. This helps assert extra rule 3: "OR" is ignored if there's no // valid token on its right. 
void RemoveLastTokenIfOrOperator(std::vector<Token>* tokens) { - if (!tokens->empty() && tokens->back().type == Token::QUERY_OR) { + if (!tokens->empty() && tokens->back().type == Token::Type::QUERY_OR) { tokens->pop_back(); } } @@ -436,11 +436,11 @@ libtextclassifier3::Status OutputOrOperatorToken(std::vector<Token>* tokens) { } Token::Type last_token_type = tokens->back().type; switch (last_token_type) { - case Token::REGULAR: - case Token::QUERY_RIGHT_PARENTHESES: - tokens->emplace_back(Token::QUERY_OR); + case Token::Type::REGULAR: + case Token::Type::QUERY_RIGHT_PARENTHESES: + tokens->emplace_back(Token::Type::QUERY_OR); break; - case Token::QUERY_OR: + case Token::Type::QUERY_OR: // Ignores "OR" because there's already an "OR", e.g. "term1 OR OR term2" break; default: @@ -481,21 +481,21 @@ libtextclassifier3::Status OutputToken(State new_state, GetErrorMessage(ERROR_NON_ASCII_AS_PROPERTY_NAME)); } } - tokens->emplace_back(Token::QUERY_PROPERTY, current_term); + tokens->emplace_back(Token::Type::QUERY_PROPERTY, current_term); } else { - tokens->emplace_back(Token::REGULAR, current_term); + tokens->emplace_back(Token::Type::REGULAR, current_term); } break; case LEFT_PARENTHESES: - tokens->emplace_back(Token::QUERY_LEFT_PARENTHESES); + tokens->emplace_back(Token::Type::QUERY_LEFT_PARENTHESES); break; case RIGHT_PARENTHESES: // Ignores "OR" if it's followed by right parentheses. 
RemoveLastTokenIfOrOperator(tokens); - tokens->emplace_back(Token::QUERY_RIGHT_PARENTHESES); + tokens->emplace_back(Token::Type::QUERY_RIGHT_PARENTHESES); break; case EXCLUSION_OPERATOR: - tokens->emplace_back(Token::QUERY_EXCLUSION); + tokens->emplace_back(Token::Type::QUERY_EXCLUSION); break; case OR_OPERATOR: return OutputOrOperatorToken(tokens); @@ -648,7 +648,7 @@ class RawQueryTokenIterator : public Tokenizer::Iterator { Token GetToken() const override { if (current_ < 0 || current_ >= tokens_.size()) { - return Token(Token::INVALID); + return Token(Token::Type::INVALID); } return tokens_.at(current_); } diff --git a/icing/tokenization/raw-query-tokenizer_test.cc b/icing/tokenization/raw-query-tokenizer_test.cc index 500efa0..c6d981d 100644 --- a/icing/tokenization/raw-query-tokenizer_test.cc +++ b/icing/tokenization/raw-query-tokenizer_test.cc @@ -16,9 +16,9 @@ #include "gmock/gmock.h" #include "gtest/gtest.h" -#include "icing/helpers/icu/icu-data-file-helper.h" #include "icing/portable/platform.h" #include "icing/testing/common-matchers.h" +#include "icing/testing/icu-data-file-helper.h" #include "icing/testing/test-data.h" #include "icing/tokenization/language-segmenter-factory.h" #include "icing/tokenization/tokenizer-factory.h" @@ -59,13 +59,15 @@ TEST_F(RawQueryTokenizerTest, Simple) { tokenizer_factory::CreateQueryTokenizer(tokenizer_factory::RAW_QUERY, language_segmenter.get())); - EXPECT_THAT(raw_query_tokenizer->TokenizeAll("Hello World!"), - IsOkAndHolds(ElementsAre(EqualsToken(Token::REGULAR, "Hello"), - EqualsToken(Token::REGULAR, "World")))); + EXPECT_THAT( + raw_query_tokenizer->TokenizeAll("Hello World!"), + IsOkAndHolds(ElementsAre(EqualsToken(Token::Type::REGULAR, "Hello"), + EqualsToken(Token::Type::REGULAR, "World")))); - EXPECT_THAT(raw_query_tokenizer->TokenizeAll("hElLo WORLD"), - IsOkAndHolds(ElementsAre(EqualsToken(Token::REGULAR, "hElLo"), - EqualsToken(Token::REGULAR, "WORLD")))); + EXPECT_THAT( + 
raw_query_tokenizer->TokenizeAll("hElLo WORLD"), + IsOkAndHolds(ElementsAre(EqualsToken(Token::Type::REGULAR, "hElLo"), + EqualsToken(Token::Type::REGULAR, "WORLD")))); } TEST_F(RawQueryTokenizerTest, Parentheses) { @@ -80,82 +82,82 @@ TEST_F(RawQueryTokenizerTest, Parentheses) { EXPECT_THAT(raw_query_tokenizer->TokenizeAll("()"), IsOkAndHolds(ElementsAre( - EqualsToken(Token::QUERY_LEFT_PARENTHESES, ""), - EqualsToken(Token::QUERY_RIGHT_PARENTHESES, "")))); + EqualsToken(Token::Type::QUERY_LEFT_PARENTHESES, ""), + EqualsToken(Token::Type::QUERY_RIGHT_PARENTHESES, "")))); EXPECT_THAT(raw_query_tokenizer->TokenizeAll("( )"), IsOkAndHolds(ElementsAre( - EqualsToken(Token::QUERY_LEFT_PARENTHESES, ""), - EqualsToken(Token::QUERY_RIGHT_PARENTHESES, "")))); + EqualsToken(Token::Type::QUERY_LEFT_PARENTHESES, ""), + EqualsToken(Token::Type::QUERY_RIGHT_PARENTHESES, "")))); EXPECT_THAT(raw_query_tokenizer->TokenizeAll("(term1 term2)"), IsOkAndHolds(ElementsAre( - EqualsToken(Token::QUERY_LEFT_PARENTHESES, ""), - EqualsToken(Token::REGULAR, "term1"), - EqualsToken(Token::REGULAR, "term2"), - EqualsToken(Token::QUERY_RIGHT_PARENTHESES, "")))); + EqualsToken(Token::Type::QUERY_LEFT_PARENTHESES, ""), + EqualsToken(Token::Type::REGULAR, "term1"), + EqualsToken(Token::Type::REGULAR, "term2"), + EqualsToken(Token::Type::QUERY_RIGHT_PARENTHESES, "")))); EXPECT_THAT(raw_query_tokenizer->TokenizeAll("((term1 term2) (term3 term4))"), IsOkAndHolds(ElementsAre( - EqualsToken(Token::QUERY_LEFT_PARENTHESES, ""), - EqualsToken(Token::QUERY_LEFT_PARENTHESES, ""), - EqualsToken(Token::REGULAR, "term1"), - EqualsToken(Token::REGULAR, "term2"), - EqualsToken(Token::QUERY_RIGHT_PARENTHESES, ""), - EqualsToken(Token::QUERY_LEFT_PARENTHESES, ""), - EqualsToken(Token::REGULAR, "term3"), - EqualsToken(Token::REGULAR, "term4"), - EqualsToken(Token::QUERY_RIGHT_PARENTHESES, ""), - EqualsToken(Token::QUERY_RIGHT_PARENTHESES, "")))); + EqualsToken(Token::Type::QUERY_LEFT_PARENTHESES, ""), + 
EqualsToken(Token::Type::QUERY_LEFT_PARENTHESES, ""), + EqualsToken(Token::Type::REGULAR, "term1"), + EqualsToken(Token::Type::REGULAR, "term2"), + EqualsToken(Token::Type::QUERY_RIGHT_PARENTHESES, ""), + EqualsToken(Token::Type::QUERY_LEFT_PARENTHESES, ""), + EqualsToken(Token::Type::REGULAR, "term3"), + EqualsToken(Token::Type::REGULAR, "term4"), + EqualsToken(Token::Type::QUERY_RIGHT_PARENTHESES, ""), + EqualsToken(Token::Type::QUERY_RIGHT_PARENTHESES, "")))); EXPECT_THAT(raw_query_tokenizer->TokenizeAll("term1(term2)"), IsOkAndHolds(ElementsAre( - EqualsToken(Token::REGULAR, "term1"), - EqualsToken(Token::QUERY_LEFT_PARENTHESES, ""), - EqualsToken(Token::REGULAR, "term2"), - EqualsToken(Token::QUERY_RIGHT_PARENTHESES, "")))); + EqualsToken(Token::Type::REGULAR, "term1"), + EqualsToken(Token::Type::QUERY_LEFT_PARENTHESES, ""), + EqualsToken(Token::Type::REGULAR, "term2"), + EqualsToken(Token::Type::QUERY_RIGHT_PARENTHESES, "")))); - EXPECT_THAT( - raw_query_tokenizer->TokenizeAll("(term1)term2"), - IsOkAndHolds(ElementsAre(EqualsToken(Token::QUERY_LEFT_PARENTHESES, ""), - EqualsToken(Token::REGULAR, "term1"), - EqualsToken(Token::QUERY_RIGHT_PARENTHESES, ""), - EqualsToken(Token::REGULAR, "term2")))); + EXPECT_THAT(raw_query_tokenizer->TokenizeAll("(term1)term2"), + IsOkAndHolds(ElementsAre( + EqualsToken(Token::Type::QUERY_LEFT_PARENTHESES, ""), + EqualsToken(Token::Type::REGULAR, "term1"), + EqualsToken(Token::Type::QUERY_RIGHT_PARENTHESES, ""), + EqualsToken(Token::Type::REGULAR, "term2")))); EXPECT_THAT(raw_query_tokenizer->TokenizeAll("(term1)(term2)"), IsOkAndHolds(ElementsAre( - EqualsToken(Token::QUERY_LEFT_PARENTHESES, ""), - EqualsToken(Token::REGULAR, "term1"), - EqualsToken(Token::QUERY_RIGHT_PARENTHESES, ""), - EqualsToken(Token::QUERY_LEFT_PARENTHESES, ""), - EqualsToken(Token::REGULAR, "term2"), - EqualsToken(Token::QUERY_RIGHT_PARENTHESES, "")))); - - EXPECT_THAT( - raw_query_tokenizer->TokenizeAll("(term1)-term2"), - 
IsOkAndHolds(ElementsAre(EqualsToken(Token::QUERY_LEFT_PARENTHESES, ""), - EqualsToken(Token::REGULAR, "term1"), - EqualsToken(Token::QUERY_RIGHT_PARENTHESES, ""), - EqualsToken(Token::QUERY_EXCLUSION, ""), - EqualsToken(Token::REGULAR, "term2")))); + EqualsToken(Token::Type::QUERY_LEFT_PARENTHESES, ""), + EqualsToken(Token::Type::REGULAR, "term1"), + EqualsToken(Token::Type::QUERY_RIGHT_PARENTHESES, ""), + EqualsToken(Token::Type::QUERY_LEFT_PARENTHESES, ""), + EqualsToken(Token::Type::REGULAR, "term2"), + EqualsToken(Token::Type::QUERY_RIGHT_PARENTHESES, "")))); + + EXPECT_THAT(raw_query_tokenizer->TokenizeAll("(term1)-term2"), + IsOkAndHolds(ElementsAre( + EqualsToken(Token::Type::QUERY_LEFT_PARENTHESES, ""), + EqualsToken(Token::Type::REGULAR, "term1"), + EqualsToken(Token::Type::QUERY_RIGHT_PARENTHESES, ""), + EqualsToken(Token::Type::QUERY_EXCLUSION, ""), + EqualsToken(Token::Type::REGULAR, "term2")))); - EXPECT_THAT( - raw_query_tokenizer->TokenizeAll("(term1)OR term2"), - IsOkAndHolds(ElementsAre(EqualsToken(Token::QUERY_LEFT_PARENTHESES, ""), - EqualsToken(Token::REGULAR, "term1"), - EqualsToken(Token::QUERY_RIGHT_PARENTHESES, ""), - EqualsToken(Token::QUERY_OR, ""), - EqualsToken(Token::REGULAR, "term2")))); + EXPECT_THAT(raw_query_tokenizer->TokenizeAll("(term1)OR term2"), + IsOkAndHolds(ElementsAre( + EqualsToken(Token::Type::QUERY_LEFT_PARENTHESES, ""), + EqualsToken(Token::Type::REGULAR, "term1"), + EqualsToken(Token::Type::QUERY_RIGHT_PARENTHESES, ""), + EqualsToken(Token::Type::QUERY_OR, ""), + EqualsToken(Token::Type::REGULAR, "term2")))); EXPECT_THAT(raw_query_tokenizer->TokenizeAll("(term1)OR(term2)"), IsOkAndHolds(ElementsAre( - EqualsToken(Token::QUERY_LEFT_PARENTHESES, ""), - EqualsToken(Token::REGULAR, "term1"), - EqualsToken(Token::QUERY_RIGHT_PARENTHESES, ""), - EqualsToken(Token::QUERY_OR, ""), - EqualsToken(Token::QUERY_LEFT_PARENTHESES, ""), - EqualsToken(Token::REGULAR, "term2"), - EqualsToken(Token::QUERY_RIGHT_PARENTHESES, "")))); + 
EqualsToken(Token::Type::QUERY_LEFT_PARENTHESES, ""), + EqualsToken(Token::Type::REGULAR, "term1"), + EqualsToken(Token::Type::QUERY_RIGHT_PARENTHESES, ""), + EqualsToken(Token::Type::QUERY_OR, ""), + EqualsToken(Token::Type::QUERY_LEFT_PARENTHESES, ""), + EqualsToken(Token::Type::REGULAR, "term2"), + EqualsToken(Token::Type::QUERY_RIGHT_PARENTHESES, "")))); EXPECT_THAT(raw_query_tokenizer->TokenizeAll("(term1):term2"), StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT, @@ -180,44 +182,49 @@ TEST_F(RawQueryTokenizerTest, Exclustion) { tokenizer_factory::CreateQueryTokenizer(tokenizer_factory::RAW_QUERY, language_segmenter.get())); - EXPECT_THAT(raw_query_tokenizer->TokenizeAll("-term1"), - IsOkAndHolds(ElementsAre(EqualsToken(Token::QUERY_EXCLUSION, ""), - EqualsToken(Token::REGULAR, "term1")))); + EXPECT_THAT( + raw_query_tokenizer->TokenizeAll("-term1"), + IsOkAndHolds(ElementsAre(EqualsToken(Token::Type::QUERY_EXCLUSION, ""), + EqualsToken(Token::Type::REGULAR, "term1")))); EXPECT_THAT(raw_query_tokenizer->TokenizeAll("(-term1)"), IsOkAndHolds(ElementsAre( - EqualsToken(Token::QUERY_LEFT_PARENTHESES, ""), - EqualsToken(Token::QUERY_EXCLUSION, ""), - EqualsToken(Token::REGULAR, "term1"), - EqualsToken(Token::QUERY_RIGHT_PARENTHESES, "")))); + EqualsToken(Token::Type::QUERY_LEFT_PARENTHESES, ""), + EqualsToken(Token::Type::QUERY_EXCLUSION, ""), + EqualsToken(Token::Type::REGULAR, "term1"), + EqualsToken(Token::Type::QUERY_RIGHT_PARENTHESES, "")))); // Exclusion operator is ignored - EXPECT_THAT(raw_query_tokenizer->TokenizeAll("- term1"), - IsOkAndHolds(ElementsAre(EqualsToken(Token::REGULAR, "term1")))); + EXPECT_THAT( + raw_query_tokenizer->TokenizeAll("- term1"), + IsOkAndHolds(ElementsAre(EqualsToken(Token::Type::REGULAR, "term1")))); // Exclusion operator is ignored - EXPECT_THAT(raw_query_tokenizer->TokenizeAll("term1- term2"), - IsOkAndHolds(ElementsAre(EqualsToken(Token::REGULAR, "term1"), - EqualsToken(Token::REGULAR, "term2")))); + EXPECT_THAT( + 
raw_query_tokenizer->TokenizeAll("term1- term2"), + IsOkAndHolds(ElementsAre(EqualsToken(Token::Type::REGULAR, "term1"), + EqualsToken(Token::Type::REGULAR, "term2")))); // Exclusion operator is ignored EXPECT_THAT(raw_query_tokenizer->TokenizeAll("(term1 -)"), IsOkAndHolds(ElementsAre( - EqualsToken(Token::QUERY_LEFT_PARENTHESES, ""), - EqualsToken(Token::REGULAR, "term1"), - EqualsToken(Token::QUERY_RIGHT_PARENTHESES, "")))); + EqualsToken(Token::Type::QUERY_LEFT_PARENTHESES, ""), + EqualsToken(Token::Type::REGULAR, "term1"), + EqualsToken(Token::Type::QUERY_RIGHT_PARENTHESES, "")))); // First exclusion operator is ignored - EXPECT_THAT(raw_query_tokenizer->TokenizeAll("--term1"), - IsOkAndHolds(ElementsAre(EqualsToken(Token::QUERY_EXCLUSION, ""), - EqualsToken(Token::REGULAR, "term1")))); + EXPECT_THAT( + raw_query_tokenizer->TokenizeAll("--term1"), + IsOkAndHolds(ElementsAre(EqualsToken(Token::Type::QUERY_EXCLUSION, ""), + EqualsToken(Token::Type::REGULAR, "term1")))); // First "-" is exclusion operator, second is not and will be discarded. // In other words, exclusion only applies to the term right after it. 
- EXPECT_THAT(raw_query_tokenizer->TokenizeAll("-term1-term2"), - IsOkAndHolds(ElementsAre(EqualsToken(Token::QUERY_EXCLUSION, ""), - EqualsToken(Token::REGULAR, "term1"), - EqualsToken(Token::REGULAR, "term2")))); + EXPECT_THAT( + raw_query_tokenizer->TokenizeAll("-term1-term2"), + IsOkAndHolds(ElementsAre(EqualsToken(Token::Type::QUERY_EXCLUSION, ""), + EqualsToken(Token::Type::REGULAR, "term1"), + EqualsToken(Token::Type::REGULAR, "term2")))); EXPECT_THAT(raw_query_tokenizer->TokenizeAll("-(term1)"), StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT, @@ -249,73 +256,75 @@ TEST_F(RawQueryTokenizerTest, PropertyRestriction) { tokenizer_factory::CreateQueryTokenizer(tokenizer_factory::RAW_QUERY, language_segmenter.get())); - EXPECT_THAT( - raw_query_tokenizer->TokenizeAll("property1:term1"), - IsOkAndHolds(ElementsAre(EqualsToken(Token::QUERY_PROPERTY, "property1"), - EqualsToken(Token::REGULAR, "term1")))); + EXPECT_THAT(raw_query_tokenizer->TokenizeAll("property1:term1"), + IsOkAndHolds(ElementsAre( + EqualsToken(Token::Type::QUERY_PROPERTY, "property1"), + EqualsToken(Token::Type::REGULAR, "term1")))); EXPECT_THAT(raw_query_tokenizer->TokenizeAll("(property1:term1)"), IsOkAndHolds(ElementsAre( - EqualsToken(Token::QUERY_LEFT_PARENTHESES, ""), - EqualsToken(Token::QUERY_PROPERTY, "property1"), - EqualsToken(Token::REGULAR, "term1"), - EqualsToken(Token::QUERY_RIGHT_PARENTHESES, "")))); + EqualsToken(Token::Type::QUERY_LEFT_PARENTHESES, ""), + EqualsToken(Token::Type::QUERY_PROPERTY, "property1"), + EqualsToken(Token::Type::REGULAR, "term1"), + EqualsToken(Token::Type::QUERY_RIGHT_PARENTHESES, "")))); // Colon is ignored - EXPECT_THAT(raw_query_tokenizer->TokenizeAll(":term1"), - IsOkAndHolds(ElementsAre(EqualsToken(Token::REGULAR, "term1")))); + EXPECT_THAT( + raw_query_tokenizer->TokenizeAll(":term1"), + IsOkAndHolds(ElementsAre(EqualsToken(Token::Type::REGULAR, "term1")))); // Colon is ignored EXPECT_THAT(raw_query_tokenizer->TokenizeAll("(:term1)"), 
IsOkAndHolds(ElementsAre( - EqualsToken(Token::QUERY_LEFT_PARENTHESES, ""), - EqualsToken(Token::REGULAR, "term1"), - EqualsToken(Token::QUERY_RIGHT_PARENTHESES, "")))); + EqualsToken(Token::Type::QUERY_LEFT_PARENTHESES, ""), + EqualsToken(Token::Type::REGULAR, "term1"), + EqualsToken(Token::Type::QUERY_RIGHT_PARENTHESES, "")))); // Colon is ignored - EXPECT_THAT(raw_query_tokenizer->TokenizeAll("term1:"), - IsOkAndHolds(ElementsAre(EqualsToken(Token::REGULAR, "term1")))); + EXPECT_THAT( + raw_query_tokenizer->TokenizeAll("term1:"), + IsOkAndHolds(ElementsAre(EqualsToken(Token::Type::REGULAR, "term1")))); // property name can be a path EXPECT_THAT(raw_query_tokenizer->TokenizeAll("email.title:hello"), - IsOkAndHolds( - ElementsAre(EqualsToken(Token::QUERY_PROPERTY, "email.title"), - EqualsToken(Token::REGULAR, "hello")))); + IsOkAndHolds(ElementsAre( + EqualsToken(Token::Type::QUERY_PROPERTY, "email.title"), + EqualsToken(Token::Type::REGULAR, "hello")))); // The first colon ":" triggers property restriction, the second colon is used // as a word connector per ICU's rule // (https://unicode.org/reports/tr29/#Word_Boundaries). - EXPECT_THAT( - raw_query_tokenizer->TokenizeAll("property:foo:bar"), - IsOkAndHolds(ElementsAre(EqualsToken(Token::QUERY_PROPERTY, "property"), - EqualsToken(Token::REGULAR, "foo:bar")))); + EXPECT_THAT(raw_query_tokenizer->TokenizeAll("property:foo:bar"), + IsOkAndHolds(ElementsAre( + EqualsToken(Token::Type::QUERY_PROPERTY, "property"), + EqualsToken(Token::Type::REGULAR, "foo:bar")))); // Property restriction only applies to the term right after it. // Note: "term1:term2" is not a term but 2 terms because word connectors // don't apply to numbers and alphabets. 
- EXPECT_THAT( - raw_query_tokenizer->TokenizeAll("property1:term1:term2"), - IsOkAndHolds(ElementsAre(EqualsToken(Token::QUERY_PROPERTY, "property1"), - EqualsToken(Token::REGULAR, "term1"), - EqualsToken(Token::REGULAR, "term2")))); + EXPECT_THAT(raw_query_tokenizer->TokenizeAll("property1:term1:term2"), + IsOkAndHolds(ElementsAre( + EqualsToken(Token::Type::QUERY_PROPERTY, "property1"), + EqualsToken(Token::Type::REGULAR, "term1"), + EqualsToken(Token::Type::REGULAR, "term2")))); - EXPECT_THAT( - raw_query_tokenizer->TokenizeAll("property1:今天:天气"), - IsOkAndHolds(ElementsAre(EqualsToken(Token::QUERY_PROPERTY, "property1"), - EqualsToken(Token::REGULAR, "今天"), - EqualsToken(Token::REGULAR, "天气")))); + EXPECT_THAT(raw_query_tokenizer->TokenizeAll("property1:今天:天气"), + IsOkAndHolds(ElementsAre( + EqualsToken(Token::Type::QUERY_PROPERTY, "property1"), + EqualsToken(Token::Type::REGULAR, "今天"), + EqualsToken(Token::Type::REGULAR, "天气")))); - EXPECT_THAT( - raw_query_tokenizer->TokenizeAll("property1:term1-"), - IsOkAndHolds(ElementsAre(EqualsToken(Token::QUERY_PROPERTY, "property1"), - EqualsToken(Token::REGULAR, "term1")))); + EXPECT_THAT(raw_query_tokenizer->TokenizeAll("property1:term1-"), + IsOkAndHolds(ElementsAre( + EqualsToken(Token::Type::QUERY_PROPERTY, "property1"), + EqualsToken(Token::Type::REGULAR, "term1")))); // Multiple continuous colons will still be recognized as a property // restriction operator - EXPECT_THAT( - raw_query_tokenizer->TokenizeAll("property1::term1"), - IsOkAndHolds(ElementsAre(EqualsToken(Token::QUERY_PROPERTY, "property1"), - EqualsToken(Token::REGULAR, "term1")))); + EXPECT_THAT(raw_query_tokenizer->TokenizeAll("property1::term1"), + IsOkAndHolds(ElementsAre( + EqualsToken(Token::Type::QUERY_PROPERTY, "property1"), + EqualsToken(Token::Type::REGULAR, "term1")))); EXPECT_THAT( raw_query_tokenizer->TokenizeAll("property1:(term1)"), @@ -345,105 +354,109 @@ TEST_F(RawQueryTokenizerTest, OR) { 
tokenizer_factory::CreateQueryTokenizer(tokenizer_factory::RAW_QUERY, language_segmenter.get())); - EXPECT_THAT(raw_query_tokenizer->TokenizeAll("term1 OR term2"), - IsOkAndHolds(ElementsAre(EqualsToken(Token::REGULAR, "term1"), - EqualsToken(Token::QUERY_OR, ""), - EqualsToken(Token::REGULAR, "term2")))); + EXPECT_THAT( + raw_query_tokenizer->TokenizeAll("term1 OR term2"), + IsOkAndHolds(ElementsAre(EqualsToken(Token::Type::REGULAR, "term1"), + EqualsToken(Token::Type::QUERY_OR, ""), + EqualsToken(Token::Type::REGULAR, "term2")))); // Two continuous "OR"s are treated as one - EXPECT_THAT(raw_query_tokenizer->TokenizeAll("term1 OR OR term2"), - IsOkAndHolds(ElementsAre(EqualsToken(Token::REGULAR, "term1"), - EqualsToken(Token::QUERY_OR, ""), - EqualsToken(Token::REGULAR, "term2")))); - EXPECT_THAT( - raw_query_tokenizer->TokenizeAll("(term1) OR term2"), - IsOkAndHolds(ElementsAre(EqualsToken(Token::QUERY_LEFT_PARENTHESES, ""), - EqualsToken(Token::REGULAR, "term1"), - EqualsToken(Token::QUERY_RIGHT_PARENTHESES, ""), - EqualsToken(Token::QUERY_OR, ""), - EqualsToken(Token::REGULAR, "term2")))); + raw_query_tokenizer->TokenizeAll("term1 OR OR term2"), + IsOkAndHolds(ElementsAre(EqualsToken(Token::Type::REGULAR, "term1"), + EqualsToken(Token::Type::QUERY_OR, ""), + EqualsToken(Token::Type::REGULAR, "term2")))); + + EXPECT_THAT(raw_query_tokenizer->TokenizeAll("(term1) OR term2"), + IsOkAndHolds(ElementsAre( + EqualsToken(Token::Type::QUERY_LEFT_PARENTHESES, ""), + EqualsToken(Token::Type::REGULAR, "term1"), + EqualsToken(Token::Type::QUERY_RIGHT_PARENTHESES, ""), + EqualsToken(Token::Type::QUERY_OR, ""), + EqualsToken(Token::Type::REGULAR, "term2")))); EXPECT_THAT(raw_query_tokenizer->TokenizeAll("term1 OR (term2)"), IsOkAndHolds(ElementsAre( - EqualsToken(Token::REGULAR, "term1"), - EqualsToken(Token::QUERY_OR, ""), - EqualsToken(Token::QUERY_LEFT_PARENTHESES, ""), - EqualsToken(Token::REGULAR, "term2"), - EqualsToken(Token::QUERY_RIGHT_PARENTHESES, "")))); + 
EqualsToken(Token::Type::REGULAR, "term1"), + EqualsToken(Token::Type::QUERY_OR, ""), + EqualsToken(Token::Type::QUERY_LEFT_PARENTHESES, ""), + EqualsToken(Token::Type::REGULAR, "term2"), + EqualsToken(Token::Type::QUERY_RIGHT_PARENTHESES, "")))); EXPECT_THAT(raw_query_tokenizer->TokenizeAll("((term1) OR (term2))"), IsOkAndHolds(ElementsAre( - EqualsToken(Token::QUERY_LEFT_PARENTHESES, ""), - EqualsToken(Token::QUERY_LEFT_PARENTHESES, ""), - EqualsToken(Token::REGULAR, "term1"), - EqualsToken(Token::QUERY_RIGHT_PARENTHESES, ""), - EqualsToken(Token::QUERY_OR, ""), - EqualsToken(Token::QUERY_LEFT_PARENTHESES, ""), - EqualsToken(Token::REGULAR, "term2"), - EqualsToken(Token::QUERY_RIGHT_PARENTHESES, ""), - EqualsToken(Token::QUERY_RIGHT_PARENTHESES, "")))); + EqualsToken(Token::Type::QUERY_LEFT_PARENTHESES, ""), + EqualsToken(Token::Type::QUERY_LEFT_PARENTHESES, ""), + EqualsToken(Token::Type::REGULAR, "term1"), + EqualsToken(Token::Type::QUERY_RIGHT_PARENTHESES, ""), + EqualsToken(Token::Type::QUERY_OR, ""), + EqualsToken(Token::Type::QUERY_LEFT_PARENTHESES, ""), + EqualsToken(Token::Type::REGULAR, "term2"), + EqualsToken(Token::Type::QUERY_RIGHT_PARENTHESES, ""), + EqualsToken(Token::Type::QUERY_RIGHT_PARENTHESES, "")))); // Only "OR" (all in uppercase) is the operator EXPECT_THAT( raw_query_tokenizer->TokenizeAll("term1 or term2 Or term3 oR term4"), - IsOkAndHolds(ElementsAre(EqualsToken(Token::REGULAR, "term1"), - EqualsToken(Token::REGULAR, "or"), - EqualsToken(Token::REGULAR, "term2"), - EqualsToken(Token::REGULAR, "Or"), - EqualsToken(Token::REGULAR, "term3"), - EqualsToken(Token::REGULAR, "oR"), - EqualsToken(Token::REGULAR, "term4")))); + IsOkAndHolds(ElementsAre(EqualsToken(Token::Type::REGULAR, "term1"), + EqualsToken(Token::Type::REGULAR, "or"), + EqualsToken(Token::Type::REGULAR, "term2"), + EqualsToken(Token::Type::REGULAR, "Or"), + EqualsToken(Token::Type::REGULAR, "term3"), + EqualsToken(Token::Type::REGULAR, "oR"), + EqualsToken(Token::Type::REGULAR, 
"term4")))); // "OR" is ignored - EXPECT_THAT(raw_query_tokenizer->TokenizeAll("OR term1"), - IsOkAndHolds(ElementsAre(EqualsToken(Token::REGULAR, "term1")))); + EXPECT_THAT( + raw_query_tokenizer->TokenizeAll("OR term1"), + IsOkAndHolds(ElementsAre(EqualsToken(Token::Type::REGULAR, "term1")))); // "OR" is ignored - EXPECT_THAT(raw_query_tokenizer->TokenizeAll("term1 OR"), - IsOkAndHolds(ElementsAre(EqualsToken(Token::REGULAR, "term1")))); + EXPECT_THAT( + raw_query_tokenizer->TokenizeAll("term1 OR"), + IsOkAndHolds(ElementsAre(EqualsToken(Token::Type::REGULAR, "term1")))); // "OR" is ignored EXPECT_THAT(raw_query_tokenizer->TokenizeAll("(OR term1)"), IsOkAndHolds(ElementsAre( - EqualsToken(Token::QUERY_LEFT_PARENTHESES, ""), - EqualsToken(Token::REGULAR, "term1"), - EqualsToken(Token::QUERY_RIGHT_PARENTHESES, "")))); + EqualsToken(Token::Type::QUERY_LEFT_PARENTHESES, ""), + EqualsToken(Token::Type::REGULAR, "term1"), + EqualsToken(Token::Type::QUERY_RIGHT_PARENTHESES, "")))); // "OR" is ignored EXPECT_THAT(raw_query_tokenizer->TokenizeAll("( OR term1)"), IsOkAndHolds(ElementsAre( - EqualsToken(Token::QUERY_LEFT_PARENTHESES, ""), - EqualsToken(Token::REGULAR, "term1"), - EqualsToken(Token::QUERY_RIGHT_PARENTHESES, "")))); + EqualsToken(Token::Type::QUERY_LEFT_PARENTHESES, ""), + EqualsToken(Token::Type::REGULAR, "term1"), + EqualsToken(Token::Type::QUERY_RIGHT_PARENTHESES, "")))); // "OR" is ignored EXPECT_THAT(raw_query_tokenizer->TokenizeAll("(term1 OR)"), IsOkAndHolds(ElementsAre( - EqualsToken(Token::QUERY_LEFT_PARENTHESES, ""), - EqualsToken(Token::REGULAR, "term1"), - EqualsToken(Token::QUERY_RIGHT_PARENTHESES, "")))); + EqualsToken(Token::Type::QUERY_LEFT_PARENTHESES, ""), + EqualsToken(Token::Type::REGULAR, "term1"), + EqualsToken(Token::Type::QUERY_RIGHT_PARENTHESES, "")))); // "OR" is ignored EXPECT_THAT(raw_query_tokenizer->TokenizeAll("(term1 OR )"), IsOkAndHolds(ElementsAre( - EqualsToken(Token::QUERY_LEFT_PARENTHESES, ""), - 
EqualsToken(Token::REGULAR, "term1"), - EqualsToken(Token::QUERY_RIGHT_PARENTHESES, "")))); + EqualsToken(Token::Type::QUERY_LEFT_PARENTHESES, ""), + EqualsToken(Token::Type::REGULAR, "term1"), + EqualsToken(Token::Type::QUERY_RIGHT_PARENTHESES, "")))); // "OR" is ignored EXPECT_THAT(raw_query_tokenizer->TokenizeAll("( OR )"), IsOkAndHolds(ElementsAre( - EqualsToken(Token::QUERY_LEFT_PARENTHESES, ""), - EqualsToken(Token::QUERY_RIGHT_PARENTHESES, "")))); + EqualsToken(Token::Type::QUERY_LEFT_PARENTHESES, ""), + EqualsToken(Token::Type::QUERY_RIGHT_PARENTHESES, "")))); EXPECT_THAT(raw_query_tokenizer->TokenizeAll("term1 OR(term2)"), IsOkAndHolds(ElementsAre( - EqualsToken(Token::REGULAR, "term1"), - EqualsToken(Token::QUERY_OR, ""), - EqualsToken(Token::QUERY_LEFT_PARENTHESES, ""), - EqualsToken(Token::REGULAR, "term2"), - EqualsToken(Token::QUERY_RIGHT_PARENTHESES, "")))); + EqualsToken(Token::Type::REGULAR, "term1"), + EqualsToken(Token::Type::QUERY_OR, ""), + EqualsToken(Token::Type::QUERY_LEFT_PARENTHESES, ""), + EqualsToken(Token::Type::REGULAR, "term2"), + EqualsToken(Token::Type::QUERY_RIGHT_PARENTHESES, "")))); EXPECT_THAT( raw_query_tokenizer->TokenizeAll("term1 OR-term2"), @@ -472,31 +485,31 @@ TEST_F(RawQueryTokenizerTest, CJKT) { if (IsCfStringTokenization()) { EXPECT_THAT( raw_query_tokenizer->TokenizeAll("-今天天气很好"), - IsOkAndHolds(ElementsAre(EqualsToken(Token::QUERY_EXCLUSION, ""), - EqualsToken(Token::REGULAR, "今天"), - EqualsToken(Token::REGULAR, "天气"), - EqualsToken(Token::REGULAR, "很"), - EqualsToken(Token::REGULAR, "好")))); + IsOkAndHolds(ElementsAre(EqualsToken(Token::Type::QUERY_EXCLUSION, ""), + EqualsToken(Token::Type::REGULAR, "今天"), + EqualsToken(Token::Type::REGULAR, "天气"), + EqualsToken(Token::Type::REGULAR, "很"), + EqualsToken(Token::Type::REGULAR, "好")))); } else { EXPECT_THAT( raw_query_tokenizer->TokenizeAll("-今天天气很好"), - IsOkAndHolds(ElementsAre(EqualsToken(Token::QUERY_EXCLUSION, ""), - EqualsToken(Token::REGULAR, "今天"), - 
EqualsToken(Token::REGULAR, "天气"), - EqualsToken(Token::REGULAR, "很好")))); + IsOkAndHolds(ElementsAre(EqualsToken(Token::Type::QUERY_EXCLUSION, ""), + EqualsToken(Token::Type::REGULAR, "今天"), + EqualsToken(Token::Type::REGULAR, "天气"), + EqualsToken(Token::Type::REGULAR, "很好")))); } if (IsCfStringTokenization()) { EXPECT_THAT(raw_query_tokenizer->TokenizeAll("property1:你好"), - IsOkAndHolds( - ElementsAre(EqualsToken(Token::QUERY_PROPERTY, "property1"), - EqualsToken(Token::REGULAR, "你"), - EqualsToken(Token::REGULAR, "好")))); + IsOkAndHolds(ElementsAre( + EqualsToken(Token::Type::QUERY_PROPERTY, "property1"), + EqualsToken(Token::Type::REGULAR, "你"), + EqualsToken(Token::Type::REGULAR, "好")))); } else { EXPECT_THAT(raw_query_tokenizer->TokenizeAll("property1:你好"), - IsOkAndHolds( - ElementsAre(EqualsToken(Token::QUERY_PROPERTY, "property1"), - EqualsToken(Token::REGULAR, "你好")))); + IsOkAndHolds(ElementsAre( + EqualsToken(Token::Type::QUERY_PROPERTY, "property1"), + EqualsToken(Token::Type::REGULAR, "你好")))); } EXPECT_THAT( @@ -504,10 +517,11 @@ TEST_F(RawQueryTokenizerTest, CJKT) { StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT, HasSubstr("Characters in property name must all be ASCII"))); - EXPECT_THAT(raw_query_tokenizer->TokenizeAll("cat OR ねこ"), - IsOkAndHolds(ElementsAre(EqualsToken(Token::REGULAR, "cat"), - EqualsToken(Token::QUERY_OR, ""), - EqualsToken(Token::REGULAR, "ねこ")))); + EXPECT_THAT( + raw_query_tokenizer->TokenizeAll("cat OR ねこ"), + IsOkAndHolds(ElementsAre(EqualsToken(Token::Type::REGULAR, "cat"), + EqualsToken(Token::Type::QUERY_OR, ""), + EqualsToken(Token::Type::REGULAR, "ねこ")))); EXPECT_THAT( raw_query_tokenizer->TokenizeAll("cat ORねこ"), @@ -543,40 +557,45 @@ TEST_F(RawQueryTokenizerTest, OtherChars) { language_segmenter.get())); // Comma is ignored - EXPECT_THAT(raw_query_tokenizer->TokenizeAll(",term1, ,"), - IsOkAndHolds(ElementsAre(EqualsToken(Token::REGULAR, "term1")))); + EXPECT_THAT( + 
raw_query_tokenizer->TokenizeAll(",term1, ,"), + IsOkAndHolds(ElementsAre(EqualsToken(Token::Type::REGULAR, "term1")))); EXPECT_THAT(raw_query_tokenizer->TokenizeAll("(,term1),"), IsOkAndHolds(ElementsAre( - EqualsToken(Token::QUERY_LEFT_PARENTHESES, ""), - EqualsToken(Token::REGULAR, "term1"), - EqualsToken(Token::QUERY_RIGHT_PARENTHESES, "")))); + EqualsToken(Token::Type::QUERY_LEFT_PARENTHESES, ""), + EqualsToken(Token::Type::REGULAR, "term1"), + EqualsToken(Token::Type::QUERY_RIGHT_PARENTHESES, "")))); // Exclusion operator and comma are ignored - EXPECT_THAT(raw_query_tokenizer->TokenizeAll("-,term1"), - IsOkAndHolds(ElementsAre(EqualsToken(Token::REGULAR, "term1")))); + EXPECT_THAT( + raw_query_tokenizer->TokenizeAll("-,term1"), + IsOkAndHolds(ElementsAre(EqualsToken(Token::Type::REGULAR, "term1")))); - EXPECT_THAT(raw_query_tokenizer->TokenizeAll("-term1,"), - IsOkAndHolds(ElementsAre(EqualsToken(Token::QUERY_EXCLUSION, ""), - EqualsToken(Token::REGULAR, "term1")))); + EXPECT_THAT( + raw_query_tokenizer->TokenizeAll("-term1,"), + IsOkAndHolds(ElementsAre(EqualsToken(Token::Type::QUERY_EXCLUSION, ""), + EqualsToken(Token::Type::REGULAR, "term1")))); // Colon and comma are ignored - EXPECT_THAT(raw_query_tokenizer->TokenizeAll("property1:,term1"), - IsOkAndHolds(ElementsAre(EqualsToken(Token::REGULAR, "property1"), - EqualsToken(Token::REGULAR, "term1")))); - EXPECT_THAT( - raw_query_tokenizer->TokenizeAll("property1:term1,term2"), - IsOkAndHolds(ElementsAre(EqualsToken(Token::QUERY_PROPERTY, "property1"), - EqualsToken(Token::REGULAR, "term1"), - EqualsToken(Token::REGULAR, "term2")))); + raw_query_tokenizer->TokenizeAll("property1:,term1"), + IsOkAndHolds(ElementsAre(EqualsToken(Token::Type::REGULAR, "property1"), + EqualsToken(Token::Type::REGULAR, "term1")))); + + EXPECT_THAT(raw_query_tokenizer->TokenizeAll("property1:term1,term2"), + IsOkAndHolds(ElementsAre( + EqualsToken(Token::Type::QUERY_PROPERTY, "property1"), + EqualsToken(Token::Type::REGULAR, 
"term1"), + EqualsToken(Token::Type::REGULAR, "term2")))); // This is a special case for OR, unknown chars are treated the same as // whitespaces before and after OR. - EXPECT_THAT(raw_query_tokenizer->TokenizeAll("term1,OR,term2"), - IsOkAndHolds(ElementsAre(EqualsToken(Token::REGULAR, "term1"), - EqualsToken(Token::QUERY_OR, ""), - EqualsToken(Token::REGULAR, "term2")))); + EXPECT_THAT( + raw_query_tokenizer->TokenizeAll("term1,OR,term2"), + IsOkAndHolds(ElementsAre(EqualsToken(Token::Type::REGULAR, "term1"), + EqualsToken(Token::Type::QUERY_OR, ""), + EqualsToken(Token::Type::REGULAR, "term2")))); } TEST_F(RawQueryTokenizerTest, Mix) { @@ -593,37 +612,38 @@ TEST_F(RawQueryTokenizerTest, Mix) { EXPECT_THAT(raw_query_tokenizer->TokenizeAll( "こんにちはgood afternoon, title:今天 OR (ในวันนี้ -B12)"), IsOkAndHolds(ElementsAre( - EqualsToken(Token::REGULAR, "こんにちは"), - EqualsToken(Token::REGULAR, "good"), - EqualsToken(Token::REGULAR, "afternoon"), - EqualsToken(Token::QUERY_PROPERTY, "title"), - EqualsToken(Token::REGULAR, "今天"), - EqualsToken(Token::QUERY_OR, ""), - EqualsToken(Token::QUERY_LEFT_PARENTHESES, ""), - EqualsToken(Token::REGULAR, "ใน"), - EqualsToken(Token::REGULAR, "วันนี้"), - EqualsToken(Token::QUERY_EXCLUSION, ""), - EqualsToken(Token::REGULAR, "B12"), - EqualsToken(Token::QUERY_RIGHT_PARENTHESES, "")))); + EqualsToken(Token::Type::REGULAR, "こんにちは"), + EqualsToken(Token::Type::REGULAR, "good"), + EqualsToken(Token::Type::REGULAR, "afternoon"), + EqualsToken(Token::Type::QUERY_PROPERTY, "title"), + EqualsToken(Token::Type::REGULAR, "今天"), + EqualsToken(Token::Type::QUERY_OR, ""), + EqualsToken(Token::Type::QUERY_LEFT_PARENTHESES, ""), + EqualsToken(Token::Type::REGULAR, "ใน"), + EqualsToken(Token::Type::REGULAR, "วันนี้"), + EqualsToken(Token::Type::QUERY_EXCLUSION, ""), + EqualsToken(Token::Type::REGULAR, "B12"), + EqualsToken(Token::Type::QUERY_RIGHT_PARENTHESES, "")))); } else { ICING_ASSERT_OK_AND_ASSIGN( std::vector<Token> tokens, 
raw_query_tokenizer->TokenizeAll( "こんにちはgood afternoon, title:今天 OR (ในวันนี้ -B12)")); - EXPECT_THAT(tokens, - ElementsAre(EqualsToken(Token::REGULAR, "こんにちは"), - EqualsToken(Token::REGULAR, "good"), - EqualsToken(Token::REGULAR, "afternoon"), - EqualsToken(Token::QUERY_PROPERTY, "title"), - EqualsToken(Token::REGULAR, "今天"), - EqualsToken(Token::QUERY_OR, ""), - EqualsToken(Token::QUERY_LEFT_PARENTHESES, ""), - EqualsToken(Token::REGULAR, "ใน"), - EqualsToken(Token::REGULAR, "วัน"), - EqualsToken(Token::REGULAR, "นี้"), - EqualsToken(Token::QUERY_EXCLUSION, ""), - EqualsToken(Token::REGULAR, "B12"), - EqualsToken(Token::QUERY_RIGHT_PARENTHESES, ""))); + EXPECT_THAT( + tokens, + ElementsAre(EqualsToken(Token::Type::REGULAR, "こんにちは"), + EqualsToken(Token::Type::REGULAR, "good"), + EqualsToken(Token::Type::REGULAR, "afternoon"), + EqualsToken(Token::Type::QUERY_PROPERTY, "title"), + EqualsToken(Token::Type::REGULAR, "今天"), + EqualsToken(Token::Type::QUERY_OR, ""), + EqualsToken(Token::Type::QUERY_LEFT_PARENTHESES, ""), + EqualsToken(Token::Type::REGULAR, "ใน"), + EqualsToken(Token::Type::REGULAR, "วัน"), + EqualsToken(Token::Type::REGULAR, "นี้"), + EqualsToken(Token::Type::QUERY_EXCLUSION, ""), + EqualsToken(Token::Type::REGULAR, "B12"), + EqualsToken(Token::Type::QUERY_RIGHT_PARENTHESES, ""))); } } diff --git a/icing/tokenization/reverse_jni/reverse-jni-language-segmenter.cc b/icing/tokenization/reverse_jni/reverse-jni-language-segmenter.cc index b936f2b..cb474c6 100644 --- a/icing/tokenization/reverse_jni/reverse-jni-language-segmenter.cc +++ b/icing/tokenization/reverse_jni/reverse-jni-language-segmenter.cc @@ -43,45 +43,46 @@ class ReverseJniLanguageSegmenterIterator : public LanguageSegmenter::Iterator { // Advances to the next term. Returns false if it has reached the end. 
bool Advance() override { - // Prerequisite check - if (IsDone()) { - return false; - } + while (true) { + // Prerequisite check + if (IsDone()) { + return false; + } - if (term_end_exclusive_.utf16_index() == 0) { - int first = break_iterator_->First(); - if (!term_start_.MoveToUtf16(first)) { - // First is guaranteed to succeed and return a position within bonds. So - // the only possible failure could be an invalid sequence. Mark as DONE - // and return. + if (term_end_exclusive_.utf16_index() == 0) { + int first = break_iterator_->First(); + if (!term_start_.MoveToUtf16(first)) { + // First is guaranteed to succeed and return a position within bonds. + // So the only possible failure could be an invalid sequence. Mark as + // DONE and return. + MarkAsDone(); + return false; + } + } else { + term_start_ = term_end_exclusive_; + } + + int next_utf16_index_exclusive = break_iterator_->Next(); + // Reached the end + if (next_utf16_index_exclusive == ReverseJniBreakIterator::kDone) { + MarkAsDone(); + return false; + } + if (!term_end_exclusive_.MoveToUtf16(next_utf16_index_exclusive)) { + // next_utf16_index_exclusive is guaranteed to be within bonds thanks to + // the check for kDone above. So the only possible failure could be an + // invalid sequence. Mark as DONE and return. MarkAsDone(); return false; } - } else { - term_start_ = term_end_exclusive_; - } - - int next_utf16_index_exclusive = break_iterator_->Next(); - // Reached the end - if (next_utf16_index_exclusive == ReverseJniBreakIterator::kDone) { - MarkAsDone(); - return false; - } - if (!term_end_exclusive_.MoveToUtf16(next_utf16_index_exclusive)) { - // next_utf16_index_exclusive is guaranteed to be within bonds thanks to - // the check for kDone above. So the only possible failure could be an - // invalid sequence. Mark as DONE and return. - MarkAsDone(); - return false; - } - // Check if the current term is valid. We consider any term valid if its - // first character is valid. 
If it's not valid, then we need to advance to - // the next term. - if (IsValidTerm()) { - return true; + // Check if the current term is valid. We consider any term valid if its + // first character is valid. If it's not valid, then we need to advance to + // the next term. + if (IsValidTerm()) { + return true; + } } - return Advance(); } // Returns the current term. It can be called only when Advance() returns diff --git a/icing/tokenization/token.h b/icing/tokenization/token.h index dda9efc..0c268be 100644 --- a/icing/tokenization/token.h +++ b/icing/tokenization/token.h @@ -21,11 +21,14 @@ namespace icing { namespace lib { struct Token { - enum Type { + enum class Type { // Common types REGULAR, // A token without special meanings, the value of it will be // indexed or searched directly + VERBATIM, // A token that should be indexed and searched without any + // modifications to the raw text + // Types only used in raw query QUERY_OR, // Indicates OR logic between its left and right tokens QUERY_EXCLUSION, // Indicates exclusion operation on next token diff --git a/icing/tokenization/tokenizer-factory.cc b/icing/tokenization/tokenizer-factory.cc index 9b59acf..b2508f7 100644 --- a/icing/tokenization/tokenizer-factory.cc +++ b/icing/tokenization/tokenizer-factory.cc @@ -23,6 +23,7 @@ #include "icing/tokenization/plain-tokenizer.h" #include "icing/tokenization/raw-query-tokenizer.h" #include "icing/tokenization/tokenizer.h" +#include "icing/tokenization/verbatim-tokenizer.h" #include "icing/util/status-macros.h" namespace icing { @@ -38,6 +39,8 @@ CreateIndexingTokenizer(StringIndexingConfig::TokenizerType::Code type, switch (type) { case StringIndexingConfig::TokenizerType::PLAIN: return std::make_unique<PlainTokenizer>(lang_segmenter); + case StringIndexingConfig::TokenizerType::VERBATIM: + return std::make_unique<VerbatimTokenizer>(); case StringIndexingConfig::TokenizerType::NONE: [[fallthrough]]; default: diff --git a/icing/tokenization/tokenizer.h 
b/icing/tokenization/tokenizer.h index b4f0c6e..2bc18cc 100644 --- a/icing/tokenization/tokenizer.h +++ b/icing/tokenization/tokenizer.h @@ -43,6 +43,7 @@ class Tokenizer { enum Type { // Index tokenizers PLAIN, // Used to tokenize plain text input + VERBATIM, // Used to tokenize the input text in verbatim // Query tokenizers RAW_QUERY, // Used to tokenize raw queries @@ -83,22 +84,26 @@ class Tokenizer { // offset. // Ex. // auto iterator = tokenizer.Tokenize("foo bar baz").ValueOrDie(); - // iterator.ResetToTokenAfter(4); + // iterator.ResetToTokenStartingAfter(4); // // The first full token starting after position 4 (the 'b' in "bar") is // // "baz". // PrintToken(iterator.GetToken()); // prints "baz" - virtual bool ResetToTokenAfter(int32_t offset) { return false; } + virtual bool ResetToTokenStartingAfter(int32_t utf32_offset) { + return false; + } // Sets the tokenizer to point at the first token that *ends* *before* // offset. Returns false if there are no valid tokens ending // before offset. // Ex. // auto iterator = tokenizer.Tokenize("foo bar baz").ValueOrDie(); - // iterator.ResetToTokenBefore(4); + // iterator.ResetToTokenEndingBefore(4); // // The first full token ending before position 4 (the 'b' in "bar") is // // "foo". // PrintToken(iterator.GetToken()); // prints "foo" - virtual bool ResetToTokenBefore(int32_t offset) { return false; } + virtual bool ResetToTokenEndingBefore(int32_t utf32_offset) { + return false; + } virtual bool ResetToStart() { return false; } }; diff --git a/icing/tokenization/verbatim-tokenizer.cc b/icing/tokenization/verbatim-tokenizer.cc new file mode 100644 index 0000000..0d3a320 --- /dev/null +++ b/icing/tokenization/verbatim-tokenizer.cc @@ -0,0 +1,139 @@ +// Copyright (C) 2021 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "icing/tokenization/verbatim-tokenizer.h" + +#include "icing/text_classifier/lib3/utils/base/statusor.h" +#include "icing/util/character-iterator.h" +#include "icing/util/status-macros.h" + +namespace icing { +namespace lib { + +class VerbatimTokenIterator : public Tokenizer::Iterator { + public: + explicit VerbatimTokenIterator(std::string_view text) + : term_(std::move(text)) {} + + bool Advance() override { + if (term_.empty() || has_advanced_to_end_) { + return false; + } + + has_advanced_to_end_ = true; + return true; + } + + Token GetToken() const override { + if (term_.empty() || !has_advanced_to_end_) { + return Token(Token::Type::INVALID); + } + + return Token(Token::Type::VERBATIM, term_); + } + + libtextclassifier3::StatusOr<CharacterIterator> CalculateTokenStart() + override { + if (term_.empty()) { + return absl_ports::AbortedError( + "Could not calculate start of empty token."); + } + + return CharacterIterator(term_, 0, 0, 0); + } + + libtextclassifier3::StatusOr<CharacterIterator> CalculateTokenEndExclusive() + override { + if (term_.empty()) { + return absl_ports::AbortedError( + "Could not calculate end of empty token."); + } + + if (token_end_iterator_.utf8_index() >= 0) { + return token_end_iterator_; + } + + bool moved_to_token_end = token_end_iterator_.MoveToUtf8(term_.length()); + if (moved_to_token_end) { + return token_end_iterator_; + } else { + return absl_ports::AbortedError("Could not move to end of token."); + } + } + + bool ResetToTokenStartingAfter(int32_t utf32_offset) override { + // We can only reset 
to the sole verbatim token, so we must have a negative + // offset for it to be considered the token after. + if (utf32_offset < 0) { + // Because we are now at the sole verbatim token, we should ensure we can + // no longer advance past it. + has_advanced_to_end_ = true; + return true; + } + return false; + } + + bool ResetToTokenEndingBefore(int32_t utf32_offset) override { + // We can only reset to the sole verbatim token, so we must have an offset + // after the end of the token for the reset to be valid. This means the + // provided utf-32 offset must be equal to or greater than the utf-32 length + // of the token. + if (token_end_iterator_.utf8_index() < 0) { + // Moves one index past the end of the term. + bool moved_to_token_end = token_end_iterator_.MoveToUtf8(term_.length()); + if (!moved_to_token_end) { + // We're unable to reset as we failed to move to the end of the term. + return false; + } + } + + if (utf32_offset >= token_end_iterator_.utf32_index()) { + // Because we are now at the sole verbatim token, we should ensure we can + // no longer advance past it. 
+ has_advanced_to_end_ = true; + return true; + } + return false; + } + + bool ResetToStart() override { + has_advanced_to_end_ = true; + return true; + } + + private: + std::string_view term_; + CharacterIterator token_end_iterator_ = CharacterIterator(term_, -1, -1, -1); + // Used to determine whether we have advanced on the sole verbatim token + bool has_advanced_to_end_ = false; +}; + +libtextclassifier3::StatusOr<std::unique_ptr<Tokenizer::Iterator>> +VerbatimTokenizer::Tokenize(std::string_view text) const { + return std::make_unique<VerbatimTokenIterator>(text); +} + +libtextclassifier3::StatusOr<std::vector<Token>> VerbatimTokenizer::TokenizeAll( + std::string_view text) const { + ICING_ASSIGN_OR_RETURN(std::unique_ptr<Tokenizer::Iterator> iterator, + Tokenize(text)); + std::vector<Token> tokens; + while (iterator->Advance()) { + tokens.push_back(iterator->GetToken()); + } + return tokens; +} + +} // namespace lib +} // namespace icing diff --git a/icing/tokenization/verbatim-tokenizer.h b/icing/tokenization/verbatim-tokenizer.h new file mode 100644 index 0000000..8404cf1 --- /dev/null +++ b/icing/tokenization/verbatim-tokenizer.h @@ -0,0 +1,41 @@ +// Copyright (C) 2021 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#ifndef ICING_TOKENIZATION_VERBATIM_H_ +#define ICING_TOKENIZATION_VERBATIM_H_ + +#include <memory> +#include <string_view> +#include <vector> + +#include "icing/text_classifier/lib3/utils/base/statusor.h" +#include "icing/tokenization/tokenizer.h" + +namespace icing { +namespace lib { + +// Provides verbatim tokenization on input text +class VerbatimTokenizer : public Tokenizer { + public: + libtextclassifier3::StatusOr<std::unique_ptr<Tokenizer::Iterator>> Tokenize( + std::string_view text) const override; + + libtextclassifier3::StatusOr<std::vector<Token>> TokenizeAll( + std::string_view text) const override; +}; + +} // namespace lib +} // namespace icing + +#endif // ICING_TOKENIZATION_VERBATIM_H_ diff --git a/icing/tokenization/verbatim-tokenizer_test.cc b/icing/tokenization/verbatim-tokenizer_test.cc new file mode 100644 index 0000000..e38c7aa --- /dev/null +++ b/icing/tokenization/verbatim-tokenizer_test.cc @@ -0,0 +1,209 @@ +// Copyright (C) 2021 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include <string_view> + +#include "gmock/gmock.h" +#include "icing/portable/platform.h" +#include "icing/testing/common-matchers.h" +#include "icing/testing/icu-data-file-helper.h" +#include "icing/testing/jni-test-helpers.h" +#include "icing/testing/test-data.h" +#include "icing/tokenization/language-segmenter-factory.h" +#include "icing/tokenization/tokenizer-factory.h" +#include "icing/util/character-iterator.h" +#include "unicode/uloc.h" + +namespace icing { +namespace lib { +namespace { +using ::testing::ElementsAre; +using ::testing::Eq; +using ::testing::IsEmpty; + +class VerbatimTokenizerTest : public ::testing::Test { + protected: + void SetUp() override { + if (!IsCfStringTokenization() && !IsReverseJniTokenization()) { + ICING_ASSERT_OK( + // File generated via icu_data_file rule in //icing/BUILD. + icu_data_file_helper::SetUpICUDataFile( + GetTestFilePath("icing/icu.dat"))); + } + + jni_cache_ = GetTestJniCache(); + language_segmenter_factory::SegmenterOptions options(ULOC_US, + jni_cache_.get()); + ICING_ASSERT_OK_AND_ASSIGN( + language_segmenter_, + language_segmenter_factory::Create(std::move(options))); + } + + std::unique_ptr<const JniCache> jni_cache_; + std::unique_ptr<LanguageSegmenter> language_segmenter_; +}; + +TEST_F(VerbatimTokenizerTest, Empty) { + ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<Tokenizer> verbatim_tokenizer, + tokenizer_factory::CreateIndexingTokenizer( + StringIndexingConfig::TokenizerType::VERBATIM, + language_segmenter_.get())); + + EXPECT_THAT(verbatim_tokenizer->TokenizeAll(""), IsOkAndHolds(IsEmpty())); +} + +TEST_F(VerbatimTokenizerTest, Simple) { + ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<Tokenizer> verbatim_tokenizer, + tokenizer_factory::CreateIndexingTokenizer( + StringIndexingConfig::TokenizerType::VERBATIM, + language_segmenter_.get())); + + EXPECT_THAT( + verbatim_tokenizer->TokenizeAll("foo bar"), + IsOkAndHolds(ElementsAre(EqualsToken(Token::Type::VERBATIM, "foo bar")))); +} + 
+TEST_F(VerbatimTokenizerTest, Punctuation) {
+  ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<Tokenizer> verbatim_tokenizer,
+                             tokenizer_factory::CreateIndexingTokenizer(
+                                 StringIndexingConfig::TokenizerType::VERBATIM,
+                                 language_segmenter_.get()));
+
+  EXPECT_THAT(verbatim_tokenizer->TokenizeAll("Hello, world!"),
+              IsOkAndHolds(ElementsAre(
+                  EqualsToken(Token::Type::VERBATIM, "Hello, world!"))));
+}
+
+TEST_F(VerbatimTokenizerTest, InvalidTokenBeforeAdvancing) {
+  ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<Tokenizer> verbatim_tokenizer,
+                             tokenizer_factory::CreateIndexingTokenizer(
+                                 StringIndexingConfig::TokenizerType::VERBATIM,
+                                 language_segmenter_.get()));
+
+  constexpr std::string_view kText = "Hello, world!";
+  auto token_iterator = verbatim_tokenizer->Tokenize(kText).ValueOrDie();
+
+  // We should get an invalid token if we get the token before advancing.
+  EXPECT_THAT(token_iterator->GetToken(),
+              EqualsToken(Token::Type::INVALID, ""));
+}
+
+TEST_F(VerbatimTokenizerTest, ResetToTokenEndingBefore) {
+  ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<Tokenizer> verbatim_tokenizer,
+                             tokenizer_factory::CreateIndexingTokenizer(
+                                 StringIndexingConfig::TokenizerType::VERBATIM,
+                                 language_segmenter_.get()));
+
+  constexpr std::string_view kText = "Hello, world!";
+  auto token_iterator = verbatim_tokenizer->Tokenize(kText).ValueOrDie();
+
+  // Reset to the sole verbatim token. We provide an offset of 13 as it
+  // is larger than the final index (12) of the verbatim token.
+  EXPECT_TRUE(token_iterator->ResetToTokenEndingBefore(13));
+  EXPECT_THAT(token_iterator->GetToken(),
+              EqualsToken(Token::Type::VERBATIM, "Hello, world!"));
+
+  // Ensure our cached character iterator properly maintains the end of the
+  // verbatim token.
+ EXPECT_TRUE(token_iterator->ResetToTokenEndingBefore(13)); + EXPECT_THAT(token_iterator->GetToken(), + EqualsToken(Token::Type::VERBATIM, "Hello, world!")); + + // We should not be able to reset with an offset before or within + // the verbatim token's utf-32 length. + EXPECT_FALSE(token_iterator->ResetToTokenEndingBefore(0)); + EXPECT_FALSE(token_iterator->ResetToTokenEndingBefore(12)); +} + +TEST_F(VerbatimTokenizerTest, ResetToTokenStartingAfter) { + ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<Tokenizer> verbatim_tokenizer, + tokenizer_factory::CreateIndexingTokenizer( + StringIndexingConfig::TokenizerType::VERBATIM, + language_segmenter_.get())); + + constexpr std::string_view kText = "Hello, world!"; + auto token_iterator = verbatim_tokenizer->Tokenize(kText).ValueOrDie(); + + // Get token without resetting + EXPECT_TRUE(token_iterator->Advance()); + EXPECT_THAT(token_iterator->GetToken(), + EqualsToken(Token::Type::VERBATIM, "Hello, world!")); + + // We expect a sole verbatim token, so it's not possible to reset after the + // start of the token. + EXPECT_FALSE(token_iterator->ResetToTokenStartingAfter(1)); + + // We expect to be reset to the sole verbatim token when the offset is + // negative. 
+ EXPECT_TRUE(token_iterator->ResetToTokenStartingAfter(-1)); + EXPECT_THAT(token_iterator->GetToken(), + EqualsToken(Token::Type::VERBATIM, "Hello, world!")); +} + +TEST_F(VerbatimTokenizerTest, ResetToStart) { + ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<Tokenizer> verbatim_tokenizer, + tokenizer_factory::CreateIndexingTokenizer( + StringIndexingConfig::TokenizerType::VERBATIM, + language_segmenter_.get())); + + constexpr std::string_view kText = "Hello, world!"; + auto token_iterator = verbatim_tokenizer->Tokenize(kText).ValueOrDie(); + + // Get token without resetting + EXPECT_TRUE(token_iterator->Advance()); + EXPECT_THAT(token_iterator->GetToken(), + EqualsToken(Token::Type::VERBATIM, "Hello, world!")); + + // Retrieve token again after resetting to start + EXPECT_TRUE(token_iterator->ResetToStart()); + EXPECT_THAT(token_iterator->GetToken(), + EqualsToken(Token::Type::VERBATIM, "Hello, world!")); +} + +TEST_F(VerbatimTokenizerTest, CalculateTokenStart) { + ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<Tokenizer> verbatim_tokenizer, + tokenizer_factory::CreateIndexingTokenizer( + StringIndexingConfig::TokenizerType::VERBATIM, + language_segmenter_.get())); + + constexpr std::string_view kText = "Hello, world!"; + auto token_iterator = verbatim_tokenizer->Tokenize(kText).ValueOrDie(); + + ICING_ASSERT_OK_AND_ASSIGN(CharacterIterator start_character_iterator, + token_iterator->CalculateTokenStart()); + + // We should retrieve the character 'H', the first character of the token. 
+  EXPECT_THAT(start_character_iterator.GetCurrentChar(), Eq('H'));
+}
+
+TEST_F(VerbatimTokenizerTest, CalculateTokenEnd) {
+  ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<Tokenizer> verbatim_tokenizer,
+                             tokenizer_factory::CreateIndexingTokenizer(
+                                 StringIndexingConfig::TokenizerType::VERBATIM,
+                                 language_segmenter_.get()));
+
+  constexpr std::string_view kText = "Hello, world!";
+  auto token_iterator = verbatim_tokenizer->Tokenize(kText).ValueOrDie();
+
+  ICING_ASSERT_OK_AND_ASSIGN(CharacterIterator end_character_iterator,
+                             token_iterator->CalculateTokenEndExclusive());
+
+  // We should retrieve the null character, as the returned character
+  // iterator will be set one past the end of the token.
+  EXPECT_THAT(end_character_iterator.GetCurrentChar(), Eq('\0'));
+}
+
+}  // namespace
+}  // namespace lib
+}  // namespace icing
diff --git a/icing/transform/icu/icu-normalizer_benchmark.cc b/icing/transform/icu/icu-normalizer_benchmark.cc
index 8d09be2..fdd4c70 100644
--- a/icing/transform/icu/icu-normalizer_benchmark.cc
+++ b/icing/transform/icu/icu-normalizer_benchmark.cc
@@ -14,8 +14,8 @@
 #include "testing/base/public/benchmark.h"
 #include "gmock/gmock.h"
-#include "icing/helpers/icu/icu-data-file-helper.h"
 #include "icing/testing/common-matchers.h"
+#include "icing/testing/icu-data-file-helper.h"
 #include "icing/testing/test-data.h"
 #include "icing/transform/normalizer-factory.h"
 #include "icing/transform/normalizer.h"
diff --git a/icing/transform/icu/icu-normalizer_test.cc b/icing/transform/icu/icu-normalizer_test.cc
index a46fcc7..143da17 100644
--- a/icing/transform/icu/icu-normalizer_test.cc
+++ b/icing/transform/icu/icu-normalizer_test.cc
@@ -16,8 +16,8 @@
 #include "gmock/gmock.h"
 #include "gtest/gtest.h"
-#include "icing/helpers/icu/icu-data-file-helper.h"
 #include "icing/testing/common-matchers.h"
+#include "icing/testing/icu-data-file-helper.h"
 #include "icing/testing/icu-i18n-test-utils.h"
 #include "icing/testing/test-data.h"
 #include 
"icing/transform/normalizer-factory.h" diff --git a/icing/util/character-iterator.cc b/icing/util/character-iterator.cc index d483031..0ab1e50 100644 --- a/icing/util/character-iterator.cc +++ b/icing/util/character-iterator.cc @@ -49,6 +49,8 @@ bool CharacterIterator::MoveToUtf8(int desired_utf8_index) { } bool CharacterIterator::AdvanceToUtf8(int desired_utf8_index) { + ResetToStartIfNecessary(); + if (desired_utf8_index > text_.length()) { // Enforce the requirement. return false; @@ -120,6 +122,8 @@ bool CharacterIterator::MoveToUtf16(int desired_utf16_index) { } bool CharacterIterator::AdvanceToUtf16(int desired_utf16_index) { + ResetToStartIfNecessary(); + UChar32 uchar32 = cached_current_char_; while (utf16_index_ < desired_utf16_index) { uchar32 = @@ -190,6 +194,8 @@ bool CharacterIterator::MoveToUtf32(int desired_utf32_index) { } bool CharacterIterator::AdvanceToUtf32(int desired_utf32_index) { + ResetToStartIfNecessary(); + UChar32 uchar32 = cached_current_char_; while (utf32_index_ < desired_utf32_index) { uchar32 = @@ -249,5 +255,15 @@ bool CharacterIterator::RewindToUtf32(int desired_utf32_index) { return true; } +void CharacterIterator::ResetToStartIfNecessary() { + if (utf8_index_ < 0 || utf16_index_ < 0 || utf32_index_ < 0) { + utf8_index_ = 0; + utf16_index_ = 0; + utf32_index_ = 0; + cached_current_char_ = + i18n_utils::GetUChar32At(text_.data(), text_.length(), 0); + } +} + } // namespace lib } // namespace icing diff --git a/icing/util/character-iterator.h b/icing/util/character-iterator.h index c7569a7..893718a 100644 --- a/icing/util/character-iterator.h +++ b/icing/util/character-iterator.h @@ -99,6 +99,10 @@ class CharacterIterator { } private: + // Resets the character iterator to the start of the text if any of the + // indices are negative. 
+ void ResetToStartIfNecessary(); + std::string_view text_; UChar32 cached_current_char_; int utf8_index_; diff --git a/icing/util/character-iterator_test.cc b/icing/util/character-iterator_test.cc index 445f837..195a47b 100644 --- a/icing/util/character-iterator_test.cc +++ b/icing/util/character-iterator_test.cc @@ -231,5 +231,36 @@ TEST(CharacterIteratorTest, InvalidUtf) { EXPECT_THAT(iterator, Eq(exp_iterator)); } +TEST(CharacterIteratorTest, MoveToUtfNegativeIndex) { + constexpr std::string_view kText = "¿Dónde está la biblioteca?"; + + CharacterIterator iterator_utf8(kText, /*utf8_index=*/-1, /*utf16_index=*/0, + /*utf32_index=*/0); + // We should be able to successfully move when the index is negative. + EXPECT_THAT(iterator_utf8.MoveToUtf8(0), IsTrue()); + // The character cache should be reset and contain the first character when + // resetting to index 0. + EXPECT_THAT(UCharToString(iterator_utf8.GetCurrentChar()), Eq("¿")); + EXPECT_THAT(iterator_utf8.utf8_index(), Eq(0)); + EXPECT_THAT(iterator_utf8.utf16_index(), Eq(0)); + EXPECT_THAT(iterator_utf8.utf32_index(), Eq(0)); + + CharacterIterator iterator_utf16(kText, /*utf8_index=*/0, /*utf16_index=*/-1, + /*utf32_index=*/0); + EXPECT_THAT(iterator_utf16.MoveToUtf16(1), IsTrue()); + EXPECT_THAT(iterator_utf16.GetCurrentChar(), Eq('D')); + EXPECT_THAT(iterator_utf16.utf8_index(), Eq(2)); + EXPECT_THAT(iterator_utf16.utf16_index(), Eq(1)); + EXPECT_THAT(iterator_utf16.utf32_index(), Eq(1)); + + CharacterIterator iterator_utf32(kText, /*utf8_index=*/0, /*utf16_index=*/0, + /*utf32_index=*/-1); + EXPECT_THAT(iterator_utf32.MoveToUtf32(2), IsTrue()); + EXPECT_THAT(UCharToString(iterator_utf32.GetCurrentChar()), Eq("ó")); + EXPECT_THAT(iterator_utf32.utf8_index(), Eq(3)); + EXPECT_THAT(iterator_utf32.utf16_index(), Eq(2)); + EXPECT_THAT(iterator_utf32.utf32_index(), Eq(2)); +} + } // namespace lib } // namespace icing diff --git a/icing/util/document-validator_test.cc b/icing/util/document-validator_test.cc 
index cb013d7..2261c37 100644 --- a/icing/util/document-validator_test.cc +++ b/icing/util/document-validator_test.cc @@ -46,15 +46,15 @@ constexpr char kPropertyEmails[] = "emails"; constexpr char kDefaultNamespace[] = "icing"; constexpr char kDefaultString[] = "This is a string."; -constexpr PropertyConfigProto_Cardinality_Code CARDINALITY_OPTIONAL = - PropertyConfigProto_Cardinality_Code_OPTIONAL; -constexpr PropertyConfigProto_Cardinality_Code CARDINALITY_REQUIRED = - PropertyConfigProto_Cardinality_Code_REQUIRED; -constexpr PropertyConfigProto_Cardinality_Code CARDINALITY_REPEATED = - PropertyConfigProto_Cardinality_Code_REPEATED; - -constexpr PropertyConfigProto_DataType_Code TYPE_STRING = - PropertyConfigProto_DataType_Code_STRING; +constexpr PropertyConfigProto::Cardinality::Code CARDINALITY_OPTIONAL = + PropertyConfigProto::Cardinality::OPTIONAL; +constexpr PropertyConfigProto::Cardinality::Code CARDINALITY_REQUIRED = + PropertyConfigProto::Cardinality::REQUIRED; +constexpr PropertyConfigProto::Cardinality::Code CARDINALITY_REPEATED = + PropertyConfigProto::Cardinality::REPEATED; + +constexpr PropertyConfigProto::DataType::Code TYPE_STRING = + PropertyConfigProto::DataType::STRING; class DocumentValidatorTest : public ::testing::Test { protected: diff --git a/java/tests/instrumentation/src/com/google/android/icing/IcingSearchEngineTest.java b/java/tests/instrumentation/src/com/google/android/icing/IcingSearchEngineTest.java index cb28331..a46814c 100644 --- a/java/tests/instrumentation/src/com/google/android/icing/IcingSearchEngineTest.java +++ b/java/tests/instrumentation/src/com/google/android/icing/IcingSearchEngineTest.java @@ -53,7 +53,9 @@ import com.google.android.icing.proto.StringIndexingConfig; import com.google.android.icing.proto.StringIndexingConfig.TokenizerType; import com.google.android.icing.proto.SuggestionResponse; import com.google.android.icing.proto.SuggestionSpecProto; +import 
com.google.android.icing.proto.SuggestionSpecProto.SuggestionScoringSpecProto; import com.google.android.icing.proto.TermMatchType; +import com.google.android.icing.proto.TermMatchType.Code; import com.google.android.icing.proto.UsageReport; import com.google.android.icing.IcingSearchEngine; import java.io.File; @@ -650,7 +652,14 @@ public final class IcingSearchEngineTest { assertStatusOk(icingSearchEngine.put(emailDocument2).getStatus()); SuggestionSpecProto suggestionSpec = - SuggestionSpecProto.newBuilder().setPrefix("f").setNumToReturn(10).build(); + SuggestionSpecProto.newBuilder() + .setPrefix("f") + .setNumToReturn(10) + .setScoringSpec( + SuggestionScoringSpecProto.newBuilder() + .setScoringMatchType(Code.EXACT_ONLY) + .build()) + .build(); SuggestionResponse response = icingSearchEngine.searchSuggestions(suggestionSpec); assertStatusOk(response.getStatus()); diff --git a/proto/icing/proto/debug.proto b/proto/icing/proto/debug.proto new file mode 100644 index 0000000..504ae43 --- /dev/null +++ b/proto/icing/proto/debug.proto @@ -0,0 +1,127 @@ +// Copyright 2022 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +syntax = "proto2"; + +package icing.lib; + +import "icing/proto/schema.proto"; +import "icing/proto/status.proto"; +import "icing/proto/storage.proto"; + +option java_package = "com.google.android.icing.proto"; +option java_multiple_files = true; +option objc_class_prefix = "ICNG"; + +// Next tag: 4 +message IndexDebugInfoProto { + // Storage information of the index. + optional IndexStorageInfoProto index_storage_info = 1; + + message MainIndexDebugInfoProto { + // Information about the main lexicon. + // TODO(b/222349894) Convert the string output to a protocol buffer instead. + optional string lexicon_info = 1; + + // Last added document id. + optional uint32 last_added_document_id = 2; + + // If verbosity > 0, return information about the posting list storage. + // TODO(b/222349894) Convert the string output to a protocol buffer instead. + optional string flash_index_storage_info = 3; + } + optional MainIndexDebugInfoProto main_index_info = 2; + + message LiteIndexDebugInfoProto { + // Current number of hits. + optional uint32 curr_size = 1; + + // The maximum possible number of hits. + optional uint32 hit_buffer_size = 2; + + // Last added document id. + optional uint32 last_added_document_id = 3; + + // The first position in the hit buffer that is not sorted yet, + // or curr_size if all hits are sorted. + optional uint32 searchable_end = 4; + + // The most recent checksum of the lite index, by calling + // LiteIndex::ComputeChecksum(). + optional uint32 index_crc = 5; + + // Information about the lite lexicon. + // TODO(b/222349894) Convert the string output to a protocol buffer instead. + optional string lexicon_info = 6; + } + optional LiteIndexDebugInfoProto lite_index_info = 3; +} + +// Next tag: 4 +message DocumentDebugInfoProto { + // Storage information of the document store. + optional DocumentStorageInfoProto document_storage_info = 1; + + // The most recent checksum of the document store, by calling + // DocumentStore::ComputeChecksum(). 
+ optional uint32 crc = 2; + + message CorpusInfo { + optional string namespace = 1; + optional string schema = 2; + optional uint32 total_documents = 3; + optional uint32 total_token = 4; + } + + // If verbosity > 0, return the total number of documents and tokens in each + // (namespace, schema type) pair. + // Note that deleted and expired documents are skipped in the output. + repeated CorpusInfo corpus_info = 3; +} + +// Next tag: 3 +message SchemaDebugInfoProto { + // Copy of the SchemaProto if it has been set in the schema store. + // Modifying this does not affect the Schema that IcingSearchEngine holds. + optional SchemaProto schema = 1; + + // The most recent checksum of the schema store, by calling + // SchemaStore::ComputeChecksum(). + optional uint32 crc = 2; +} + +// Next tag: 4 +message DebugInfoProto { + // Debug information of the index. + optional IndexDebugInfoProto index_info = 1; + + // Debug information of the document store. + optional DocumentDebugInfoProto document_info = 2; + + // Debug information of the schema store. + optional SchemaDebugInfoProto schema_info = 3; +} + +// Next tag: 3 +message DebugInfoResultProto { + // Status code can be one of: + // OK + // FAILED_PRECONDITION + // + // See status.proto for more details. + optional StatusProto status = 1; + + // Debug information for Icing. + optional DebugInfoProto debug_info = 2; +} diff --git a/proto/icing/proto/document.proto b/proto/icing/proto/document.proto index 2e8321b..1a501e7 100644 --- a/proto/icing/proto/document.proto +++ b/proto/icing/proto/document.proto @@ -209,7 +209,7 @@ message DeleteBySchemaTypeResultProto { } // Result of a call to IcingSearchEngine.DeleteByQuery -// Next tag: 4 +// Next tag: 5 message DeleteByQueryResultProto { // Status code can be one of: // OK @@ -226,5 +226,18 @@ message DeleteByQueryResultProto { // Stats for delete execution performance. 
optional DeleteByQueryStatsProto delete_by_query_stats = 3; + // Used by DeleteByQueryResultProto to return information about deleted + // documents. + message DocumentGroupInfo { + optional string namespace = 1; + optional string schema = 2; + repeated string uris = 3; + } + + // Additional return message that shows the uris of the deleted documents, if + // users set return_deleted_document_info to true. + // The result is grouped by the corresponding namespace and type. + repeated DocumentGroupInfo deleted_documents = 4; + reserved 2; } diff --git a/proto/icing/proto/initialize.proto b/proto/icing/proto/initialize.proto index ab2556d..7fe1e6f 100644 --- a/proto/icing/proto/initialize.proto +++ b/proto/icing/proto/initialize.proto @@ -30,19 +30,6 @@ message IcingSearchEngineOptions { // the index saved by the last instance. optional string base_dir = 1; - // The maximum number of tokens to be allowed per document. If a document - // exceeds this number of tokens, then only the first max_tokens_per_doc - // will be indexed. - // - // Clients may use this value to prevent the possibility of a select few - // documents from exhausting limits in the index that are shared between all - // documents (ie max allowed index size). - // - // Valid values: [1, INT_MAX], Current default is 1/5 of the default of - // max_document_size. - // Optional. - optional int32 max_tokens_per_doc = 2 [default = 13107]; - // The maximum allowable token length. All tokens in excess of this size // will be truncated to max_token_length before being indexed. // @@ -70,6 +57,8 @@ message IcingSearchEngineOptions { // Valid values: [1, INT_MAX] // Optional. 
optional int32 index_merge_size = 4 [default = 1048576]; // 1 MiB + + reserved 2; } // Result of a call to IcingSearchEngine.Initialize diff --git a/proto/icing/proto/logging.proto b/proto/icing/proto/logging.proto index 2f1f271..0a7c4a6 100644 --- a/proto/icing/proto/logging.proto +++ b/proto/icing/proto/logging.proto @@ -46,6 +46,9 @@ message InitializeStatsProto { // Random I/O errors. IO_ERROR = 4; + + // The document log is using legacy format. + LEGACY_DOCUMENT_LOG_FORMAT = 5; } // Possible recovery causes for document store: diff --git a/proto/icing/proto/schema.proto b/proto/icing/proto/schema.proto index c611cbf..ffb6f2c 100644 --- a/proto/icing/proto/schema.proto +++ b/proto/icing/proto/schema.proto @@ -91,6 +91,14 @@ message StringIndexingConfig { // Tokenization for plain text. PLAIN = 1; + + // Tokenizes text in verbatim. This means no normalization or segmentation + // is applied to string values that are tokenized using this type. + // Therefore, the output token is equivalent to the raw string text. For + // example, "Hello, world!" would be tokenized as "Hello, world!" + // preserving punctuation and capitalization, and not creating separate + // tokens between the space. + VERBATIM = 2; } } optional TokenizerType.Code tokenizer_type = 2; diff --git a/proto/icing/proto/scoring.proto b/proto/icing/proto/scoring.proto index a3a64df..71c943e 100644 --- a/proto/icing/proto/scoring.proto +++ b/proto/icing/proto/scoring.proto @@ -116,8 +116,9 @@ message PropertyWeight { // specified, the property weight is discarded. optional string path = 1; - // Property weight, valid values are positive. Zero and negative weights are - // invalid and will result in an error. By default, a property is given a raw, - // pre-normalized weight of 1.0. + // Property weight, valid values are positive and zero. Setting a zero + // property weight will remove scoring contribution for a query term match in + // the property. 
Negative weights are invalid and will result in an error. + // By default, a property is given a raw, pre-normalized weight of 1.0. optional double weight = 2; } diff --git a/proto/icing/proto/search.proto b/proto/icing/proto/search.proto index c712ab2..f005c76 100644 --- a/proto/icing/proto/search.proto +++ b/proto/icing/proto/search.proto @@ -85,16 +85,16 @@ message ResultSpecProto { // have snippet information provided. If set to 0, snippeting is disabled. optional int32 num_matches_per_property = 2; - // How large of a window to provide. Windows start at max_window_bytes / 2 - // bytes before the middle of the matching token and end at max_window_bytes - // / 2 bytes after the middle of the matching token. Windowing respects - // token boundaries. - // Therefore, the returned window may be smaller than requested. Setting - // max_window_bytes to 0 will disable windowing information. If matches - // enabled is also set to false, then snippeting is disabled. - // Ex. max_window_bytes = 16. "foo bar baz bat rat" with a query of "baz" + // How large of a window to provide. Windows start at + // max_window_utf32_length / 2 bytes before the middle of the matching token + // and end at max_window_utf32_length / 2 bytes after the middle of the + // matching token. Windowing respects token boundaries. Therefore, the + // returned window may be smaller than requested. Setting + // max_window_utf32_length to 0 will disable windowing information. If + // matches enabled is also set to false, then snippeting is disabled. Ex. + // max_window_utf32_length = 16. "foo bar baz bat rat" with a query of "baz" // will return a window of "bar baz bat" which is only 11 bytes long. 
- optional int32 max_window_bytes = 3; + optional int32 max_window_utf32_length = 3; } optional SnippetSpecProto snippet_spec = 3; @@ -309,7 +309,7 @@ message GetResultSpecProto { repeated TypePropertyMask type_property_masks = 1; } -// Next tag: 4 +// Next tag: 5 message SuggestionSpecProto { // REQUIRED: The "raw" prefix string that users may type. For example, "f" // will search for suggested query that start with "f" like "foo", "fool". @@ -323,6 +323,23 @@ message SuggestionSpecProto { // REQUIRED: The number of suggestions to be returned. optional int32 num_to_return = 3; + + // Indicates how the suggestion terms should be scored and ranked. + message SuggestionScoringSpecProto { + // TermMatchType.Code=UNKNOWN + // Should never purposely be set and may lead to undefined behavior. This is + // used for backwards compatibility reasons. + // + // TermMatchType.Code=EXACT_ONLY + // Only exact hits will be counted to score a suggestion term. + // + // TermMatchType.Code=PREFIX + // Both exact hits and prefix hits will be counted to score a suggestion + // term. + optional TermMatchType.Code scoring_match_type = 1; + } + + optional SuggestionScoringSpecProto scoring_spec = 4; } // Next tag: 3 diff --git a/synced_AOSP_CL_number.txt b/synced_AOSP_CL_number.txt index 7e0431b..73d349b 100644 --- a/synced_AOSP_CL_number.txt +++ b/synced_AOSP_CL_number.txt @@ -1 +1 @@ -set(synced_AOSP_CL_number=404879391) +set(synced_AOSP_CL_number=436284873) |