author    Alexander Dorokhine <adorokhine@google.com>  2022-03-22 22:55:15 -0700
committer Tim Barron <tjbarron@google.com>  2022-03-23 17:04:14 +0000
commit    9ab600c39d0b5c87fc7dc4d8155d1efb535f1608 (patch)
tree      93ed846d985900e348c166b14818348705d46ea9
parent    19600c2c36c5add7e7a792b7e4f742d45b3f871f (diff)
parent    c4f46ed536752b4c07f7696e65ff79c2d5086f3f (diff)
download  icing-9ab600c39d0b5c87fc7dc4d8155d1efb535f1608.tar.gz
Merge remote-tracking branch 'goog/androidx-platform-dev' into tm-dev

* goog/androidx-platform-dev:
  Sync from upstream.
  Sync from upstream.
  Sync from upstream.

Descriptions:
  - Add some additional logging that will help diagnose b/218413237
  - Mark VerbatimTokenizer::ResetToTokenStartingAfter as 'override'.
  - Support dump function for SchemaStore
  - Refactor DocumentStore::Initialize to improve readability of document store recovery.
  - Remove non-NDK API usages of ICU4C in libicing.
  - Move IcuDataFileHelper to the testing directory since it is a test-only util.
  - Support dump function for DocumentStore
  - Switch to use PRead rather than MMap in the proto log.
  - Support dump function for main/lite index and lexicon
  - Fix LiteIndex::AppendHits
  - Enable and fix DocumentStoreTest.LoadScoreCacheAndInitializeSuccessfully
  - Fix MainIndex::GetStorageInfo.
  - Fix icing-search-engine_fuzz_test by making IcuLanguageSegmenterIterator::Advance non-recursive.
  - Allow returning additional information for deleted documents in DeleteByQuery
  - Use enum class in Token::Type for better type safety.
  - Normalize tokens by token type when retrieving snippets
  - Rename max_window_bytes to max_window_utf32_length; delete the max_tokens_per_doc field in IcingSearchEngineOptions.
  - Handle suggestion namespace ownership.
  - Fix "OkStatus() is not a valid argument to StatusOr" in MainIndex::RetrieveMoreHits.
  - Allow advancing when current indices are negative in CharacterIterator
  - Add support for verbatim tokenization and indexing in IcingLib
  - Rename TokenizerIterator Reset functions
  - Add term_match_type to SuggestionSpec proto
  - Unify the C++ proto enum style
  - Allow zero property weights in IcingLib

Bug: 152934343
Bug: 158089703
Bug: 185845269
Bug: 203700301
Bug: 204333391
Bug: 205209589
Bug: 206147728
Bug: 209071710
Bug: 209993976
Bug: 218413237
Bug: 223549255
Test: Presubmit
Change-Id: I96665ba718f89e69ca99cd833ad80fa555edf436
-rw-r--r--  icing/file/file-backed-bitmap.cc  10
-rw-r--r--  icing/file/filesystem.h  5
-rw-r--r--  icing/file/portable-file-backed-proto-log.h  133
-rw-r--r--  icing/file/portable-file-backed-proto-log_benchmark.cc  12
-rw-r--r--  icing/file/portable-file-backed-proto-log_test.cc  5
-rw-r--r--  icing/icing-search-engine-with-icu-file_test.cc  10
-rw-r--r--  icing/icing-search-engine.cc  112
-rw-r--r--  icing/icing-search-engine.h  5
-rw-r--r--  icing/icing-search-engine_fuzz_test.cc  12
-rw-r--r--  icing/icing-search-engine_test.cc  489
-rw-r--r--  icing/index/index-processor.cc  20
-rw-r--r--  icing/index/index-processor.h  2
-rw-r--r--  icing/index/index-processor_benchmark.cc  2
-rw-r--r--  icing/index/index-processor_test.cc  131
-rw-r--r--  icing/index/index.cc  61
-rw-r--r--  icing/index/index.h  27
-rw-r--r--  icing/index/index_test.cc  391
-rw-r--r--  icing/index/iterator/doc-hit-info-iterator-section-restrict_test.cc  10
-rw-r--r--  icing/index/lite/doc-hit-info-iterator-term-lite.cc  5
-rw-r--r--  icing/index/lite/lite-index.cc  47
-rw-r--r--  icing/index/lite/lite-index.h  12
-rw-r--r--  icing/index/lite/lite-index_test.cc  110
-rw-r--r--  icing/index/main/flash-index-storage.h  1
-rw-r--r--  icing/index/main/main-index.cc  86
-rw-r--r--  icing/index/main/main-index.h  17
-rw-r--r--  icing/index/main/main-index_test.cc  28
-rw-r--r--  icing/legacy/index/icing-dynamic-trie.cc  19
-rw-r--r--  icing/legacy/index/icing-dynamic-trie.h  9
-rw-r--r--  icing/legacy/index/icing-filesystem.h  5
-rw-r--r--  icing/legacy/index/icing-flash-bitmap.h  1
-rw-r--r--  icing/query/query-processor_benchmark.cc  2
-rw-r--r--  icing/query/query-processor_test.cc  16
-rw-r--r--  icing/query/suggestion-processor.cc  11
-rw-r--r--  icing/query/suggestion-processor.h  2
-rw-r--r--  icing/query/suggestion-processor_test.cc  80
-rw-r--r--  icing/result/result-retriever_test.cc  20
-rw-r--r--  icing/result/result-state-manager_test.cc  4
-rw-r--r--  icing/result/result-state_test.cc  4
-rw-r--r--  icing/result/snippet-retriever.cc  94
-rw-r--r--  icing/result/snippet-retriever_test.cc  173
-rw-r--r--  icing/schema/schema-store.cc  21
-rw-r--r--  icing/schema/schema-store.h  12
-rw-r--r--  icing/schema/schema-store_test.cc  55
-rw-r--r--  icing/schema/schema-util_test.cc  52
-rw-r--r--  icing/scoring/scorer_test.cc  8
-rw-r--r--  icing/scoring/scoring-processor_test.cc  81
-rw-r--r--  icing/scoring/section-weights.cc  27
-rw-r--r--  icing/scoring/section-weights_test.cc  109
-rw-r--r--  icing/store/document-log-creator.cc  9
-rw-r--r--  icing/store/document-log-creator.h  12
-rw-r--r--  icing/store/document-store.cc  152
-rw-r--r--  icing/store/document-store.h  19
-rw-r--r--  icing/store/document-store_benchmark.cc  10
-rw-r--r--  icing/store/document-store_test.cc  232
-rw-r--r--  icing/store/namespace-checker-impl.h  51
-rw-r--r--  icing/store/namespace-checker.h  42
-rw-r--r--  icing/testing/always-true-namespace-checker-impl.h  34
-rw-r--r--  icing/testing/icu-data-file-helper.cc (renamed from icing/helpers/icu/icu-data-file-helper.cc)  2
-rw-r--r--  icing/testing/icu-data-file-helper.h (renamed from icing/helpers/icu/icu-data-file-helper.h)  6
-rw-r--r--  icing/testing/random-string.h  1
-rw-r--r--  icing/tokenization/icu/icu-language-segmenter.cc  54
-rw-r--r--  icing/tokenization/icu/icu-language-segmenter_test.cc  2
-rw-r--r--  icing/tokenization/language-segmenter-iterator_test.cc  2
-rw-r--r--  icing/tokenization/language-segmenter_benchmark.cc  2
-rw-r--r--  icing/tokenization/plain-tokenizer.cc  16
-rw-r--r--  icing/tokenization/plain-tokenizer_test.cc  272
-rw-r--r--  icing/tokenization/raw-query-tokenizer.cc  22
-rw-r--r--  icing/tokenization/raw-query-tokenizer_test.cc  524
-rw-r--r--  icing/tokenization/reverse_jni/reverse-jni-language-segmenter.cc  67
-rw-r--r--  icing/tokenization/token.h  5
-rw-r--r--  icing/tokenization/tokenizer-factory.cc  3
-rw-r--r--  icing/tokenization/tokenizer.h  13
-rw-r--r--  icing/tokenization/verbatim-tokenizer.cc  139
-rw-r--r--  icing/tokenization/verbatim-tokenizer.h  41
-rw-r--r--  icing/tokenization/verbatim-tokenizer_test.cc  209
-rw-r--r--  icing/transform/icu/icu-normalizer_benchmark.cc  2
-rw-r--r--  icing/transform/icu/icu-normalizer_test.cc  2
-rw-r--r--  icing/util/character-iterator.cc  16
-rw-r--r--  icing/util/character-iterator.h  4
-rw-r--r--  icing/util/character-iterator_test.cc  31
-rw-r--r--  icing/util/document-validator_test.cc  18
-rw-r--r--  java/tests/instrumentation/src/com/google/android/icing/IcingSearchEngineTest.java  11
-rw-r--r--  proto/icing/proto/debug.proto  127
-rw-r--r--  proto/icing/proto/document.proto  15
-rw-r--r--  proto/icing/proto/initialize.proto  15
-rw-r--r--  proto/icing/proto/logging.proto  3
-rw-r--r--  proto/icing/proto/schema.proto  8
-rw-r--r--  proto/icing/proto/scoring.proto  7
-rw-r--r--  proto/icing/proto/search.proto  37
-rw-r--r--  synced_AOSP_CL_number.txt  2
90 files changed, 3432 insertions, 1355 deletions
diff --git a/icing/file/file-backed-bitmap.cc b/icing/file/file-backed-bitmap.cc
index f1e568c..eec7668 100644
--- a/icing/file/file-backed-bitmap.cc
+++ b/icing/file/file-backed-bitmap.cc
@@ -50,7 +50,7 @@ FileBackedBitmap::Create(const Filesystem* filesystem,
auto bitmap = std::unique_ptr<FileBackedBitmap>(
new FileBackedBitmap(filesystem, file_path, mmap_strategy));
- // TODO(b/144458732): Implement a more robust version of TC_RETURN_IF_ERROR
+ // TODO(b/216487496): Implement a more robust version of TC_RETURN_IF_ERROR
// that can support error logging.
libtextclassifier3::Status status = bitmap->Initialize();
if (!status.ok()) {
@@ -122,7 +122,7 @@ libtextclassifier3::Status FileBackedBitmap::FileBackedBitmap::Initialize() {
<< " of size: " << file_size;
}
- // TODO(b/144458732): Implement a more robust version of TC_RETURN_IF_ERROR
+ // TODO(b/216487496): Implement a more robust version of TC_RETURN_IF_ERROR
// that can support error logging.
libtextclassifier3::Status status = mmapper_->Remap(0, file_size);
if (!status.ok()) {
@@ -198,7 +198,7 @@ int FileBackedBitmap::NumBits() const {
libtextclassifier3::Status FileBackedBitmap::Set(int bit_index,
bool bit_value) {
if (bit_index >= NumBits()) {
- // TODO(b/144458732): Implement a more robust version of TC_RETURN_IF_ERROR
+ // TODO(b/216487496): Implement a more robust version of TC_RETURN_IF_ERROR
// that can support error logging.
libtextclassifier3::Status status = GrowTo(bit_index);
if (!status.ok()) {
@@ -261,7 +261,7 @@ libtextclassifier3::Status FileBackedBitmap::GrowTo(int new_num_bits) {
file_path_.c_str(), new_file_size));
}
- // TODO(b/144458732): Implement a more robust version of TC_RETURN_IF_ERROR
+ // TODO(b/216487496): Implement a more robust version of TC_RETURN_IF_ERROR
// that can support error logging.
libtextclassifier3::Status status = mmapper_->Remap(0, new_file_size);
if (!status.ok()) {
@@ -281,7 +281,7 @@ libtextclassifier3::Status FileBackedBitmap::TruncateTo(int new_num_bits) {
}
const size_t new_file_size = FileSizeForBits(new_num_bits);
- // TODO(b/144458732): Implement a more robust version of TC_RETURN_IF_ERROR
+ // TODO(b/216487496): Implement a more robust version of TC_RETURN_IF_ERROR
// that can support error logging.
libtextclassifier3::Status status = mmapper_->Remap(0, new_file_size);
if (!status.ok()) {
diff --git a/icing/file/filesystem.h b/icing/file/filesystem.h
index ca8c4a8..dd2c5d1 100644
--- a/icing/file/filesystem.h
+++ b/icing/file/filesystem.h
@@ -233,6 +233,11 @@ class Filesystem {
// Increments to_increment by size if size is valid, or sets to_increment
// to kBadFileSize if either size or to_increment is kBadFileSize.
static void IncrementByOrSetInvalid(int64_t size, int64_t* to_increment);
+
+ // Return -1 if file_size is invalid. Otherwise, return file_size.
+ static int64_t SanitizeFileSize(int64_t file_size) {
+ return (file_size != kBadFileSize) ? file_size : -1;
+ }
};
// LINT.ThenChange(//depot/google3/icing/file/mock-filesystem.h)
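
The SanitizeFileSize helper added above collapses the kBadFileSize sentinel to -1 so that callers can fill stats fields in one line instead of branching at every call site (icing-search-engine.cc later in this patch adopts exactly this pattern). A minimal standalone sketch of the idea, assuming a stand-in sentinel value for illustration:

    #include <cstdint>
    #include <cstdio>
    #include <limits>

    // Assumption: max int64 as the invalid-size sentinel, standing in for
    // icing's real Filesystem::kBadFileSize.
    constexpr int64_t kBadFileSize = std::numeric_limits<int64_t>::max();

    // Mirrors the helper above: -1 for invalid sizes, pass-through otherwise.
    int64_t SanitizeFileSize(int64_t file_size) {
      return (file_size != kBadFileSize) ? file_size : -1;
    }

    int main() {
      printf("%lld\n", static_cast<long long>(SanitizeFileSize(1024)));          // 1024
      printf("%lld\n", static_cast<long long>(SanitizeFileSize(kBadFileSize)));  // -1
      return 0;
    }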
diff --git a/icing/file/portable-file-backed-proto-log.h b/icing/file/portable-file-backed-proto-log.h
index f676dc5..409ab96 100644
--- a/icing/file/portable-file-backed-proto-log.h
+++ b/icing/file/portable-file-backed-proto-log.h
@@ -124,6 +124,8 @@ class PortableFileBackedProtoLog {
public:
static constexpr int32_t kMagic = 0xf4c6f67a;
+ // We should go directly from 0 to 2 the next time we have to change the
+ // format.
static constexpr int32_t kFileFormatVersion = 0;
uint32_t CalculateHeaderChecksum() const {
@@ -282,7 +284,7 @@ class PortableFileBackedProtoLog {
// before updating our checksum.
bool recalculated_checksum = false;
- bool has_data_loss() {
+ bool has_data_loss() const {
return data_loss == DataLoss::PARTIAL || data_loss == DataLoss::COMPLETE;
}
};
@@ -376,8 +378,7 @@ class PortableFileBackedProtoLog {
// }
class Iterator {
public:
- Iterator(const Filesystem& filesystem, const std::string& file_path,
- int64_t initial_offset);
+ Iterator(const Filesystem& filesystem, int fd, int64_t initial_offset);
// Advances to the position of next proto whether it has been erased or not.
//
@@ -393,11 +394,12 @@ class PortableFileBackedProtoLog {
private:
static constexpr int64_t kInvalidOffset = -1;
// Used to read proto metadata
- MemoryMappedFile mmapped_file_;
// Offset of first proto
+ const Filesystem* const filesystem_;
int64_t initial_offset_;
int64_t current_offset_;
int64_t file_size_;
+ int fd_;
};
// Returns an iterator of current proto log. The caller needs to keep the
@@ -513,7 +515,7 @@ class PortableFileBackedProtoLog {
const Filesystem* filesystem, const std::string& file_path,
Crc32 initial_crc, int64_t start, int64_t end);
- // Reads out the metadata of a proto located at file_offset from the file.
+ // Reads out the metadata of a proto located at file_offset from the fd.
// Metadata will be returned in host byte order endianness.
//
// Returns:
@@ -521,7 +523,8 @@ class PortableFileBackedProtoLog {
// OUT_OF_RANGE_ERROR if file_offset exceeds file_size
// INTERNAL_ERROR if the metadata is invalid or any IO errors happen
static libtextclassifier3::StatusOr<int32_t> ReadProtoMetadata(
- MemoryMappedFile* mmapped_file, int64_t file_offset, int64_t file_size);
+ const Filesystem* const filesystem, int fd, int64_t file_offset,
+ int64_t file_size);
// Writes metadata of a proto to the fd. Takes in a host byte order endianness
// metadata and converts it into a portable metadata before writing.
@@ -937,35 +940,37 @@ template <typename ProtoT>
libtextclassifier3::StatusOr<ProtoT>
PortableFileBackedProtoLog<ProtoT>::ReadProto(int64_t file_offset) const {
int64_t file_size = filesystem_->GetFileSize(fd_.get());
- MemoryMappedFile mmapped_file(*filesystem_, file_path_,
- MemoryMappedFile::Strategy::READ_ONLY);
- if (file_offset >= file_size) {
- // file_size points to the next byte to write at, so subtract one to get
- // the inclusive, actual size of file.
- return absl_ports::OutOfRangeError(
- IcingStringUtil::StringPrintf("Trying to read from a location, %lld, "
- "out of range of the file size, %lld",
- static_cast<long long>(file_offset),
- static_cast<long long>(file_size - 1)));
- }
-
// Read out the metadata
+ if (file_size == Filesystem::kBadFileSize) {
+ return absl_ports::OutOfRangeError("Unable to correctly read size.");
+ }
ICING_ASSIGN_OR_RETURN(
int32_t metadata,
- ReadProtoMetadata(&mmapped_file, file_offset, file_size));
+ ReadProtoMetadata(filesystem_, fd_.get(), file_offset, file_size));
// Copy out however many bytes it says the proto is
int stored_size = GetProtoSize(metadata);
+ file_offset += sizeof(metadata);
- ICING_RETURN_IF_ERROR(
- mmapped_file.Remap(file_offset + sizeof(metadata), stored_size));
+ // Read the compressed proto out.
+ if (file_offset + stored_size > file_size) {
+ return absl_ports::OutOfRangeError(
+ IcingStringUtil::StringPrintf("Trying to read from a location, %lld, "
+ "out of range of the file size, %lld",
+ static_cast<long long>(file_offset),
+ static_cast<long long>(file_size - 1)));
+ }
+ auto buf = std::make_unique<char[]>(stored_size);
+ if (!filesystem_->PRead(fd_.get(), buf.get(), stored_size, file_offset)) {
+ return absl_ports::InternalError("");
+ }
- if (IsEmptyBuffer(mmapped_file.region(), mmapped_file.region_size())) {
+ if (IsEmptyBuffer(buf.get(), stored_size)) {
return absl_ports::NotFoundError("The proto data has been erased.");
}
- google::protobuf::io::ArrayInputStream proto_stream(
- mmapped_file.mutable_region(), stored_size);
+ google::protobuf::io::ArrayInputStream proto_stream(buf.get(),
+ stored_size);
// Deserialize proto
ProtoT proto;
@@ -983,33 +988,29 @@ template <typename ProtoT>
libtextclassifier3::Status PortableFileBackedProtoLog<ProtoT>::EraseProto(
int64_t file_offset) {
int64_t file_size = filesystem_->GetFileSize(fd_.get());
- if (file_offset >= file_size) {
- // file_size points to the next byte to write at, so subtract one to get
- // the inclusive, actual size of file.
- return absl_ports::OutOfRangeError(IcingStringUtil::StringPrintf(
- "Trying to erase data at a location, %lld, "
- "out of range of the file size, %lld",
- static_cast<long long>(file_offset),
- static_cast<long long>(file_size - 1)));
+ if (file_size == Filesystem::kBadFileSize) {
+ return absl_ports::OutOfRangeError("Unable to correctly read size.");
}
- MemoryMappedFile mmapped_file(
- *filesystem_, file_path_,
- MemoryMappedFile::Strategy::READ_WRITE_AUTO_SYNC);
-
- // Read out the metadata
ICING_ASSIGN_OR_RETURN(
int32_t metadata,
- ReadProtoMetadata(&mmapped_file, file_offset, file_size));
-
- ICING_RETURN_IF_ERROR(mmapped_file.Remap(file_offset + sizeof(metadata),
- GetProtoSize(metadata)));
+ ReadProtoMetadata(filesystem_, fd_.get(), file_offset, file_size));
+ // Copy out however many bytes it says the proto is
+ int stored_size = GetProtoSize(metadata);
+ file_offset += sizeof(metadata);
+ if (file_offset + stored_size > file_size) {
+ return absl_ports::OutOfRangeError(
+ IcingStringUtil::StringPrintf("Trying to read from a location, %lld, "
+ "out of range of the file size, %lld",
+ static_cast<long long>(file_offset),
+ static_cast<long long>(file_size - 1)));
+ }
+ auto buf = std::make_unique<char[]>(stored_size);
// We need to update the crc checksum if the erased area is before the
// rewind position.
int32_t new_crc;
- int64_t erased_proto_offset = file_offset + sizeof(metadata);
- if (erased_proto_offset < header_->GetRewindOffset()) {
+ if (file_offset < header_->GetRewindOffset()) {
// Set to "dirty" before we start writing anything.
header_->SetDirtyFlag(true);
header_->SetHeaderChecksum(header_->CalculateHeaderChecksum());
@@ -1022,24 +1023,30 @@ libtextclassifier3::Status PortableFileBackedProtoLog<ProtoT>::EraseProto(
// We need to calculate [original string xor 0s].
// The xored string is the same as the original string because 0 xor 0 =
// 0, 1 xor 0 = 1.
- const std::string_view xored_str(mmapped_file.region(),
- mmapped_file.region_size());
+ // Read the compressed proto out.
+ if (!filesystem_->PRead(fd_.get(), buf.get(), stored_size, file_offset)) {
+ return absl_ports::InternalError("");
+ }
+ const std::string_view xored_str(buf.get(), stored_size);
Crc32 crc(header_->GetLogChecksum());
ICING_ASSIGN_OR_RETURN(
- new_crc, crc.UpdateWithXor(
- xored_str,
- /*full_data_size=*/header_->GetRewindOffset() -
- kHeaderReservedBytes,
- /*position=*/erased_proto_offset - kHeaderReservedBytes));
+ new_crc,
+ crc.UpdateWithXor(xored_str,
+ /*full_data_size=*/header_->GetRewindOffset() -
+ kHeaderReservedBytes,
+ /*position=*/file_offset - kHeaderReservedBytes));
}
// Clear the region.
- memset(mmapped_file.mutable_region(), '\0', mmapped_file.region_size());
+ memset(buf.get(), '\0', stored_size);
+ if (!filesystem_->PWrite(fd_.get(), file_offset, buf.get(), stored_size)) {
+ return absl_ports::InternalError("");
+ }
// If we cleared something in our checksummed area, we should update our
// checksum and reset our dirty bit.
- if (erased_proto_offset < header_->GetRewindOffset()) {
+ if (file_offset < header_->GetRewindOffset()) {
header_->SetDirtyFlag(false);
header_->SetLogChecksum(new_crc);
header_->SetHeaderChecksum(header_->CalculateHeaderChecksum());
@@ -1077,13 +1084,12 @@ PortableFileBackedProtoLog<ProtoT>::GetElementsFileSize() const {
template <typename ProtoT>
PortableFileBackedProtoLog<ProtoT>::Iterator::Iterator(
- const Filesystem& filesystem, const std::string& file_path,
- int64_t initial_offset)
- : mmapped_file_(filesystem, file_path,
- MemoryMappedFile::Strategy::READ_ONLY),
+ const Filesystem& filesystem, int fd, int64_t initial_offset)
+ : filesystem_(&filesystem),
initial_offset_(initial_offset),
current_offset_(kInvalidOffset),
- file_size_(filesystem.GetFileSize(file_path.c_str())) {
+ fd_(fd) {
+ file_size_ = filesystem_->GetFileSize(fd_);
if (file_size_ == Filesystem::kBadFileSize) {
// Fails all Advance() calls
file_size_ = 0;
@@ -1100,7 +1106,7 @@ PortableFileBackedProtoLog<ProtoT>::Iterator::Advance() {
// Jumps to the next proto position
ICING_ASSIGN_OR_RETURN(
int32_t metadata,
- ReadProtoMetadata(&mmapped_file_, current_offset_, file_size_));
+ ReadProtoMetadata(filesystem_, fd_, current_offset_, file_size_));
current_offset_ += sizeof(metadata) + GetProtoSize(metadata);
}
@@ -1122,14 +1128,15 @@ int64_t PortableFileBackedProtoLog<ProtoT>::Iterator::GetOffset() {
template <typename ProtoT>
typename PortableFileBackedProtoLog<ProtoT>::Iterator
PortableFileBackedProtoLog<ProtoT>::GetIterator() {
- return Iterator(*filesystem_, file_path_,
+ return Iterator(*filesystem_, fd_.get(),
/*initial_offset=*/kHeaderReservedBytes);
}
template <typename ProtoT>
libtextclassifier3::StatusOr<int32_t>
PortableFileBackedProtoLog<ProtoT>::ReadProtoMetadata(
- MemoryMappedFile* mmapped_file, int64_t file_offset, int64_t file_size) {
+ const Filesystem* const filesystem, int fd, int64_t file_offset,
+ int64_t file_size) {
// Checks file_offset
if (file_offset >= file_size) {
return absl_ports::OutOfRangeError(IcingStringUtil::StringPrintf(
@@ -1147,9 +1154,9 @@ PortableFileBackedProtoLog<ProtoT>::ReadProtoMetadata(
static_cast<long long>(file_size)));
}
- // Reads metadata
- ICING_RETURN_IF_ERROR(mmapped_file->Remap(file_offset, metadata_size));
- memcpy(&portable_metadata, mmapped_file->region(), metadata_size);
+ if (!filesystem->PRead(fd, &portable_metadata, metadata_size, file_offset)) {
+ return absl_ports::InternalError("");
+ }
// Need to switch it back to host order endianness after reading from disk.
int32_t host_order_metadata = GNetworkToHostL(portable_metadata);
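
The recurring edit in this file replaces remapping a MemoryMappedFile for every read with a PRead into a scratch buffer, so each read is positioned explicitly and is independent of any shared mapping state. A standalone sketch of the same pattern using raw POSIX pread (icing's Filesystem::PRead wrapper differs in signature; the file path and contents below are made up):

    #include <fcntl.h>
    #include <unistd.h>
    #include <cstdio>

    // Reads exactly `size` bytes at `offset`; false on IO error or short file.
    bool PReadExact(int fd, char* buf, size_t size, off_t offset) {
      while (size > 0) {
        ssize_t n = pread(fd, buf, size, offset);
        if (n <= 0) return false;  // error, or EOF before `size` bytes
        buf += n;
        offset += n;
        size -= n;
      }
      return true;
    }

    int main() {
      int fd = open("/tmp/pread_demo", O_RDWR | O_CREAT | O_TRUNC, 0600);
      if (fd < 0) return 1;
      pwrite(fd, "hello proto log", 15, /*offset=*/0);
      char word[6] = {};  // zero-initialized, so the read stays NUL-terminated
      if (PReadExact(fd, word, 5, /*offset=*/6)) printf("%s\n", word);  // "proto"
      close(fd);
      return 0;
    }

One plausible motivation, beyond dropping the per-read Remap cost: a pread past the end of a concurrently truncated file fails with an error code, whereas touching a stale mmap region faults.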
diff --git a/icing/file/portable-file-backed-proto-log_benchmark.cc b/icing/file/portable-file-backed-proto-log_benchmark.cc
index f83ccd6..80a8011 100644
--- a/icing/file/portable-file-backed-proto-log_benchmark.cc
+++ b/icing/file/portable-file-backed-proto-log_benchmark.cc
@@ -55,7 +55,7 @@ namespace lib {
namespace {
-static void BM_Write(benchmark::State& state) {
+void BM_Write(benchmark::State& state) {
const Filesystem filesystem;
int string_length = state.range(0);
const std::string file_path = IcingStringUtil::StringPrintf(
@@ -108,7 +108,7 @@ BENCHMARK(BM_Write)
// 16MiB, and we need some extra space for the
// rest of the document properties
-static void BM_Read(benchmark::State& state) {
+void BM_Read(benchmark::State& state) {
const Filesystem filesystem;
int string_length = state.range(0);
const std::string file_path = IcingStringUtil::StringPrintf(
@@ -164,7 +164,7 @@ BENCHMARK(BM_Read)
// 16MiB, and we need some extra space for the
// rest of the document properties
//
-static void BM_Erase(benchmark::State& state) {
+void BM_Erase(benchmark::State& state) {
const Filesystem filesystem;
const std::string file_path = IcingStringUtil::StringPrintf(
"%s%s", GetTestTempDir().c_str(), "/proto.log");
@@ -204,7 +204,7 @@ static void BM_Erase(benchmark::State& state) {
}
BENCHMARK(BM_Erase);
-static void BM_ComputeChecksum(benchmark::State& state) {
+void BM_ComputeChecksum(benchmark::State& state) {
const Filesystem filesystem;
const std::string file_path = GetTestTempDir() + "/proto.log";
int max_proto_size = (1 << 24) - 1; // 16 MiB
@@ -246,7 +246,7 @@ static void BM_ComputeChecksum(benchmark::State& state) {
}
BENCHMARK(BM_ComputeChecksum)->Range(1024, 1 << 20);
-static void BM_ComputeChecksumWithCachedChecksum(benchmark::State& state) {
+void BM_ComputeChecksumWithCachedChecksum(benchmark::State& state) {
const Filesystem filesystem;
const std::string file_path = GetTestTempDir() + "/proto.log";
int max_proto_size = (1 << 24) - 1; // 16 MiB
@@ -290,7 +290,7 @@ static void BM_ComputeChecksumWithCachedChecksum(benchmark::State& state) {
}
BENCHMARK(BM_ComputeChecksumWithCachedChecksum);
-static void BM_ComputeChecksumOnlyForTail(benchmark::State& state) {
+void BM_ComputeChecksumOnlyForTail(benchmark::State& state) {
const Filesystem filesystem;
const std::string file_path = GetTestTempDir() + "/proto.log";
int max_proto_size = (1 << 24) - 1; // 16 MiB
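
The `static` keywords deleted throughout this file were redundant rather than wrong: a function defined inside an unnamed namespace already has internal linkage. A self-contained illustration (helper names are hypothetical):

    namespace {
    void BenchmarkHelperA() {}         // internal linkage via the unnamed namespace
    static void BenchmarkHelperB() {}  // identical linkage; `static` adds nothing
    }  // namespace

    int main() {
      BenchmarkHelperA();
      BenchmarkHelperB();
      return 0;
    }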
diff --git a/icing/file/portable-file-backed-proto-log_test.cc b/icing/file/portable-file-backed-proto-log_test.cc
index b5fee4b..795271a 100644
--- a/icing/file/portable-file-backed-proto-log_test.cc
+++ b/icing/file/portable-file-backed-proto-log_test.cc
@@ -851,11 +851,12 @@ TEST_F(PortableFileBackedProtoLogTest, Iterator) {
{
// Iterator with bad filesystem
+ ScopedFd sfd(filesystem_.OpenForRead(file_path_.c_str()));
MockFilesystem mock_filesystem;
- ON_CALL(mock_filesystem, GetFileSize(A<const char*>()))
+ ON_CALL(mock_filesystem, GetFileSize(A<int>()))
.WillByDefault(Return(Filesystem::kBadFileSize));
PortableFileBackedProtoLog<DocumentProto>::Iterator bad_iterator(
- mock_filesystem, file_path_, /*initial_offset=*/0);
+ mock_filesystem, sfd.get(), /*initial_offset=*/0);
ASSERT_THAT(bad_iterator.Advance(),
StatusIs(libtextclassifier3::StatusCode::OUT_OF_RANGE));
}
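
The mock update above is forced by the Iterator change: GetFileSize is overloaded, and the iterator now queries size by file descriptor, so the test stubs the int overload with the A<int>() matcher instead of A<const char*>(). A self-contained gmock analogue (the two-overload Filesystem here is simplified, not icing's real interface):

    #include <gmock/gmock.h>

    using ::testing::A;
    using ::testing::NiceMock;
    using ::testing::Return;

    class Filesystem {
     public:
      virtual ~Filesystem() = default;
      virtual long GetFileSize(const char* path) const = 0;
      virtual long GetFileSize(int fd) const = 0;
    };

    class MockFilesystem : public Filesystem {
     public:
      MOCK_METHOD(long, GetFileSize, (const char*), (const, override));
      MOCK_METHOD(long, GetFileSize, (int), (const, override));
    };

    int main() {
      NiceMock<MockFilesystem> fs;
      // A<int>() disambiguates the overload set: only the fd-based call is
      // stubbed, mirroring the test change above.
      ON_CALL(fs, GetFileSize(A<int>())).WillByDefault(Return(-1));
      return fs.GetFileSize(/*fd=*/0) == -1 ? 0 : 1;
    }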
diff --git a/icing/icing-search-engine-with-icu-file_test.cc b/icing/icing-search-engine-with-icu-file_test.cc
index 48e81e5..1012b47 100644
--- a/icing/icing-search-engine-with-icu-file_test.cc
+++ b/icing/icing-search-engine-with-icu-file_test.cc
@@ -37,13 +37,13 @@ namespace {
using ::icing::lib::portable_equals_proto::EqualsProto;
using ::testing::Eq;
-constexpr PropertyConfigProto_Cardinality_Code CARDINALITY_REQUIRED =
- PropertyConfigProto_Cardinality_Code_REQUIRED;
+constexpr PropertyConfigProto::Cardinality::Code CARDINALITY_REQUIRED =
+ PropertyConfigProto::Cardinality::REQUIRED;
-constexpr StringIndexingConfig_TokenizerType_Code TOKENIZER_PLAIN =
- StringIndexingConfig_TokenizerType_Code_PLAIN;
+constexpr StringIndexingConfig::TokenizerType::Code TOKENIZER_PLAIN =
+ StringIndexingConfig::TokenizerType::PLAIN;
-constexpr TermMatchType_Code MATCH_PREFIX = TermMatchType_Code_PREFIX;
+constexpr TermMatchType::Code MATCH_PREFIX = TermMatchType::PREFIX;
std::string GetTestBaseDir() {
return GetTestTempDir() + "/icing_with_icu_files";
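
These constant swaps implement the "Unify the C++ proto enum style" item from the commit message. Both spellings exist because, for an enum nested in a proto message, protoc's C++ backend emits an underscore-joined file-scope enum plus aliases nested in the message class. A self-contained analogue of the generated code (hand-written here for illustration, not actual protoc output):

    // Roughly what protoc emits for `message TermMatchType { enum Code {...} }`:
    enum TermMatchType_Code {
      TermMatchType_Code_UNKNOWN = 0,
      TermMatchType_Code_EXACT_ONLY = 1,
      TermMatchType_Code_PREFIX = 2,
    };

    class TermMatchType {
     public:
      using Code = TermMatchType_Code;                           // nested alias
      static constexpr Code PREFIX = TermMatchType_Code_PREFIX;  // nested constant
    };

    // Both spellings name the same value, so the change is purely cosmetic.
    static_assert(TermMatchType::PREFIX == TermMatchType_Code_PREFIX,
                  "nested and underscore styles are interchangeable");

    int main() { return 0; }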
diff --git a/icing/icing-search-engine.cc b/icing/icing-search-engine.cc
index 9aa833b..952ba21 100644
--- a/icing/icing-search-engine.cc
+++ b/icing/icing-search-engine.cc
@@ -18,6 +18,7 @@
#include <memory>
#include <string>
#include <string_view>
+#include <unordered_map>
#include <utility>
#include <vector>
@@ -59,6 +60,7 @@
#include "icing/scoring/scoring-processor.h"
#include "icing/store/document-id.h"
#include "icing/store/document-store.h"
+#include "icing/store/namespace-checker-impl.h"
#include "icing/tokenization/language-segmenter-factory.h"
#include "icing/tokenization/language-segmenter.h"
#include "icing/transform/normalizer-factory.h"
@@ -87,17 +89,22 @@ constexpr std::string_view kOptimizeStatusFilename = "optimize_status";
// fresh state.
constexpr int kMaxUnsuccessfulInitAttempts = 5;
-libtextclassifier3::Status ValidateOptions(
- const IcingSearchEngineOptions& options) {
- // These options are only used in IndexProcessor, which won't be created
- // until the first Put call. So they must be checked here, so that any
- // errors can be surfaced in Initialize.
- if (options.max_tokens_per_doc() <= 0) {
- return absl_ports::InvalidArgumentError(
- "Options::max_tokens_per_doc must be greater than zero.");
+// A pair that holds namespace and type.
+struct NamespaceTypePair {
+ std::string namespace_;
+ std::string type;
+
+ bool operator==(const NamespaceTypePair& other) const {
+ return namespace_ == other.namespace_ && type == other.type;
}
- return libtextclassifier3::Status::OK;
-}
+};
+
+struct NamespaceTypePairHasher {
+ std::size_t operator()(const NamespaceTypePair& pair) const {
+ return std::hash<std::string>()(pair.namespace_) ^
+ std::hash<std::string>()(pair.type);
+ }
+};
libtextclassifier3::Status ValidateResultSpec(
const ResultSpecProto& result_spec) {
@@ -142,6 +149,11 @@ libtextclassifier3::Status ValidateSuggestionSpec(
return absl_ports::InvalidArgumentError(
absl_ports::StrCat("SuggestionSpecProto.prefix is empty!"));
}
+ if (suggestion_spec.scoring_spec().scoring_match_type() ==
+ TermMatchType::UNKNOWN) {
+ return absl_ports::InvalidArgumentError(
+ absl_ports::StrCat("SuggestionSpecProto.term_match_type is unknown!"));
+ }
if (suggestion_spec.num_to_return() <= 0) {
return absl_ports::InvalidArgumentError(absl_ports::StrCat(
"SuggestionSpecProto.num_to_return must be positive."));
@@ -261,6 +273,28 @@ void TransformStatus(const libtextclassifier3::Status& internal_status,
status_proto->set_message(internal_status.error_message());
}
+libtextclassifier3::Status RetrieveAndAddDocumentInfo(
+ const DocumentStore* document_store, DeleteByQueryResultProto& result_proto,
+ std::unordered_map<NamespaceTypePair,
+ DeleteByQueryResultProto::DocumentGroupInfo*,
+ NamespaceTypePairHasher>& info_map,
+ DocumentId document_id) {
+ ICING_ASSIGN_OR_RETURN(DocumentProto document,
+ document_store->Get(document_id));
+ NamespaceTypePair key = {document.namespace_(), document.schema()};
+ auto iter = info_map.find(key);
+ if (iter == info_map.end()) {
+ auto entry = result_proto.add_deleted_documents();
+ entry->set_namespace_(std::move(document.namespace_()));
+ entry->set_schema(std::move(document.schema()));
+ entry->add_uris(std::move(document.uri()));
+ info_map[key] = entry;
+ } else {
+ iter->second->add_uris(std::move(document.uri()));
+ }
+ return libtextclassifier3::Status::OK;
+}
+
} // namespace
IcingSearchEngine::IcingSearchEngine(const IcingSearchEngineOptions& options,
@@ -399,7 +433,6 @@ InitializeResultProto IcingSearchEngine::InternalInitialize() {
libtextclassifier3::Status IcingSearchEngine::InitializeMembers(
InitializeStatsProto* initialize_stats) {
ICING_RETURN_ERROR_IF_NULL(initialize_stats);
- ICING_RETURN_IF_ERROR(ValidateOptions(options_));
// Make sure the base directory exists
if (!filesystem_->CreateDirectoryRecursively(options_.base_dir().c_str())) {
@@ -450,8 +483,6 @@ libtextclassifier3::Status IcingSearchEngine::InitializeMembers(
// last tried to set the schema.
ICING_RETURN_IF_ERROR(InitializeDocumentStore(
/*force_recovery_and_revalidate_documents=*/true, initialize_stats));
- initialize_stats->set_document_store_recovery_cause(
- InitializeStatsProto::SCHEMA_CHANGES_OUT_OF_SYNC);
// We're going to need to build the index from scratch. So just delete its
// files now.
@@ -941,7 +972,7 @@ DeleteResultProto IcingSearchEngine::Delete(const std::string_view name_space,
delete_stats->set_delete_type(DeleteStatsProto::DeleteType::SINGLE);
std::unique_ptr<Timer> delete_timer = clock_->GetNewTimer();
- // TODO(b/144458732): Implement a more robust version of TC_RETURN_IF_ERROR
+ // TODO(b/216487496): Implement a more robust version of TC_RETURN_IF_ERROR
// that can support error logging.
libtextclassifier3::Status status = document_store_->Delete(name_space, uri);
if (!status.ok()) {
@@ -975,7 +1006,7 @@ DeleteByNamespaceResultProto IcingSearchEngine::DeleteByNamespace(
delete_stats->set_delete_type(DeleteStatsProto::DeleteType::NAMESPACE);
std::unique_ptr<Timer> delete_timer = clock_->GetNewTimer();
- // TODO(b/144458732): Implement a more robust version of TC_RETURN_IF_ERROR
+ // TODO(b/216487496): Implement a more robust version of TC_RETURN_IF_ERROR
// that can support error logging.
DocumentStore::DeleteByGroupResult doc_store_result =
document_store_->DeleteByNamespace(name_space);
@@ -1009,7 +1040,7 @@ DeleteBySchemaTypeResultProto IcingSearchEngine::DeleteBySchemaType(
delete_stats->set_delete_type(DeleteStatsProto::DeleteType::SCHEMA_TYPE);
std::unique_ptr<Timer> delete_timer = clock_->GetNewTimer();
- // TODO(b/144458732): Implement a more robust version of TC_RETURN_IF_ERROR
+ // TODO(b/216487496): Implement a more robust version of TC_RETURN_IF_ERROR
// that can support error logging.
DocumentStore::DeleteByGroupResult doc_store_result =
document_store_->DeleteBySchemaType(schema_type);
@@ -1027,7 +1058,7 @@ DeleteBySchemaTypeResultProto IcingSearchEngine::DeleteBySchemaType(
}
DeleteByQueryResultProto IcingSearchEngine::DeleteByQuery(
- const SearchSpecProto& search_spec) {
+ const SearchSpecProto& search_spec, bool return_deleted_document_info) {
ICING_VLOG(1) << "Deleting documents for query " << search_spec.query()
<< " from doc store";
@@ -1081,12 +1112,27 @@ DeleteByQueryResultProto IcingSearchEngine::DeleteByQuery(
ICING_VLOG(2) << "Deleting the docs that matched the query.";
int num_deleted = 0;
+ // A map used to group deleted documents.
+ // From the (namespace, type) pair to a list of uris.
+ std::unordered_map<NamespaceTypePair,
+ DeleteByQueryResultProto::DocumentGroupInfo*,
+ NamespaceTypePairHasher>
+ deleted_info_map;
component_timer = clock_->GetNewTimer();
while (query_results.root_iterator->Advance().ok()) {
ICING_VLOG(3) << "Deleting doc "
<< query_results.root_iterator->doc_hit_info().document_id();
++num_deleted;
+ if (return_deleted_document_info) {
+ status = RetrieveAndAddDocumentInfo(
+ document_store_.get(), result_proto, deleted_info_map,
+ query_results.root_iterator->doc_hit_info().document_id());
+ if (!status.ok()) {
+ TransformStatus(status, result_status);
+ return result_proto;
+ }
+ }
status = document_store_->Delete(
query_results.root_iterator->doc_hit_info().document_id());
if (!status.ok()) {
@@ -1155,12 +1201,8 @@ OptimizeResultProto IcingSearchEngine::Optimize() {
std::unique_ptr<Timer> optimize_timer = clock_->GetNewTimer();
OptimizeStatsProto* optimize_stats = result_proto.mutable_optimize_stats();
int64_t before_size = filesystem_->GetDiskUsage(options_.base_dir().c_str());
- if (before_size != Filesystem::kBadFileSize) {
- optimize_stats->set_storage_size_before(before_size);
- } else {
- // Set -1 as a sentinel value when failures occur.
- optimize_stats->set_storage_size_before(-1);
- }
+ optimize_stats->set_storage_size_before(
+ Filesystem::SanitizeFileSize(before_size));
// Flushes data to disk before doing optimization
auto status = InternalPersistToDisk(PersistType::FULL);
@@ -1237,12 +1279,8 @@ OptimizeResultProto IcingSearchEngine::Optimize() {
optimize_status_file.Write(std::move(optimize_status));
int64_t after_size = filesystem_->GetDiskUsage(options_.base_dir().c_str());
- if (after_size != Filesystem::kBadFileSize) {
- optimize_stats->set_storage_size_after(after_size);
- } else {
- // Set -1 as a sentinel value when failures occur.
- optimize_stats->set_storage_size_after(-1);
- }
+ optimize_stats->set_storage_size_after(
+ Filesystem::SanitizeFileSize(after_size));
optimize_stats->set_latency_ms(optimize_timer->GetElapsedMilliseconds());
TransformStatus(optimization_status, result_status);
@@ -1324,11 +1362,8 @@ StorageInfoResultProto IcingSearchEngine::GetStorageInfo() {
}
int64_t index_size = filesystem_->GetDiskUsage(options_.base_dir().c_str());
- if (index_size != Filesystem::kBadFileSize) {
- result.mutable_storage_info()->set_total_storage_size(index_size);
- } else {
- result.mutable_storage_info()->set_total_storage_size(-1);
- }
+ result.mutable_storage_info()->set_total_storage_size(
+ Filesystem::SanitizeFileSize(index_size));
*result.mutable_storage_info()->mutable_document_storage_info() =
document_store_->GetStorageInfo();
*result.mutable_storage_info()->mutable_schema_store_storage_info() =
@@ -1875,19 +1910,22 @@ SuggestionResponse IcingSearchEngine::SearchSuggestions(
std::unique_ptr<SuggestionProcessor> suggestion_processor =
std::move(suggestion_processor_or).ValueOrDie();
- std::vector<NamespaceId> namespace_ids;
+ std::unordered_set<NamespaceId> namespace_ids;
namespace_ids.reserve(suggestion_spec.namespace_filters_size());
for (std::string_view name_space : suggestion_spec.namespace_filters()) {
auto namespace_id_or = document_store_->GetNamespaceId(name_space);
if (!namespace_id_or.ok()) {
continue;
}
- namespace_ids.push_back(namespace_id_or.ValueOrDie());
+ namespace_ids.insert(namespace_id_or.ValueOrDie());
}
// Run suggestion based on given SuggestionSpec.
+ NamespaceCheckerImpl namespace_checker_impl(document_store_.get(),
+ std::move(namespace_ids));
libtextclassifier3::StatusOr<std::vector<TermMetadata>> terms_or =
- suggestion_processor->QuerySuggestions(suggestion_spec, namespace_ids);
+ suggestion_processor->QuerySuggestions(suggestion_spec,
+ &namespace_checker_impl);
if (!terms_or.ok()) {
TransformStatus(terms_or.status(), response_status);
return response;
diff --git a/icing/icing-search-engine.h b/icing/icing-search-engine.h
index 0a79714..ff9c7fb 100644
--- a/icing/icing-search-engine.h
+++ b/icing/icing-search-engine.h
@@ -280,8 +280,9 @@ class IcingSearchEngine {
// NOT_FOUND if the query doesn't match any documents
// FAILED_PRECONDITION IcingSearchEngine has not been initialized yet
// INTERNAL_ERROR on IO error
- DeleteByQueryResultProto DeleteByQuery(const SearchSpecProto& search_spec)
- ICING_LOCKS_EXCLUDED(mutex_);
+ DeleteByQueryResultProto DeleteByQuery(
+ const SearchSpecProto& search_spec,
+ bool return_deleted_document_info = false) ICING_LOCKS_EXCLUDED(mutex_);
// Retrieves, scores, ranks, and returns the results according to the specs.
// Results can be empty. If there're multiple pages of results,
diff --git a/icing/icing-search-engine_fuzz_test.cc b/icing/icing-search-engine_fuzz_test.cc
index 2d07e37..bf486da 100644
--- a/icing/icing-search-engine_fuzz_test.cc
+++ b/icing/icing-search-engine_fuzz_test.cc
@@ -18,12 +18,12 @@
#include "icing/text_classifier/lib3/utils/base/status.h"
#include "icing/text_classifier/lib3/utils/base/statusor.h"
#include "icing/document-builder.h"
-#include "icing/helpers/icu/icu-data-file-helper.h"
#include "icing/icing-search-engine.h"
#include "icing/proto/document.pb.h"
#include "icing/proto/initialize.pb.h"
#include "icing/proto/scoring.pb.h"
#include "icing/schema-builder.h"
+#include "icing/testing/icu-data-file-helper.h"
#include "icing/testing/test-data.h"
#include "icing/testing/tmp-directory.h"
@@ -31,13 +31,13 @@ namespace icing {
namespace lib {
namespace {
-constexpr PropertyConfigProto_Cardinality_Code CARDINALITY_REQUIRED =
- PropertyConfigProto_Cardinality_Code_REQUIRED;
+constexpr PropertyConfigProto::Cardinality::Code CARDINALITY_REQUIRED =
+ PropertyConfigProto::Cardinality::REQUIRED;
-constexpr StringIndexingConfig_TokenizerType_Code TOKENIZER_PLAIN =
- StringIndexingConfig_TokenizerType_Code_PLAIN;
+constexpr StringIndexingConfig::TokenizerType::Code TOKENIZER_PLAIN =
+ StringIndexingConfig::TokenizerType::PLAIN;
-constexpr TermMatchType_Code MATCH_PREFIX = TermMatchType_Code_PREFIX;
+constexpr TermMatchType::Code MATCH_PREFIX = TermMatchType::PREFIX;
IcingSearchEngineOptions Setup() {
IcingSearchEngineOptions icing_options;
diff --git a/icing/icing-search-engine_test.cc b/icing/icing-search-engine_test.cc
index b5206cd..7ed8885 100644
--- a/icing/icing-search-engine_test.cc
+++ b/icing/icing-search-engine_test.cc
@@ -27,7 +27,6 @@
#include "icing/document-builder.h"
#include "icing/file/filesystem.h"
#include "icing/file/mock-filesystem.h"
-#include "icing/helpers/icu/icu-data-file-helper.h"
#include "icing/legacy/index/icing-mock-filesystem.h"
#include "icing/portable/endian.h"
#include "icing/portable/equals-proto.h"
@@ -46,6 +45,7 @@
#include "icing/store/document-log-creator.h"
#include "icing/testing/common-matchers.h"
#include "icing/testing/fake-clock.h"
+#include "icing/testing/icu-data-file-helper.h"
#include "icing/testing/jni-test-helpers.h"
#include "icing/testing/random-string.h"
#include "icing/testing/snippet-helpers.h"
@@ -90,24 +90,24 @@ constexpr std::string_view kIpsumText =
"vehicula posuere vitae, convallis eu lorem. Donec semper augue eu nibh "
"placerat semper.";
-constexpr PropertyConfigProto_Cardinality_Code CARDINALITY_OPTIONAL =
- PropertyConfigProto_Cardinality_Code_OPTIONAL;
-constexpr PropertyConfigProto_Cardinality_Code CARDINALITY_REQUIRED =
- PropertyConfigProto_Cardinality_Code_REQUIRED;
-constexpr PropertyConfigProto_Cardinality_Code CARDINALITY_REPEATED =
- PropertyConfigProto_Cardinality_Code_REPEATED;
+constexpr PropertyConfigProto::Cardinality::Code CARDINALITY_OPTIONAL =
+ PropertyConfigProto::Cardinality::OPTIONAL;
+constexpr PropertyConfigProto::Cardinality::Code CARDINALITY_REQUIRED =
+ PropertyConfigProto::Cardinality::REQUIRED;
+constexpr PropertyConfigProto::Cardinality::Code CARDINALITY_REPEATED =
+ PropertyConfigProto::Cardinality::REPEATED;
-constexpr StringIndexingConfig_TokenizerType_Code TOKENIZER_PLAIN =
- StringIndexingConfig_TokenizerType_Code_PLAIN;
-constexpr StringIndexingConfig_TokenizerType_Code TOKENIZER_NONE =
- StringIndexingConfig_TokenizerType_Code_NONE;
+constexpr StringIndexingConfig::TokenizerType::Code TOKENIZER_PLAIN =
+ StringIndexingConfig::TokenizerType::PLAIN;
+constexpr StringIndexingConfig::TokenizerType::Code TOKENIZER_NONE =
+ StringIndexingConfig::TokenizerType::NONE;
#ifndef ICING_JNI_TEST
-constexpr TermMatchType_Code MATCH_EXACT = TermMatchType_Code_EXACT_ONLY;
+constexpr TermMatchType::Code MATCH_EXACT = TermMatchType::EXACT_ONLY;
#endif // !ICING_JNI_TEST
-constexpr TermMatchType_Code MATCH_PREFIX = TermMatchType_Code_PREFIX;
-constexpr TermMatchType_Code MATCH_NONE = TermMatchType_Code_UNKNOWN;
+constexpr TermMatchType::Code MATCH_PREFIX = TermMatchType::PREFIX;
+constexpr TermMatchType::Code MATCH_NONE = TermMatchType::UNKNOWN;
PortableFileBackedProtoLog<DocumentWrapper>::Header ReadDocumentLogHeader(
Filesystem filesystem, const std::string& file_path) {
@@ -362,36 +362,6 @@ TEST_F(IcingSearchEngineTest, GoodIndexMergeSizeReturnsOk) {
EXPECT_THAT(icing.Initialize().status(), ProtoIsOk());
}
-TEST_F(IcingSearchEngineTest,
- NegativeMaxTokensPerDocSizeReturnsInvalidArgument) {
- IcingSearchEngineOptions options = GetDefaultIcingOptions();
- options.set_max_tokens_per_doc(-1);
- IcingSearchEngine icing(options, GetTestJniCache());
- EXPECT_THAT(icing.Initialize().status(),
- ProtoStatusIs(StatusProto::INVALID_ARGUMENT));
-}
-
-TEST_F(IcingSearchEngineTest, ZeroMaxTokensPerDocSizeReturnsInvalidArgument) {
- IcingSearchEngineOptions options = GetDefaultIcingOptions();
- options.set_max_tokens_per_doc(0);
- IcingSearchEngine icing(options, GetTestJniCache());
- EXPECT_THAT(icing.Initialize().status(),
- ProtoStatusIs(StatusProto::INVALID_ARGUMENT));
-}
-
-TEST_F(IcingSearchEngineTest, GoodMaxTokensPerDocSizeReturnsOk) {
- IcingSearchEngineOptions options = GetDefaultIcingOptions();
- // INT_MAX is valid - it just means that we shouldn't limit the number of
- // tokens per document. It would be pretty inconceivable that anyone would
- // produce such a document - the text being indexed alone would take up at
- // least ~4.3 GiB! - and the document would be rejected before indexing
- // for exceeding max_document_size, but there's no reason to explicitly
- // bar it.
- options.set_max_tokens_per_doc(std::numeric_limits<int32_t>::max());
- IcingSearchEngine icing(options, GetTestJniCache());
- EXPECT_THAT(icing.Initialize().status(), ProtoIsOk());
-}
-
TEST_F(IcingSearchEngineTest, NegativeMaxTokenLenReturnsInvalidArgument) {
IcingSearchEngineOptions options = GetDefaultIcingOptions();
options.set_max_token_length(-1);
@@ -2198,7 +2168,7 @@ TEST_F(IcingSearchEngineTest, SearchReturnsValidResults) {
search_spec.set_query("message");
ResultSpecProto result_spec;
- result_spec.mutable_snippet_spec()->set_max_window_bytes(64);
+ result_spec.mutable_snippet_spec()->set_max_window_utf32_length(64);
result_spec.mutable_snippet_spec()->set_num_matches_per_property(1);
result_spec.mutable_snippet_spec()->set_num_to_snippet(1);
@@ -2616,7 +2586,7 @@ TEST_F(IcingSearchEngineTest, ShouldReturnMultiplePagesWithSnippets) {
ResultSpecProto result_spec;
result_spec.set_num_per_page(2);
- result_spec.mutable_snippet_spec()->set_max_window_bytes(64);
+ result_spec.mutable_snippet_spec()->set_max_window_utf32_length(64);
result_spec.mutable_snippet_spec()->set_num_matches_per_property(1);
result_spec.mutable_snippet_spec()->set_num_to_snippet(3);
@@ -3523,6 +3493,105 @@ TEST_F(IcingSearchEngineTest, DeleteByQuery) {
expected_search_result_proto));
}
+TEST_F(IcingSearchEngineTest, DeleteByQueryReturnInfo) {
+ DocumentProto document1 =
+ DocumentBuilder()
+ .SetKey("namespace1", "uri1")
+ .SetSchema("Message")
+ .AddStringProperty("body", "message body1")
+ .SetCreationTimestampMs(kDefaultCreationTimestampMs)
+ .Build();
+ DocumentProto document2 =
+ DocumentBuilder()
+ .SetKey("namespace2", "uri2")
+ .SetSchema("Message")
+ .AddStringProperty("body", "message body2")
+ .SetCreationTimestampMs(kDefaultCreationTimestampMs)
+ .Build();
+ DocumentProto document3 =
+ DocumentBuilder()
+ .SetKey("namespace2", "uri3")
+ .SetSchema("Message")
+ .AddStringProperty("body", "message body3")
+ .SetCreationTimestampMs(kDefaultCreationTimestampMs)
+ .Build();
+
+ auto fake_clock = std::make_unique<FakeClock>();
+ fake_clock->SetTimerElapsedMilliseconds(7);
+ TestIcingSearchEngine icing(GetDefaultIcingOptions(),
+ std::make_unique<Filesystem>(),
+ std::make_unique<IcingFilesystem>(),
+ std::move(fake_clock), GetTestJniCache());
+ ASSERT_THAT(icing.Initialize().status(), ProtoIsOk());
+ ASSERT_THAT(icing.SetSchema(CreateMessageSchema()).status(), ProtoIsOk());
+ ASSERT_THAT(icing.Put(document1).status(), ProtoIsOk());
+ ASSERT_THAT(icing.Put(document2).status(), ProtoIsOk());
+ ASSERT_THAT(icing.Put(document3).status(), ProtoIsOk());
+
+ GetResultProto expected_get_result_proto;
+ expected_get_result_proto.mutable_status()->set_code(StatusProto::OK);
+ *expected_get_result_proto.mutable_document() = document1;
+ EXPECT_THAT(
+ icing.Get("namespace1", "uri1", GetResultSpecProto::default_instance()),
+ EqualsProto(expected_get_result_proto));
+
+ *expected_get_result_proto.mutable_document() = document2;
+ EXPECT_THAT(
+ icing.Get("namespace2", "uri2", GetResultSpecProto::default_instance()),
+ EqualsProto(expected_get_result_proto));
+
+ *expected_get_result_proto.mutable_document() = document3;
+ EXPECT_THAT(
+ icing.Get("namespace2", "uri3", GetResultSpecProto::default_instance()),
+ EqualsProto(expected_get_result_proto));
+
+ // Delete all docs to check that the deleted info is correctly grouped.
+ SearchSpecProto search_spec;
+ search_spec.set_query("message");
+ search_spec.set_term_match_type(TermMatchType::EXACT_ONLY);
+ DeleteByQueryResultProto result_proto =
+ icing.DeleteByQuery(search_spec, true);
+ EXPECT_THAT(result_proto.status(), ProtoIsOk());
+ DeleteByQueryStatsProto exp_stats;
+ exp_stats.set_latency_ms(7);
+ exp_stats.set_num_documents_deleted(3);
+ exp_stats.set_query_length(search_spec.query().length());
+ exp_stats.set_num_terms(1);
+ exp_stats.set_num_namespaces_filtered(0);
+ exp_stats.set_num_schema_types_filtered(0);
+ exp_stats.set_parse_query_latency_ms(7);
+ exp_stats.set_document_removal_latency_ms(7);
+ EXPECT_THAT(result_proto.delete_by_query_stats(), EqualsProto(exp_stats));
+
+ // Check that DeleteByQuery can return information for deleted documents.
+ DeleteByQueryResultProto::DocumentGroupInfo info1, info2;
+ info1.set_namespace_("namespace1");
+ info1.set_schema("Message");
+ info1.add_uris("uri1");
+ info2.set_namespace_("namespace2");
+ info2.set_schema("Message");
+ info2.add_uris("uri3");
+ info2.add_uris("uri2");
+ EXPECT_THAT(result_proto.deleted_documents(),
+ UnorderedElementsAre(EqualsProto(info1), EqualsProto(info2)));
+
+ EXPECT_THAT(
+ icing.Get("namespace1", "uri1", GetResultSpecProto::default_instance())
+ .status()
+ .code(),
+ Eq(StatusProto::NOT_FOUND));
+ EXPECT_THAT(
+ icing.Get("namespace2", "uri2", GetResultSpecProto::default_instance())
+ .status()
+ .code(),
+ Eq(StatusProto::NOT_FOUND));
+ EXPECT_THAT(
+ icing.Get("namespace2", "uri3", GetResultSpecProto::default_instance())
+ .status()
+ .code(),
+ Eq(StatusProto::NOT_FOUND));
+}
+
TEST_F(IcingSearchEngineTest, DeleteByQueryNotFound) {
DocumentProto document1 =
DocumentBuilder()
@@ -6048,7 +6117,7 @@ TEST_F(IcingSearchEngineTest, SnippetNormalization) {
search_spec.set_query("mdi Zürich");
ResultSpecProto result_spec;
- result_spec.mutable_snippet_spec()->set_max_window_bytes(64);
+ result_spec.mutable_snippet_spec()->set_max_window_utf32_length(64);
result_spec.mutable_snippet_spec()->set_num_matches_per_property(2);
result_spec.mutable_snippet_spec()->set_num_to_snippet(2);
@@ -6111,7 +6180,7 @@ TEST_F(IcingSearchEngineTest, SnippetNormalizationPrefix) {
search_spec.set_query("md Zür");
ResultSpecProto result_spec;
- result_spec.mutable_snippet_spec()->set_max_window_bytes(64);
+ result_spec.mutable_snippet_spec()->set_max_window_utf32_length(64);
result_spec.mutable_snippet_spec()->set_num_matches_per_property(2);
result_spec.mutable_snippet_spec()->set_num_to_snippet(2);
@@ -6166,7 +6235,7 @@ TEST_F(IcingSearchEngineTest, SnippetSectionRestrict) {
search_spec.set_query("body:Zür");
ResultSpecProto result_spec;
- result_spec.mutable_snippet_spec()->set_max_window_bytes(64);
+ result_spec.mutable_snippet_spec()->set_max_window_utf32_length(64);
result_spec.mutable_snippet_spec()->set_num_matches_per_property(10);
result_spec.mutable_snippet_spec()->set_num_to_snippet(10);
@@ -7694,7 +7763,7 @@ TEST_F(IcingSearchEngineTest, QueryStatsProtoTest) {
ResultSpecProto result_spec;
result_spec.set_num_per_page(2);
- result_spec.mutable_snippet_spec()->set_max_window_bytes(64);
+ result_spec.mutable_snippet_spec()->set_max_window_utf32_length(64);
result_spec.mutable_snippet_spec()->set_num_matches_per_property(1);
result_spec.mutable_snippet_spec()->set_num_to_snippet(3);
@@ -7905,7 +7974,7 @@ TEST_F(IcingSearchEngineTest, SnippetErrorTest) {
ResultSpecProto result_spec;
result_spec.mutable_snippet_spec()->set_num_to_snippet(2);
result_spec.mutable_snippet_spec()->set_num_matches_per_property(3);
- result_spec.mutable_snippet_spec()->set_max_window_bytes(4);
+ result_spec.mutable_snippet_spec()->set_max_window_utf32_length(4);
SearchResultProto search_results =
icing.Search(search_spec, scoring_spec, result_spec);
@@ -8110,6 +8179,8 @@ TEST_F(IcingSearchEngineTest, SearchSuggestionsTest) {
SuggestionSpecProto suggestion_spec;
suggestion_spec.set_prefix("t");
suggestion_spec.set_num_to_return(10);
+ suggestion_spec.mutable_scoring_spec()->set_scoring_match_type(
+ TermMatchType::PREFIX);
// Query all suggestions, and they will be ranked.
SuggestionResponse response = icing.SearchSuggestions(suggestion_spec);
@@ -8130,6 +8201,316 @@ TEST_F(IcingSearchEngineTest, SearchSuggestionsTest) {
ASSERT_THAT(response.suggestions().at(2).query(), "termfour");
}
+TEST_F(IcingSearchEngineTest,
+ SearchSuggestionsTest_ShouldReturnInOneNamespace) {
+ IcingSearchEngine icing(GetDefaultIcingOptions(), GetTestJniCache());
+ ASSERT_THAT(icing.Initialize().status(), ProtoIsOk());
+ ASSERT_THAT(icing.SetSchema(CreatePersonAndEmailSchema()).status(),
+ ProtoIsOk());
+
+ DocumentProto document1 = DocumentBuilder()
+ .SetKey("namespace1", "uri1")
+ .SetSchema("Email")
+ .SetCreationTimestampMs(10)
+ .AddStringProperty("subject", "foo fool")
+ .Build();
+ DocumentProto document2 = DocumentBuilder()
+ .SetKey("namespace2", "uri2")
+ .SetSchema("Email")
+ .SetCreationTimestampMs(10)
+ .AddStringProperty("subject", "fool")
+ .Build();
+ ASSERT_THAT(icing.Put(document1).status(), ProtoIsOk());
+ ASSERT_THAT(icing.Put(document2).status(), ProtoIsOk());
+
+ SuggestionResponse::Suggestion suggestionFoo;
+ suggestionFoo.set_query("foo");
+ SuggestionResponse::Suggestion suggestionFool;
+ suggestionFool.set_query("fool");
+
+ // namespace1 has 2 results.
+ SuggestionSpecProto suggestion_spec;
+ suggestion_spec.set_prefix("f");
+ suggestion_spec.add_namespace_filters("namespace1");
+ suggestion_spec.set_num_to_return(10);
+ suggestion_spec.mutable_scoring_spec()->set_scoring_match_type(
+ TermMatchType::PREFIX);
+
+ SuggestionResponse response = icing.SearchSuggestions(suggestion_spec);
+ ASSERT_THAT(response.status(), ProtoIsOk());
+ ASSERT_THAT(response.suggestions(),
+ UnorderedElementsAre(EqualsProto(suggestionFoo),
+ EqualsProto(suggestionFool)));
+}
+
+TEST_F(IcingSearchEngineTest,
+ SearchSuggestionsTest_ShouldReturnInMultipleNamespace) {
+ IcingSearchEngine icing(GetDefaultIcingOptions(), GetTestJniCache());
+ ASSERT_THAT(icing.Initialize().status(), ProtoIsOk());
+ ASSERT_THAT(icing.SetSchema(CreatePersonAndEmailSchema()).status(),
+ ProtoIsOk());
+
+ DocumentProto document1 = DocumentBuilder()
+ .SetKey("namespace1", "uri1")
+ .SetSchema("Email")
+ .SetCreationTimestampMs(10)
+ .AddStringProperty("subject", "fo")
+ .Build();
+ DocumentProto document2 = DocumentBuilder()
+ .SetKey("namespace2", "uri2")
+ .SetSchema("Email")
+ .SetCreationTimestampMs(10)
+ .AddStringProperty("subject", "foo")
+ .Build();
+ DocumentProto document3 = DocumentBuilder()
+ .SetKey("namespace3", "uri3")
+ .SetSchema("Email")
+ .SetCreationTimestampMs(10)
+ .AddStringProperty("subject", "fool")
+ .Build();
+ ASSERT_THAT(icing.Put(document1).status(), ProtoIsOk());
+ ASSERT_THAT(icing.Put(document2).status(), ProtoIsOk());
+ ASSERT_THAT(icing.Put(document3).status(), ProtoIsOk());
+
+ SuggestionResponse::Suggestion suggestionFoo;
+ suggestionFoo.set_query("foo");
+ SuggestionResponse::Suggestion suggestionFool;
+ suggestionFool.set_query("fool");
+
+ // namespace2 and namespace3 together have 2 results.
+ SuggestionSpecProto suggestion_spec;
+ suggestion_spec.set_prefix("f");
+ suggestion_spec.add_namespace_filters("namespace2");
+ suggestion_spec.add_namespace_filters("namespace3");
+ suggestion_spec.set_num_to_return(10);
+ suggestion_spec.mutable_scoring_spec()->set_scoring_match_type(
+ TermMatchType::PREFIX);
+
+ SuggestionResponse response = icing.SearchSuggestions(suggestion_spec);
+ ASSERT_THAT(response.status(), ProtoIsOk());
+ ASSERT_THAT(response.suggestions(),
+ UnorderedElementsAre(EqualsProto(suggestionFoo),
+ EqualsProto(suggestionFool)));
+}
+
+TEST_F(IcingSearchEngineTest,
+ SearchSuggestionsTest_OtherNamespaceDontContributeToHitCount) {
+ IcingSearchEngine icing(GetDefaultIcingOptions(), GetTestJniCache());
+ ASSERT_THAT(icing.Initialize().status(), ProtoIsOk());
+ ASSERT_THAT(icing.SetSchema(CreatePersonAndEmailSchema()).status(),
+ ProtoIsOk());
+
+ // Index 4 documents:
+ // namespace1 has 2 hits for term one;
+ // namespace2 has 2 hits for term two and 1 hit for term one.
+ DocumentProto document1 = DocumentBuilder()
+ .SetKey("namespace1", "uri1")
+ .SetSchema("Email")
+ .SetCreationTimestampMs(10)
+ .AddStringProperty("subject", "termone")
+ .Build();
+ DocumentProto document2 = DocumentBuilder()
+ .SetKey("namespace1", "uri2")
+ .SetSchema("Email")
+ .SetCreationTimestampMs(10)
+ .AddStringProperty("subject", "termone")
+ .Build();
+ DocumentProto document3 = DocumentBuilder()
+ .SetKey("namespace2", "uri2")
+ .SetSchema("Email")
+ .SetCreationTimestampMs(10)
+ .AddStringProperty("subject", "termone termtwo")
+ .Build();
+ DocumentProto document4 = DocumentBuilder()
+ .SetKey("namespace2", "uri3")
+ .SetSchema("Email")
+ .SetCreationTimestampMs(10)
+ .AddStringProperty("subject", "termtwo")
+ .Build();
+ ASSERT_THAT(icing.Put(document1).status(), ProtoIsOk());
+ ASSERT_THAT(icing.Put(document2).status(), ProtoIsOk());
+ ASSERT_THAT(icing.Put(document3).status(), ProtoIsOk());
+ ASSERT_THAT(icing.Put(document4).status(), ProtoIsOk());
+
+ SuggestionResponse::Suggestion suggestionTermOne;
+ suggestionTermOne.set_query("termone");
+ SuggestionResponse::Suggestion suggestionTermTwo;
+ suggestionTermTwo.set_query("termtwo");
+
+ // Only search suggestions for namespace2. The correct order should be
+ // {"termtwo", "termone"}. If we're not filtering out namespace1 when
+ // calculating our score, then it will be {"termone", "termtwo"}.
+ SuggestionSpecProto suggestion_spec;
+ suggestion_spec.set_prefix("t");
+ suggestion_spec.add_namespace_filters("namespace2");
+ suggestion_spec.set_num_to_return(10);
+ suggestion_spec.mutable_scoring_spec()->set_scoring_match_type(
+ TermMatchType::PREFIX);
+
+ SuggestionResponse response = icing.SearchSuggestions(suggestion_spec);
+ ASSERT_THAT(response.status(), ProtoIsOk());
+ ASSERT_THAT(response.suggestions(),
+ ElementsAre(EqualsProto(suggestionTermTwo),
+ EqualsProto(suggestionTermOne)));
+}
+
+TEST_F(IcingSearchEngineTest, SearchSuggestionsTest_DeletionTest) {
+ IcingSearchEngine icing(GetDefaultIcingOptions(), GetTestJniCache());
+ ASSERT_THAT(icing.Initialize().status(), ProtoIsOk());
+ ASSERT_THAT(icing.SetSchema(CreatePersonAndEmailSchema()).status(),
+ ProtoIsOk());
+
+ DocumentProto document1 = DocumentBuilder()
+ .SetKey("namespace1", "uri1")
+ .SetSchema("Email")
+ .SetCreationTimestampMs(10)
+ .AddStringProperty("subject", "fool")
+ .Build();
+ DocumentProto document2 = DocumentBuilder()
+ .SetKey("namespace2", "uri2")
+ .SetSchema("Email")
+ .SetCreationTimestampMs(10)
+ .AddStringProperty("subject", "fool")
+ .Build();
+ ASSERT_THAT(icing.Put(document1).status(), ProtoIsOk());
+ ASSERT_THAT(icing.Put(document2).status(), ProtoIsOk());
+
+ SuggestionResponse::Suggestion suggestionFool;
+ suggestionFool.set_query("fool");
+
+ // namespace1 has this suggestion
+ SuggestionSpecProto suggestion_spec;
+ suggestion_spec.set_prefix("f");
+ suggestion_spec.add_namespace_filters("namespace1");
+ suggestion_spec.set_num_to_return(10);
+ suggestion_spec.mutable_scoring_spec()->set_scoring_match_type(
+ TermMatchType::PREFIX);
+
+ SuggestionResponse response = icing.SearchSuggestions(suggestion_spec);
+ ASSERT_THAT(response.status(), ProtoIsOk());
+ ASSERT_THAT(response.suggestions(),
+ UnorderedElementsAre(EqualsProto(suggestionFool)));
+
+ // namespace2 has this suggestion
+ suggestion_spec.clear_namespace_filters();
+ suggestion_spec.add_namespace_filters("namespace2");
+ response = icing.SearchSuggestions(suggestion_spec);
+ ASSERT_THAT(response.status(), ProtoIsOk());
+ ASSERT_THAT(response.suggestions(),
+ UnorderedElementsAre(EqualsProto(suggestionFool)));
+
+  // Delete the document from namespace1.
+ EXPECT_THAT(icing.Delete("namespace1", "uri1").status(), ProtoIsOk());
+
+  // Now namespace1 returns no suggestions.
+ suggestion_spec.clear_namespace_filters();
+ suggestion_spec.add_namespace_filters("namespace1");
+ response = icing.SearchSuggestions(suggestion_spec);
+ ASSERT_THAT(response.status(), ProtoIsOk());
+ ASSERT_THAT(response.suggestions(), IsEmpty());
+
+  // namespace2 still has this suggestion, which proves that namespace1 finds
+  // nothing because we filter it out, not because the term no longer exists.
+ suggestion_spec.add_namespace_filters("namespace2");
+ response = icing.SearchSuggestions(suggestion_spec);
+ ASSERT_THAT(response.status(), ProtoIsOk());
+ ASSERT_THAT(response.suggestions(),
+ UnorderedElementsAre(EqualsProto(suggestionFool)));
+}
+
+TEST_F(IcingSearchEngineTest, SearchSuggestionsTest_ExpiredTest) {
+ DocumentProto document1 = DocumentBuilder()
+ .SetKey("namespace1", "uri1")
+ .SetSchema("Email")
+ .SetCreationTimestampMs(100)
+ .SetTtlMs(500)
+ .AddStringProperty("subject", "fool")
+ .Build();
+ DocumentProto document2 = DocumentBuilder()
+ .SetKey("namespace2", "uri2")
+ .SetSchema("Email")
+ .SetCreationTimestampMs(100)
+ .SetTtlMs(1000)
+ .AddStringProperty("subject", "fool")
+ .Build();
+ {
+ auto fake_clock = std::make_unique<FakeClock>();
+ fake_clock->SetSystemTimeMilliseconds(400);
+
+ TestIcingSearchEngine icing(GetDefaultIcingOptions(),
+ std::make_unique<Filesystem>(),
+ std::make_unique<IcingFilesystem>(),
+ std::move(fake_clock), GetTestJniCache());
+ EXPECT_THAT(icing.Initialize().status(), ProtoIsOk());
+ ASSERT_THAT(icing.SetSchema(CreatePersonAndEmailSchema()).status(),
+ ProtoIsOk());
+
+ ASSERT_THAT(icing.Put(document1).status(), ProtoIsOk());
+ ASSERT_THAT(icing.Put(document2).status(), ProtoIsOk());
+
+ SuggestionResponse::Suggestion suggestionFool;
+ suggestionFool.set_query("fool");
+
+ // namespace1 has this suggestion
+ SuggestionSpecProto suggestion_spec;
+ suggestion_spec.set_prefix("f");
+ suggestion_spec.add_namespace_filters("namespace1");
+ suggestion_spec.set_num_to_return(10);
+ suggestion_spec.mutable_scoring_spec()->set_scoring_match_type(
+ TermMatchType::PREFIX);
+
+ SuggestionResponse response = icing.SearchSuggestions(suggestion_spec);
+ ASSERT_THAT(response.status(), ProtoIsOk());
+ ASSERT_THAT(response.suggestions(),
+ UnorderedElementsAre(EqualsProto(suggestionFool)));
+
+ // namespace2 has this suggestion
+ suggestion_spec.clear_namespace_filters();
+ suggestion_spec.add_namespace_filters("namespace2");
+ response = icing.SearchSuggestions(suggestion_spec);
+ ASSERT_THAT(response.status(), ProtoIsOk());
+ ASSERT_THAT(response.suggestions(),
+ UnorderedElementsAre(EqualsProto(suggestionFool)));
+ }
+  // We reinitialize here so we can feed in a fake clock with a later time.
+ {
+ // Time needs to be past document1 creation time (100) + ttl (500) for it
+ // to count as "expired". document2 is not expired since its ttl is 1000.
+ auto fake_clock = std::make_unique<FakeClock>();
+ fake_clock->SetSystemTimeMilliseconds(800);
+
+ TestIcingSearchEngine icing(GetDefaultIcingOptions(),
+ std::make_unique<Filesystem>(),
+ std::make_unique<IcingFilesystem>(),
+ std::move(fake_clock), GetTestJniCache());
+ ASSERT_THAT(icing.Initialize().status(), ProtoIsOk());
+
+ SuggestionSpecProto suggestion_spec;
+ suggestion_spec.set_prefix("f");
+ suggestion_spec.add_namespace_filters("namespace1");
+ suggestion_spec.set_num_to_return(10);
+ suggestion_spec.mutable_scoring_spec()->set_scoring_match_type(
+ TermMatchType::PREFIX);
+
+    // Now namespace1 returns no suggestions.
+ suggestion_spec.clear_namespace_filters();
+ suggestion_spec.add_namespace_filters("namespace1");
+ SuggestionResponse response = icing.SearchSuggestions(suggestion_spec);
+ ASSERT_THAT(response.status(), ProtoIsOk());
+ ASSERT_THAT(response.suggestions(), IsEmpty());
+
+ // namespace2 still has this suggestion
+ SuggestionResponse::Suggestion suggestionFool;
+ suggestionFool.set_query("fool");
+
+ suggestion_spec.add_namespace_filters("namespace2");
+ response = icing.SearchSuggestions(suggestion_spec);
+ ASSERT_THAT(response.status(), ProtoIsOk());
+ ASSERT_THAT(response.suggestions(),
+ UnorderedElementsAre(EqualsProto(suggestionFool)));
+ }
+}
+
TEST_F(IcingSearchEngineTest, SearchSuggestionsTest_emptyPrefix) {
IcingSearchEngine icing(GetDefaultIcingOptions(), GetTestJniCache());
ASSERT_THAT(icing.Initialize().status(), ProtoIsOk());
@@ -8137,6 +8518,8 @@ TEST_F(IcingSearchEngineTest, SearchSuggestionsTest_emptyPrefix) {
SuggestionSpecProto suggestion_spec;
suggestion_spec.set_prefix("");
suggestion_spec.set_num_to_return(10);
+ suggestion_spec.mutable_scoring_spec()->set_scoring_match_type(
+ TermMatchType::PREFIX);
ASSERT_THAT(icing.SearchSuggestions(suggestion_spec).status(),
ProtoStatusIs(StatusProto::INVALID_ARGUMENT));
@@ -8149,6 +8532,8 @@ TEST_F(IcingSearchEngineTest, SearchSuggestionsTest_NonPositiveNumToReturn) {
SuggestionSpecProto suggestion_spec;
suggestion_spec.set_prefix("prefix");
suggestion_spec.set_num_to_return(0);
+ suggestion_spec.mutable_scoring_spec()->set_scoring_match_type(
+ TermMatchType::PREFIX);
ASSERT_THAT(icing.SearchSuggestions(suggestion_spec).status(),
ProtoStatusIs(StatusProto::INVALID_ARGUMENT));
@@ -8203,7 +8588,7 @@ TEST_F(IcingSearchEngineTest, MigrateToPortableFileBackedProtoLog) {
EXPECT_THAT(init_result.initialize_stats().document_store_data_status(),
Eq(InitializeStatsProto::NO_DATA_LOSS));
EXPECT_THAT(init_result.initialize_stats().document_store_recovery_cause(),
- Eq(InitializeStatsProto::NONE));
+ Eq(InitializeStatsProto::LEGACY_DOCUMENT_LOG_FORMAT));
EXPECT_THAT(init_result.initialize_stats().schema_store_recovery_cause(),
Eq(InitializeStatsProto::NONE));
EXPECT_THAT(init_result.initialize_stats().index_restoration_cause(),
diff --git a/icing/index/index-processor.cc b/icing/index/index-processor.cc
index 1aae732..207c033 100644
--- a/icing/index/index-processor.cc
+++ b/icing/index/index-processor.cc
@@ -73,9 +73,23 @@ libtextclassifier3::Status IndexProcessor::IndexDocument(
section.metadata.term_match_type, /*namespace_id=*/0);
for (std::string_view token : section.token_sequence) {
++num_tokens;
- std::string term = normalizer_.NormalizeTerm(token);
- // Add this term to Hit buffer.
- status = editor.BufferTerm(term.c_str());
+
+ switch (section.metadata.tokenizer) {
+ case StringIndexingConfig::TokenizerType::VERBATIM:
+ // data() is safe to use here because a token created from the
+ // VERBATIM tokenizer is the entire string value. The character at
+ // data() + token.length() is guaranteed to be a null char.
+ status = editor.BufferTerm(token.data());
+ break;
+ case StringIndexingConfig::TokenizerType::NONE:
+ ICING_LOG(WARNING)
+ << "Unexpected TokenizerType::NONE found when indexing document.";
+ [[fallthrough]];
+ case StringIndexingConfig::TokenizerType::PLAIN:
+ std::string normalized_term = normalizer_.NormalizeTerm(token);
+ status = editor.BufferTerm(normalized_term.c_str());
+ }
+
if (!status.ok()) {
// We've encountered a failure. Bail out. We'll mark this doc as deleted
// and signal a failure to the client.
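
// A minimal sketch (assumed semantics, not part of the patch) of the
// null-termination invariant that makes token.data() safe for VERBATIM
// tokens: the token is a std::string_view spanning the entire property
// value, and std::string guarantees a '\0' at data()[size()].
//
//   std::string value = "Hello, world!";            // full property value
//   std::string_view token(value);                  // VERBATIM token: whole value
//   assert(token.data()[token.length()] == '\0');   // safe to pass as a C string
//
// PLAIN tokens, by contrast, may be sub-views of the value, so they are
// normalized into an owning std::string before BufferTerm() is called.
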
diff --git a/icing/index/index-processor.h b/icing/index/index-processor.h
index c4b77b5..269e41c 100644
--- a/icing/index/index-processor.h
+++ b/icing/index/index-processor.h
@@ -69,8 +69,6 @@ class IndexProcessor {
IndexProcessor(const Normalizer* normalizer, Index* index, const Clock* clock)
: normalizer_(*normalizer), index_(index), clock_(*clock) {}
- std::string NormalizeToken(const Token& token);
-
const Normalizer& normalizer_;
Index* const index_;
const Clock& clock_;
diff --git a/icing/index/index-processor_benchmark.cc b/icing/index/index-processor_benchmark.cc
index 6e072c7..1aad7d0 100644
--- a/icing/index/index-processor_benchmark.cc
+++ b/icing/index/index-processor_benchmark.cc
@@ -16,7 +16,6 @@
#include "gmock/gmock.h"
#include "icing/document-builder.h"
#include "icing/file/filesystem.h"
-#include "icing/helpers/icu/icu-data-file-helper.h"
#include "icing/index/index-processor.h"
#include "icing/index/index.h"
#include "icing/legacy/core/icing-string-util.h"
@@ -24,6 +23,7 @@
#include "icing/schema/schema-util.h"
#include "icing/schema/section-manager.h"
#include "icing/testing/common-matchers.h"
+#include "icing/testing/icu-data-file-helper.h"
#include "icing/testing/test-data.h"
#include "icing/testing/tmp-directory.h"
#include "icing/tokenization/language-segmenter-factory.h"
diff --git a/icing/index/index-processor_test.cc b/icing/index/index-processor_test.cc
index 449bc3e..bd310de 100644
--- a/icing/index/index-processor_test.cc
+++ b/icing/index/index-processor_test.cc
@@ -30,7 +30,6 @@
#include "icing/absl_ports/str_join.h"
#include "icing/document-builder.h"
#include "icing/file/filesystem.h"
-#include "icing/helpers/icu/icu-data-file-helper.h"
#include "icing/index/hit/doc-hit-info.h"
#include "icing/index/index.h"
#include "icing/index/iterator/doc-hit-info-iterator.h"
@@ -49,6 +48,7 @@
#include "icing/store/document-id.h"
#include "icing/testing/common-matchers.h"
#include "icing/testing/fake-clock.h"
+#include "icing/testing/icu-data-file-helper.h"
#include "icing/testing/random-string.h"
#include "icing/testing/test-data.h"
#include "icing/testing/tmp-directory.h"
@@ -90,6 +90,8 @@ constexpr std::string_view kRepeatedProperty = "repeated";
constexpr std::string_view kSubProperty = "submessage";
constexpr std::string_view kNestedType = "NestedType";
constexpr std::string_view kNestedProperty = "nested";
+constexpr std::string_view kExactVerbatimProperty = "verbatimExact";
+constexpr std::string_view kPrefixedVerbatimProperty = "verbatimPrefixed";
constexpr DocumentId kDocumentId0 = 0;
constexpr DocumentId kDocumentId1 = 1;
@@ -98,6 +100,8 @@ constexpr SectionId kExactSectionId = 0;
constexpr SectionId kPrefixedSectionId = 1;
constexpr SectionId kRepeatedSectionId = 2;
constexpr SectionId kNestedSectionId = 3;
+constexpr SectionId kExactVerbatimSectionId = 4;
+constexpr SectionId kPrefixedVerbatimSectionId = 5;
using Cardinality = PropertyConfigProto::Cardinality;
using DataType = PropertyConfigProto::DataType;
@@ -106,21 +110,23 @@ using ::testing::Eq;
using ::testing::IsEmpty;
using ::testing::Test;
-constexpr PropertyConfigProto_DataType_Code TYPE_STRING =
- PropertyConfigProto_DataType_Code_STRING;
-constexpr PropertyConfigProto_DataType_Code TYPE_BYTES =
- PropertyConfigProto_DataType_Code_BYTES;
+constexpr PropertyConfigProto::DataType::Code TYPE_STRING =
+ PropertyConfigProto::DataType::STRING;
+constexpr PropertyConfigProto::DataType::Code TYPE_BYTES =
+ PropertyConfigProto::DataType::BYTES;
-constexpr PropertyConfigProto_Cardinality_Code CARDINALITY_OPTIONAL =
- PropertyConfigProto_Cardinality_Code_OPTIONAL;
-constexpr PropertyConfigProto_Cardinality_Code CARDINALITY_REPEATED =
- PropertyConfigProto_Cardinality_Code_REPEATED;
+constexpr PropertyConfigProto::Cardinality::Code CARDINALITY_OPTIONAL =
+ PropertyConfigProto::Cardinality::OPTIONAL;
+constexpr PropertyConfigProto::Cardinality::Code CARDINALITY_REPEATED =
+ PropertyConfigProto::Cardinality::REPEATED;
-constexpr StringIndexingConfig_TokenizerType_Code TOKENIZER_PLAIN =
- StringIndexingConfig_TokenizerType_Code_PLAIN;
+constexpr StringIndexingConfig::TokenizerType::Code TOKENIZER_PLAIN =
+ StringIndexingConfig::TokenizerType::PLAIN;
+constexpr StringIndexingConfig::TokenizerType::Code TOKENIZER_VERBATIM =
+ StringIndexingConfig::TokenizerType::VERBATIM;
-constexpr TermMatchType_Code MATCH_EXACT = TermMatchType_Code_EXACT_ONLY;
-constexpr TermMatchType_Code MATCH_PREFIX = TermMatchType_Code_PREFIX;
+constexpr TermMatchType::Code MATCH_EXACT = TermMatchType::EXACT_ONLY;
+constexpr TermMatchType::Code MATCH_PREFIX = TermMatchType::PREFIX;
class IndexProcessorTest : public Test {
protected:
@@ -180,6 +186,16 @@ class IndexProcessorTest : public Test {
.SetCardinality(CARDINALITY_REPEATED))
.AddProperty(
PropertyConfigBuilder()
+ .SetName(kExactVerbatimProperty)
+ .SetDataTypeString(MATCH_EXACT, TOKENIZER_VERBATIM)
+ .SetCardinality(CARDINALITY_REPEATED))
+ .AddProperty(
+ PropertyConfigBuilder()
+ .SetName(kPrefixedVerbatimProperty)
+ .SetDataTypeString(MATCH_PREFIX, TOKENIZER_VERBATIM)
+ .SetCardinality(CARDINALITY_REPEATED))
+ .AddProperty(
+ PropertyConfigBuilder()
.SetName(kSubProperty)
.SetDataTypeDocument(
kNestedType, /*index_nested_properties=*/true)
@@ -797,6 +813,95 @@ TEST_F(IndexProcessorTest, IndexingDocMergeFailureResets) {
EXPECT_THAT(index_->last_added_document_id(), Eq(doc_id));
}
+TEST_F(IndexProcessorTest, ExactVerbatimProperty) {
+ DocumentProto document =
+ DocumentBuilder()
+ .SetKey("icing", "fake_type/1")
+ .SetSchema(std::string(kFakeType))
+ .AddStringProperty(std::string(kExactVerbatimProperty),
+ "Hello, world!")
+ .Build();
+ ICING_ASSERT_OK_AND_ASSIGN(
+ TokenizedDocument tokenized_document,
+ TokenizedDocument::Create(schema_store_.get(), lang_segmenter_.get(),
+ document));
+ EXPECT_THAT(tokenized_document.num_tokens(), 1);
+
+ EXPECT_THAT(index_processor_->IndexDocument(tokenized_document, kDocumentId0),
+ IsOk());
+ EXPECT_THAT(index_->last_added_document_id(), Eq(kDocumentId0));
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<DocHitInfoIterator> itr,
+ index_->GetIterator("Hello, world!", kSectionIdMaskAll,
+ TermMatchType::EXACT_ONLY));
+ std::vector<DocHitInfo> hits = GetHits(std::move(itr));
+ std::unordered_map<SectionId, Hit::TermFrequency> expectedMap{
+ {kExactVerbatimSectionId, 1}};
+
+ EXPECT_THAT(hits, ElementsAre(EqualsDocHitInfoWithTermFrequency(
+ kDocumentId0, expectedMap)));
+}
+
+TEST_F(IndexProcessorTest, PrefixVerbatimProperty) {
+ DocumentProto document =
+ DocumentBuilder()
+ .SetKey("icing", "fake_type/1")
+ .SetSchema(std::string(kFakeType))
+ .AddStringProperty(std::string(kPrefixedVerbatimProperty),
+ "Hello, world!")
+ .Build();
+ ICING_ASSERT_OK_AND_ASSIGN(
+ TokenizedDocument tokenized_document,
+ TokenizedDocument::Create(schema_store_.get(), lang_segmenter_.get(),
+ document));
+ EXPECT_THAT(tokenized_document.num_tokens(), 1);
+
+ EXPECT_THAT(index_processor_->IndexDocument(tokenized_document, kDocumentId0),
+ IsOk());
+ EXPECT_THAT(index_->last_added_document_id(), Eq(kDocumentId0));
+
+ // We expect to match the document we indexed as "Hello, w" is a prefix
+ // of "Hello, world!"
+ ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<DocHitInfoIterator> itr,
+ index_->GetIterator("Hello, w", kSectionIdMaskAll,
+ TermMatchType::PREFIX));
+ std::vector<DocHitInfo> hits = GetHits(std::move(itr));
+ std::unordered_map<SectionId, Hit::TermFrequency> expectedMap{
+ {kPrefixedVerbatimSectionId, 1}};
+
+ EXPECT_THAT(hits, ElementsAre(EqualsDocHitInfoWithTermFrequency(
+ kDocumentId0, expectedMap)));
+}
+
+TEST_F(IndexProcessorTest, VerbatimPropertyDoesntMatchSubToken) {
+ DocumentProto document =
+ DocumentBuilder()
+ .SetKey("icing", "fake_type/1")
+ .SetSchema(std::string(kFakeType))
+ .AddStringProperty(std::string(kPrefixedVerbatimProperty),
+ "Hello, world!")
+ .Build();
+ ICING_ASSERT_OK_AND_ASSIGN(
+ TokenizedDocument tokenized_document,
+ TokenizedDocument::Create(schema_store_.get(), lang_segmenter_.get(),
+ document));
+ EXPECT_THAT(tokenized_document.num_tokens(), 1);
+
+ EXPECT_THAT(index_processor_->IndexDocument(tokenized_document, kDocumentId0),
+ IsOk());
+ EXPECT_THAT(index_->last_added_document_id(), Eq(kDocumentId0));
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<DocHitInfoIterator> itr,
+ index_->GetIterator("world", kSectionIdMaskAll, TermMatchType::PREFIX));
+ std::vector<DocHitInfo> hits = GetHits(std::move(itr));
+
+  // We should not have hits for term "world" as the index processor should
+  // create a sole token "Hello, world!" for the document.
+ EXPECT_THAT(hits, IsEmpty());
+}
+
} // namespace
} // namespace lib
diff --git a/icing/index/index.cc b/icing/index/index.cc
index 1bdab21..02ba699 100644
--- a/icing/index/index.cc
+++ b/icing/index/index.cc
@@ -71,24 +71,6 @@ IcingDynamicTrie::Options GetMainLexiconOptions() {
return IcingDynamicTrie::Options();
}
-// Helper function to check if a term is in the given namespaces.
-// TODO(tjbarron): Implement a method PropertyReadersAll.HasAnyProperty().
-bool IsTermInNamespaces(
- const IcingDynamicTrie::PropertyReadersAll& property_reader,
- uint32_t value_index, const std::vector<NamespaceId>& namespace_ids) {
- if (namespace_ids.empty()) {
- return true;
- }
- for (NamespaceId namespace_id : namespace_ids) {
- if (property_reader.HasProperty(GetNamespacePropertyId(namespace_id),
- value_index)) {
- return true;
- }
- }
-
- return false;
-}
-
enum class MergeAction { kTakeLiteTerm, kTakeMainTerm, kMergeTerms };
// Merge the TermMetadata from lite index and main index. If the term exists in
@@ -137,7 +119,7 @@ std::vector<TermMetadata> MergeAndRankTermMetadatas(
int total_est_hit_count =
lite_term_itr->hit_count + main_term_itr->hit_count;
PushToTermHeap(TermMetadata(std::move(lite_term_itr->content),
- total_est_hit_count),
+ total_est_hit_count),
num_to_return, merged_term_metadata_heap);
++lite_term_itr;
++main_term_itr;
@@ -228,32 +210,26 @@ Index::GetIterator(const std::string& term, SectionIdMask section_id_mask,
libtextclassifier3::StatusOr<std::vector<TermMetadata>>
Index::FindLiteTermsByPrefix(const std::string& prefix,
- const std::vector<NamespaceId>& namespace_ids) {
+ const NamespaceChecker* namespace_checker) {
// Finds all the terms that start with the given prefix in the lexicon.
IcingDynamicTrie::Iterator term_iterator(lite_index_->lexicon(),
prefix.c_str());
- // A property reader to help check if a term has some property.
- IcingDynamicTrie::PropertyReadersAll property_reader(lite_index_->lexicon());
-
std::vector<TermMetadata> term_metadata_list;
while (term_iterator.IsValid()) {
uint32_t term_value_index = term_iterator.GetValueIndex();
- // Skips the terms that don't exist in the given namespaces. We won't skip
- // any terms if namespace_ids is empty.
- if (!IsTermInNamespaces(property_reader, term_value_index, namespace_ids)) {
- term_iterator.Advance();
- continue;
- }
-
ICING_ASSIGN_OR_RETURN(
uint32_t term_id,
term_id_codec_->EncodeTvi(term_value_index, TviType::LITE),
absl_ports::InternalError("Failed to access terms in lexicon."));
-
- term_metadata_list.emplace_back(term_iterator.GetKey(),
- lite_index_->CountHits(term_id));
+ ICING_ASSIGN_OR_RETURN(int hit_count,
+ lite_index_->CountHits(term_id, namespace_checker));
+ if (hit_count > 0) {
+      // At least one document in the target namespaces has this term.
+ term_metadata_list.push_back(
+ TermMetadata(term_iterator.GetKey(), hit_count));
+ }
term_iterator.Advance();
}
@@ -261,21 +237,20 @@ Index::FindLiteTermsByPrefix(const std::string& prefix,
}
libtextclassifier3::StatusOr<std::vector<TermMetadata>>
-Index::FindTermsByPrefix(const std::string& prefix,
- const std::vector<NamespaceId>& namespace_ids,
- int num_to_return) {
+Index::FindTermsByPrefix(const std::string& prefix, int num_to_return,
+ TermMatchType::Code term_match_type,
+ const NamespaceChecker* namespace_checker) {
std::vector<TermMetadata> term_metadata_list;
if (num_to_return <= 0) {
return term_metadata_list;
}
-
// Get results from the LiteIndex.
ICING_ASSIGN_OR_RETURN(std::vector<TermMetadata> lite_term_metadata_list,
- FindLiteTermsByPrefix(prefix, namespace_ids));
+ FindLiteTermsByPrefix(prefix, namespace_checker));
// Append results from the MainIndex.
ICING_ASSIGN_OR_RETURN(std::vector<TermMetadata> main_term_metadata_list,
- main_index_->FindTermsByPrefix(prefix, namespace_ids));
-
+ main_index_->FindTermsByPrefix(prefix, term_match_type,
+ namespace_checker));
return MergeAndRankTermMetadatas(std::move(lite_term_metadata_list),
std::move(main_term_metadata_list),
num_to_return);
@@ -284,11 +259,7 @@ Index::FindTermsByPrefix(const std::string& prefix,
IndexStorageInfoProto Index::GetStorageInfo() const {
IndexStorageInfoProto storage_info;
int64_t directory_size = filesystem_->GetDiskUsage(options_.base_dir.c_str());
- if (directory_size != Filesystem::kBadFileSize) {
- storage_info.set_index_size(directory_size);
- } else {
- storage_info.set_index_size(-1);
- }
+ storage_info.set_index_size(Filesystem::SanitizeFileSize(directory_size));
storage_info = lite_index_->GetStorageInfo(std::move(storage_info));
return main_index_->GetStorageInfo(std::move(storage_info));
}
diff --git a/icing/index/index.h b/icing/index/index.h
index 693cf04..5c53349 100644
--- a/icing/index/index.h
+++ b/icing/index/index.h
@@ -32,10 +32,12 @@
#include "icing/index/term-id-codec.h"
#include "icing/index/term-metadata.h"
#include "icing/legacy/index/icing-filesystem.h"
+#include "icing/proto/debug.pb.h"
#include "icing/proto/storage.pb.h"
#include "icing/proto/term.pb.h"
#include "icing/schema/section.h"
#include "icing/store/document-id.h"
+#include "icing/store/namespace-checker.h"
#include "icing/store/namespace-id.h"
#include "icing/util/crc32.h"
@@ -142,9 +144,14 @@ class Index {
// index.
// verbosity > 0, more detailed debug information including raw postings
// lists.
- void GetDebugInfo(int verbosity, std::string* out) const {
- lite_index_->GetDebugInfo(verbosity, out);
- main_index_->GetDebugInfo(verbosity, out);
+ IndexDebugInfoProto GetDebugInfo(int verbosity) const {
+ IndexDebugInfoProto debug_info;
+ *debug_info.mutable_index_storage_info() = GetStorageInfo();
+ *debug_info.mutable_lite_index_info() =
+ lite_index_->GetDebugInfo(verbosity);
+ *debug_info.mutable_main_index_info() =
+ main_index_->GetDebugInfo(verbosity);
+ return debug_info;
}
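
  // A brief usage sketch; the fields read here are the ones exercised in
  // index_test.cc, and verbosity 0 omits the detailed flash index storage
  // info:
  //
  //   IndexDebugInfoProto info = index->GetDebugInfo(/*verbosity=*/0);
  //   int lite_size = info.lite_index_info().curr_size();
  //   DocumentId last_doc = info.main_index_info().last_added_document_id();
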
 // Returns the byte size of all the elements held in the index. This
@@ -181,17 +188,17 @@ class Index {
TermMatchType::Code term_match_type);
// Finds terms with the given prefix in the given namespaces. If
- // 'namespace_ids' is empty, returns results from all the namespaces. The
- // input prefix must be normalized, otherwise inaccurate results may be
- // returned. Results are not sorted specifically and are in their original
- // order. Number of results are no more than 'num_to_return'.
+  // 'namespace_checker' is nullptr, returns results from all the namespaces.
+  // Results are sorted in decreasing order of hit count. The number of
+  // results is no more than 'num_to_return'.
//
// Returns:
// A list of TermMetadata on success
// INTERNAL_ERROR if failed to access term data.
libtextclassifier3::StatusOr<std::vector<TermMetadata>> FindTermsByPrefix(
- const std::string& prefix, const std::vector<NamespaceId>& namespace_ids,
- int num_to_return);
+ const std::string& prefix, int num_to_return,
+ TermMatchType::Code term_match_type,
+ const NamespaceChecker* namespace_checker);
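
  // A minimal usage sketch. AlwaysTrueNamespaceCheckerImpl is the test-only
  // checker that accepts every document; a real caller would pass a checker
  // that consults the document store:
  //
  //   AlwaysTrueNamespaceCheckerImpl checker;
  //   ICING_ASSIGN_OR_RETURN(
  //       std::vector<TermMetadata> terms,
  //       index->FindTermsByPrefix("fo", /*num_to_return=*/10,
  //                                TermMatchType::PREFIX, &checker));
  //   // terms holds at most 10 entries, sorted by decreasing hit count.
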
// A class that can be used to add hits to the index.
//
@@ -267,7 +274,7 @@ class Index {
filesystem_(filesystem) {}
libtextclassifier3::StatusOr<std::vector<TermMetadata>> FindLiteTermsByPrefix(
- const std::string& prefix, const std::vector<NamespaceId>& namespace_ids);
+ const std::string& prefix, const NamespaceChecker* namespace_checker);
std::unique_ptr<LiteIndex> lite_index_;
std::unique_ptr<MainIndex> main_index_;
diff --git a/icing/index/index_test.cc b/icing/index/index_test.cc
index 00d5ad6..8355c01 100644
--- a/icing/index/index_test.cc
+++ b/icing/index/index_test.cc
@@ -31,10 +31,12 @@
#include "icing/index/iterator/doc-hit-info-iterator.h"
#include "icing/legacy/index/icing-filesystem.h"
#include "icing/legacy/index/icing-mock-filesystem.h"
+#include "icing/proto/debug.pb.h"
#include "icing/proto/storage.pb.h"
#include "icing/proto/term.pb.h"
#include "icing/schema/section.h"
#include "icing/store/document-id.h"
+#include "icing/testing/always-true-namespace-checker-impl.h"
#include "icing/testing/common-matchers.h"
#include "icing/testing/random-string.h"
#include "icing/testing/tmp-directory.h"
@@ -89,22 +91,9 @@ constexpr DocumentId kDocumentId5 = 5;
constexpr DocumentId kDocumentId6 = 6;
constexpr DocumentId kDocumentId7 = 7;
constexpr DocumentId kDocumentId8 = 8;
-constexpr DocumentId kDocumentId9 = 9;
-constexpr DocumentId kDocumentId10 = 10;
-constexpr DocumentId kDocumentId11 = 11;
-constexpr DocumentId kDocumentId12 = 12;
constexpr SectionId kSectionId2 = 2;
constexpr SectionId kSectionId3 = 3;
-// The value returned by IndexBlock::ApproximateFullPostingListHitsForBlock(
-// GetBlockSize(),
-// GetPostingListIndexBits(posting_list_utils::min_posting_list_size()));
-constexpr int kMinSizePlApproxHits = 3;
-// The value returned by IndexBlock::ApproximateFullPostingListHitsForBlock(
-// GetBlockSize(),
-// GetPostingListIndexBits(2 * posting_list_utils::min_posting_list_size()));
-constexpr int kSecondSmallestPlApproxHits = 7;
-
std::vector<DocHitInfo> GetHits(std::unique_ptr<DocHitInfoIterator> iterator) {
std::vector<DocHitInfo> infos;
while (iterator->Advance().ok()) {
@@ -920,148 +909,82 @@ TEST_F(IndexTest, InvalidHitBufferSize) {
TEST_F(IndexTest, FindTermByPrefixShouldReturnEmpty) {
Index::Editor edit = index_->Edit(kDocumentId0, kSectionId2,
TermMatchType::PREFIX, /*namespace_id=*/0);
+ AlwaysTrueNamespaceCheckerImpl impl;
EXPECT_THAT(edit.BufferTerm("fool"), IsOk());
EXPECT_THAT(edit.IndexAllBufferedTerms(), IsOk());
- EXPECT_THAT(index_->FindTermsByPrefix(/*prefix=*/"foo", /*namespace_ids=*/{0},
- /*num_to_return=*/0),
+ EXPECT_THAT(index_->FindTermsByPrefix(/*prefix=*/"foo", /*num_to_return=*/0,
+ TermMatchType::PREFIX, &impl),
IsOkAndHolds(IsEmpty()));
- EXPECT_THAT(index_->FindTermsByPrefix(/*prefix=*/"foo", /*namespace_ids=*/{0},
- /*num_to_return=*/-1),
+ EXPECT_THAT(index_->FindTermsByPrefix(/*prefix=*/"foo",
+ /*num_to_return=*/-1,
+ TermMatchType::PREFIX, &impl),
IsOkAndHolds(IsEmpty()));
ICING_ASSERT_OK(index_->Merge());
- EXPECT_THAT(index_->FindTermsByPrefix(/*prefix=*/"foo", /*namespace_ids=*/{0},
- /*num_to_return=*/0),
+ EXPECT_THAT(index_->FindTermsByPrefix(/*prefix=*/"foo",
+ /*num_to_return=*/0,
+ TermMatchType::PREFIX, &impl),
IsOkAndHolds(IsEmpty()));
- EXPECT_THAT(index_->FindTermsByPrefix(/*prefix=*/"foo", /*namespace_ids=*/{0},
- /*num_to_return=*/-1),
+ EXPECT_THAT(index_->FindTermsByPrefix(/*prefix=*/"foo",
+ /*num_to_return=*/-1,
+ TermMatchType::PREFIX, &impl),
IsOkAndHolds(IsEmpty()));
}
TEST_F(IndexTest, FindTermByPrefixShouldReturnCorrectResult) {
Index::Editor edit = index_->Edit(
kDocumentId0, kSectionId2, TermMatchType::EXACT_ONLY, /*namespace_id=*/0);
+ AlwaysTrueNamespaceCheckerImpl impl;
EXPECT_THAT(edit.BufferTerm("foo"), IsOk());
EXPECT_THAT(edit.BufferTerm("bar"), IsOk());
EXPECT_THAT(edit.IndexAllBufferedTerms(), IsOk());
// "b" should only match "bar" but not "foo".
- EXPECT_THAT(index_->FindTermsByPrefix(/*prefix=*/"b", /*namespace_ids=*/{0},
- /*num_to_return=*/10),
+ EXPECT_THAT(index_->FindTermsByPrefix(/*prefix=*/"b",
+ /*num_to_return=*/10,
+ TermMatchType::PREFIX, &impl),
IsOkAndHolds(UnorderedElementsAre(EqualsTermMetadata("bar", 1))));
ICING_ASSERT_OK(index_->Merge());
// "b" should only match "bar" but not "foo".
- EXPECT_THAT(index_->FindTermsByPrefix(/*prefix=*/"b", /*namespace_ids=*/{0},
- /*num_to_return=*/10),
- IsOkAndHolds(UnorderedElementsAre(
- EqualsTermMetadata("bar", kMinSizePlApproxHits))));
+ EXPECT_THAT(index_->FindTermsByPrefix(/*prefix=*/"b",
+ /*num_to_return=*/10,
+ TermMatchType::PREFIX, &impl),
+ IsOkAndHolds(UnorderedElementsAre(EqualsTermMetadata("bar", 1))));
}
TEST_F(IndexTest, FindTermByPrefixShouldRespectNumToReturn) {
Index::Editor edit = index_->Edit(
kDocumentId0, kSectionId2, TermMatchType::EXACT_ONLY, /*namespace_id=*/0);
+ AlwaysTrueNamespaceCheckerImpl impl;
EXPECT_THAT(edit.BufferTerm("fo"), IsOk());
EXPECT_THAT(edit.BufferTerm("foo"), IsOk());
EXPECT_THAT(edit.BufferTerm("fool"), IsOk());
EXPECT_THAT(edit.IndexAllBufferedTerms(), IsOk());
// We have 3 results but only 2 should be returned.
- EXPECT_THAT(index_->FindTermsByPrefix(/*prefix=*/"f", /*namespace_ids=*/{0},
- /*num_to_return=*/2),
+ EXPECT_THAT(index_->FindTermsByPrefix(/*prefix=*/"f",
+ /*num_to_return=*/2,
+ TermMatchType::PREFIX, &impl),
IsOkAndHolds(SizeIs(2)));
ICING_ASSERT_OK(index_->Merge());
// We have 3 results but only 2 should be returned.
- EXPECT_THAT(index_->FindTermsByPrefix(/*prefix=*/"f", /*namespace_ids=*/{0},
- /*num_to_return=*/2),
+ EXPECT_THAT(index_->FindTermsByPrefix(/*prefix=*/"f",
+ /*num_to_return=*/2,
+ TermMatchType::PREFIX, &impl),
IsOkAndHolds(SizeIs(2)));
}
-TEST_F(IndexTest, FindTermByPrefixShouldReturnTermsInOneNamespace) {
- Index::Editor edit1 =
- index_->Edit(kDocumentId0, kSectionId2, TermMatchType::EXACT_ONLY,
- /*namespace_id=*/0);
- EXPECT_THAT(edit1.BufferTerm("fo"), IsOk());
- EXPECT_THAT(edit1.BufferTerm("foo"), IsOk());
- EXPECT_THAT(edit1.IndexAllBufferedTerms(), IsOk());
-
- Index::Editor edit2 =
- index_->Edit(kDocumentId1, kSectionId2, TermMatchType::EXACT_ONLY,
- /*namespace_id=*/1);
- EXPECT_THAT(edit2.BufferTerm("fool"), IsOk());
- EXPECT_THAT(edit2.IndexAllBufferedTerms(), IsOk());
-
- // namespace with id 0 has 2 results.
- EXPECT_THAT(index_->FindTermsByPrefix(/*prefix=*/"f", /*namespace_ids=*/{0},
- /*num_to_return=*/10),
- IsOkAndHolds(UnorderedElementsAre(EqualsTermMetadata("fo", 1),
- EqualsTermMetadata("foo", 1))));
- // namespace with id 1 has 1 result.
- EXPECT_THAT(
- index_->FindTermsByPrefix(/*prefix=*/"f", /*namespace_ids=*/{1},
- /*num_to_return=*/10),
- IsOkAndHolds(UnorderedElementsAre(EqualsTermMetadata("fool", 1))));
-
- ICING_ASSERT_OK(index_->Merge());
-
- // namespace with id 0 has 2 results.
- EXPECT_THAT(index_->FindTermsByPrefix(/*prefix=*/"f", /*namespace_ids=*/{0},
- /*num_to_return=*/10),
- IsOkAndHolds(UnorderedElementsAre(
- EqualsTermMetadata("fo", kMinSizePlApproxHits),
- EqualsTermMetadata("foo", kMinSizePlApproxHits))));
- // namespace with id 1 has 1 result.
- EXPECT_THAT(index_->FindTermsByPrefix(/*prefix=*/"f", /*namespace_ids=*/{1},
- /*num_to_return=*/10),
- IsOkAndHolds(UnorderedElementsAre(
- EqualsTermMetadata("fool", kMinSizePlApproxHits))));
-}
-
-TEST_F(IndexTest, FindTermByPrefixShouldReturnTermsInMultipleNamespaces) {
- Index::Editor edit1 =
- index_->Edit(kDocumentId0, kSectionId2, TermMatchType::EXACT_ONLY,
- /*namespace_id=*/0);
- EXPECT_THAT(edit1.BufferTerm("fo"), IsOk());
- EXPECT_THAT(edit1.IndexAllBufferedTerms(), IsOk());
-
- Index::Editor edit2 =
- index_->Edit(kDocumentId1, kSectionId2, TermMatchType::EXACT_ONLY,
- /*namespace_id=*/1);
- EXPECT_THAT(edit2.BufferTerm("foo"), IsOk());
- EXPECT_THAT(edit2.IndexAllBufferedTerms(), IsOk());
-
- Index::Editor edit3 =
- index_->Edit(kDocumentId2, kSectionId2, TermMatchType::EXACT_ONLY,
- /*namespace_id=*/2);
- EXPECT_THAT(edit3.BufferTerm("fool"), IsOk());
- EXPECT_THAT(edit3.IndexAllBufferedTerms(), IsOk());
-
- // Should return "foo" and "fool" which are in namespaces with ids 1 and 2.
- EXPECT_THAT(
- index_->FindTermsByPrefix(/*prefix=*/"f", /*namespace_ids=*/{1, 2},
- /*num_to_return=*/10),
- IsOkAndHolds(UnorderedElementsAre(EqualsTermMetadata("foo", 1),
- EqualsTermMetadata("fool", 1))));
-
- ICING_ASSERT_OK(index_->Merge());
-
- EXPECT_THAT(
- index_->FindTermsByPrefix(/*prefix=*/"f", /*namespace_ids=*/{1, 2},
- /*num_to_return=*/10),
- IsOkAndHolds(UnorderedElementsAre(
- EqualsTermMetadata("foo", kMinSizePlApproxHits),
- EqualsTermMetadata("fool", kMinSizePlApproxHits))));
-}
-
TEST_F(IndexTest, FindTermByPrefixShouldReturnTermsInAllNamespaces) {
Index::Editor edit1 =
index_->Edit(kDocumentId0, kSectionId2, TermMatchType::EXACT_ONLY,
/*namespace_id=*/0);
+ AlwaysTrueNamespaceCheckerImpl impl;
EXPECT_THAT(edit1.BufferTerm("fo"), IsOk());
EXPECT_THAT(edit1.IndexAllBufferedTerms(), IsOk());
@@ -1078,8 +1001,9 @@ TEST_F(IndexTest, FindTermByPrefixShouldReturnTermsInAllNamespaces) {
EXPECT_THAT(edit3.IndexAllBufferedTerms(), IsOk());
// Should return "fo", "foo" and "fool" across all namespaces.
- EXPECT_THAT(index_->FindTermsByPrefix(/*prefix=*/"f", /*namespace_ids=*/{},
- /*num_to_return=*/10),
+ EXPECT_THAT(index_->FindTermsByPrefix(/*prefix=*/"f",
+ /*num_to_return=*/10,
+ TermMatchType::PREFIX, &impl),
IsOkAndHolds(UnorderedElementsAre(
EqualsTermMetadata("fo", 1), EqualsTermMetadata("foo", 1),
EqualsTermMetadata("fool", 1))));
@@ -1087,18 +1011,19 @@ TEST_F(IndexTest, FindTermByPrefixShouldReturnTermsInAllNamespaces) {
ICING_ASSERT_OK(index_->Merge());
// Should return "fo", "foo" and "fool" across all namespaces.
- EXPECT_THAT(index_->FindTermsByPrefix(/*prefix=*/"f", /*namespace_ids=*/{},
- /*num_to_return=*/10),
+ EXPECT_THAT(index_->FindTermsByPrefix(/*prefix=*/"f",
+ /*num_to_return=*/10,
+ TermMatchType::PREFIX, &impl),
IsOkAndHolds(UnorderedElementsAre(
- EqualsTermMetadata("fo", kMinSizePlApproxHits),
- EqualsTermMetadata("foo", kMinSizePlApproxHits),
- EqualsTermMetadata("fool", kMinSizePlApproxHits))));
+ EqualsTermMetadata("fo", 1), EqualsTermMetadata("foo", 1),
+ EqualsTermMetadata("fool", 1))));
}
TEST_F(IndexTest, FindTermByPrefixShouldReturnCorrectHitCount) {
Index::Editor edit1 =
index_->Edit(kDocumentId0, kSectionId2, TermMatchType::EXACT_ONLY,
/*namespace_id=*/0);
+ AlwaysTrueNamespaceCheckerImpl impl;
EXPECT_THAT(edit1.BufferTerm("foo"), IsOk());
EXPECT_THAT(edit1.BufferTerm("fool"), IsOk());
EXPECT_THAT(edit1.IndexAllBufferedTerms(), IsOk());
@@ -1110,20 +1035,19 @@ TEST_F(IndexTest, FindTermByPrefixShouldReturnCorrectHitCount) {
EXPECT_THAT(edit2.IndexAllBufferedTerms(), IsOk());
// 'foo' has 1 hit, 'fool' has 2 hits.
- EXPECT_THAT(index_->FindTermsByPrefix(/*prefix=*/"f", /*namespace_ids=*/{0},
- /*num_to_return=*/10),
+ EXPECT_THAT(index_->FindTermsByPrefix(/*prefix=*/"f",
+ /*num_to_return=*/10,
+ TermMatchType::PREFIX, &impl),
IsOkAndHolds(ElementsAre(EqualsTermMetadata("fool", 2),
EqualsTermMetadata("foo", 1))));
ICING_ASSERT_OK(index_->Merge());
- // foo's one hit should fit on a min-sized pl, fool's two hits should also fit
- // on a min-sized pl.
- EXPECT_THAT(index_->FindTermsByPrefix(/*prefix=*/"f", /*namespace_ids=*/{0},
- /*num_to_return=*/10),
- IsOkAndHolds(UnorderedElementsAre(
- EqualsTermMetadata("foo", kMinSizePlApproxHits),
- EqualsTermMetadata("fool", kMinSizePlApproxHits))));
+ EXPECT_THAT(index_->FindTermsByPrefix(/*prefix=*/"f",
+ /*num_to_return=*/10,
+ TermMatchType::PREFIX, &impl),
+ IsOkAndHolds(ElementsAre(EqualsTermMetadata("fool", 2),
+ EqualsTermMetadata("foo", 1))));
}
TEST_F(IndexTest, FindTermByPrefixShouldReturnInOrder) {
@@ -1132,6 +1056,7 @@ TEST_F(IndexTest, FindTermByPrefixShouldReturnInOrder) {
Index::Editor edit1 =
index_->Edit(kDocumentId0, kSectionId2, TermMatchType::EXACT_ONLY,
/*namespace_id=*/0);
+ AlwaysTrueNamespaceCheckerImpl impl;
EXPECT_THAT(edit1.BufferTerm("term-one"), IsOk());
EXPECT_THAT(edit1.BufferTerm("term-two"), IsOk());
EXPECT_THAT(edit1.BufferTerm("term-three"), IsOk());
@@ -1181,8 +1106,9 @@ TEST_F(IndexTest, FindTermByPrefixShouldReturnInOrder) {
EXPECT_THAT(edit6.IndexAllBufferedTerms(), IsOk());
 // verify the order in the lite index is correct.
- EXPECT_THAT(index_->FindTermsByPrefix(/*prefix=*/"t", /*namespace_ids=*/{0},
- /*num_to_return=*/10),
+ EXPECT_THAT(index_->FindTermsByPrefix(/*prefix=*/"t",
+ /*num_to_return=*/10,
+ TermMatchType::PREFIX, &impl),
IsOkAndHolds(ElementsAre(EqualsTermMetadata("term-six", 6),
EqualsTermMetadata("term-five", 5),
EqualsTermMetadata("term-four", 4),
@@ -1192,93 +1118,97 @@ TEST_F(IndexTest, FindTermByPrefixShouldReturnInOrder) {
ICING_ASSERT_OK(index_->Merge());
- // Since most of term has same approx hit count, we don't verify order in the
- // main index.
- EXPECT_THAT(index_->FindTermsByPrefix(/*prefix=*/"t", /*namespace_ids=*/{0},
- /*num_to_return=*/10),
- IsOkAndHolds(UnorderedElementsAre(
- EqualsTermMetadata("term-six", kSecondSmallestPlApproxHits),
- EqualsTermMetadata("term-five", kSecondSmallestPlApproxHits),
- EqualsTermMetadata("term-four", kMinSizePlApproxHits),
- EqualsTermMetadata("term-three", kMinSizePlApproxHits),
- EqualsTermMetadata("term-two", kMinSizePlApproxHits),
- EqualsTermMetadata("term-one", kMinSizePlApproxHits))));
-
- // keep push terms to the lite index. For term 1-4, since they has same hit
- // count kMinSizePlApproxHits, we will push 4 term-one, 3 term-two, 2
- // term-three and one term-four to make them in reverse order. And for term
- // 5 & 6, we will push 2 term-five and one term-six.
+ EXPECT_THAT(index_->FindTermsByPrefix(/*prefix=*/"t",
+ /*num_to_return=*/10,
+ TermMatchType::PREFIX, &impl),
+ IsOkAndHolds(ElementsAre(EqualsTermMetadata("term-six", 6),
+ EqualsTermMetadata("term-five", 5),
+ EqualsTermMetadata("term-four", 4),
+ EqualsTermMetadata("term-three", 3),
+ EqualsTermMetadata("term-two", 2),
+ EqualsTermMetadata("term-one", 1))));
+
+  // Keep pushing terms to the lite index. We will add 2 more documents to
+  // term-five, term-three and term-one. The output order should be
+  // 5-6-3-4-1-2.
Index::Editor edit7 =
index_->Edit(kDocumentId7, kSectionId2, TermMatchType::EXACT_ONLY,
/*namespace_id=*/0);
EXPECT_THAT(edit7.BufferTerm("term-one"), IsOk());
- EXPECT_THAT(edit7.BufferTerm("term-two"), IsOk());
EXPECT_THAT(edit7.BufferTerm("term-three"), IsOk());
- EXPECT_THAT(edit7.BufferTerm("term-four"), IsOk());
+ EXPECT_THAT(edit7.BufferTerm("term-five"), IsOk());
EXPECT_THAT(edit7.IndexAllBufferedTerms(), IsOk());
Index::Editor edit8 =
index_->Edit(kDocumentId8, kSectionId2, TermMatchType::EXACT_ONLY,
/*namespace_id=*/0);
EXPECT_THAT(edit8.BufferTerm("term-one"), IsOk());
- EXPECT_THAT(edit8.BufferTerm("term-two"), IsOk());
EXPECT_THAT(edit8.BufferTerm("term-three"), IsOk());
+ EXPECT_THAT(edit8.BufferTerm("term-five"), IsOk());
EXPECT_THAT(edit8.IndexAllBufferedTerms(), IsOk());
- Index::Editor edit9 =
- index_->Edit(kDocumentId9, kSectionId2, TermMatchType::EXACT_ONLY,
- /*namespace_id=*/0);
- EXPECT_THAT(edit9.BufferTerm("term-one"), IsOk());
- EXPECT_THAT(edit9.BufferTerm("term-two"), IsOk());
- EXPECT_THAT(edit9.IndexAllBufferedTerms(), IsOk());
+  // Verify the combined lite index and main index results are in the correct
+  // order.
+ EXPECT_THAT(
+ index_->FindTermsByPrefix(/*prefix=*/"t", /*num_to_return=*/10,
+ TermMatchType::PREFIX, &impl),
+ IsOkAndHolds(ElementsAre(
+ EqualsTermMetadata("term-five", 7), EqualsTermMetadata("term-six", 6),
+ EqualsTermMetadata("term-three", 5),
+ EqualsTermMetadata("term-four", 4), EqualsTermMetadata("term-one", 3),
+ EqualsTermMetadata("term-two", 2))));
- Index::Editor edit10 =
- index_->Edit(kDocumentId10, kSectionId2, TermMatchType::EXACT_ONLY,
- /*namespace_id=*/0);
- EXPECT_THAT(edit10.BufferTerm("term-one"), IsOk());
- EXPECT_THAT(edit10.IndexAllBufferedTerms(), IsOk());
+ // Get the first three terms.
+ EXPECT_THAT(index_->FindTermsByPrefix(/*prefix=*/"t",
+ /*num_to_return=*/3,
+ TermMatchType::PREFIX, &impl),
+ IsOkAndHolds(ElementsAre(EqualsTermMetadata("term-five", 7),
+ EqualsTermMetadata("term-six", 6),
+ EqualsTermMetadata("term-three", 5))));
+}
- Index::Editor edit11 =
- index_->Edit(kDocumentId11, kSectionId2, TermMatchType::EXACT_ONLY,
+TEST_F(IndexTest, FindTermByPrefix_InTermMatchTypePrefix_ShouldReturnInOrder) {
+ Index::Editor edit1 =
+ index_->Edit(kDocumentId0, kSectionId2, TermMatchType::PREFIX,
/*namespace_id=*/0);
- EXPECT_THAT(edit11.BufferTerm("term-five"), IsOk());
- EXPECT_THAT(edit11.BufferTerm("term-six"), IsOk());
- EXPECT_THAT(edit11.IndexAllBufferedTerms(), IsOk());
+ AlwaysTrueNamespaceCheckerImpl impl;
+ EXPECT_THAT(edit1.BufferTerm("fo"), IsOk());
+ EXPECT_THAT(edit1.IndexAllBufferedTerms(), IsOk());
- Index::Editor edit12 =
- index_->Edit(kDocumentId12, kSectionId2, TermMatchType::EXACT_ONLY,
+ Index::Editor edit2 =
+ index_->Edit(kDocumentId2, kSectionId2, TermMatchType::PREFIX,
/*namespace_id=*/0);
- EXPECT_THAT(edit12.BufferTerm("term-five"), IsOk());
- EXPECT_THAT(edit12.IndexAllBufferedTerms(), IsOk());
+ EXPECT_THAT(edit2.BufferTerm("foo"), IsOk());
+ EXPECT_THAT(edit2.IndexAllBufferedTerms(), IsOk());
- // verify the combination of lite index and main index is in correct order.
- EXPECT_THAT(
- index_->FindTermsByPrefix(/*prefix=*/"t", /*namespace_ids=*/{0},
- /*num_to_return=*/10),
- IsOkAndHolds(ElementsAre(
- EqualsTermMetadata("term-five",
- kSecondSmallestPlApproxHits + 2), // 9
- EqualsTermMetadata("term-six", kSecondSmallestPlApproxHits + 1), // 8
- EqualsTermMetadata("term-one", kMinSizePlApproxHits + 4), // 7
- EqualsTermMetadata("term-two", kMinSizePlApproxHits + 3), // 6
- EqualsTermMetadata("term-three", kMinSizePlApproxHits + 2), // 5
- EqualsTermMetadata("term-four", kMinSizePlApproxHits + 1)))); // 4
+ Index::Editor edit3 =
+ index_->Edit(kDocumentId3, kSectionId2, TermMatchType::PREFIX,
+ /*namespace_id=*/0);
+ EXPECT_THAT(edit3.BufferTerm("fool"), IsOk());
+ EXPECT_THAT(edit3.IndexAllBufferedTerms(), IsOk());
- // Get the first three terms.
- EXPECT_THAT(
- index_->FindTermsByPrefix(/*prefix=*/"t", /*namespace_ids=*/{0},
- /*num_to_return=*/3),
- IsOkAndHolds(ElementsAre(
- EqualsTermMetadata("term-five",
- kSecondSmallestPlApproxHits + 2), // 9
- EqualsTermMetadata("term-six", kSecondSmallestPlApproxHits + 1), // 8
- EqualsTermMetadata("term-one", kMinSizePlApproxHits + 4)))); // 7
+ ICING_ASSERT_OK(index_->Merge());
+  // Verify the order in the posting lists is correct:
+  // "fo"   { {doc0, exact_hit}, {doc2, prefix_hit}, {doc3, prefix_hit} }
+  // "foo"  { {doc2, exact_hit}, {doc3, prefix_hit} }
+  // "fool" { {doc3, exact_hit} }
+ EXPECT_THAT(index_->FindTermsByPrefix(/*prefix=*/"f",
+ /*num_to_return=*/10,
+ TermMatchType::PREFIX, &impl),
+ IsOkAndHolds(ElementsAre(EqualsTermMetadata("fo", 3),
+ EqualsTermMetadata("foo", 2),
+ EqualsTermMetadata("fool", 1))));
+  // Find by exact match only; all terms should have equal hit counts.
+ EXPECT_THAT(index_->FindTermsByPrefix(/*prefix=*/"f", /*num_to_return=*/10,
+ TermMatchType::EXACT_ONLY, &impl),
+ IsOkAndHolds(UnorderedElementsAre(
+ EqualsTermMetadata("fo", 1), EqualsTermMetadata("foo", 1),
+ EqualsTermMetadata("fool", 1))));
}
-TEST_F(IndexTest, FindTermByPrefixShouldReturnApproximateHitCountForMain) {
+TEST_F(IndexTest, FindTermByPrefixShouldReturnHitCountForMain) {
Index::Editor edit =
index_->Edit(kDocumentId0, kSectionId2, TermMatchType::EXACT_ONLY,
/*namespace_id=*/0);
+ AlwaysTrueNamespaceCheckerImpl impl;
EXPECT_THAT(edit.BufferTerm("foo"), IsOk());
EXPECT_THAT(edit.BufferTerm("fool"), IsOk());
EXPECT_THAT(edit.IndexAllBufferedTerms(), IsOk());
@@ -1313,25 +1243,26 @@ TEST_F(IndexTest, FindTermByPrefixShouldReturnApproximateHitCountForMain) {
EXPECT_THAT(edit.IndexAllBufferedTerms(), IsOk());
// 'foo' has 1 hit, 'fool' has 8 hits.
- EXPECT_THAT(index_->FindTermsByPrefix(/*prefix=*/"f", /*namespace_ids=*/{0},
- /*num_to_return=*/10),
+ EXPECT_THAT(index_->FindTermsByPrefix(/*prefix=*/"f",
+ /*num_to_return=*/10,
+ TermMatchType::PREFIX, &impl),
IsOkAndHolds(ElementsAre(EqualsTermMetadata("fool", 8),
EqualsTermMetadata("foo", 1))));
ICING_ASSERT_OK(index_->Merge());
- // foo's hits should fit on a single pl. fool's hits will need two pls.
- EXPECT_THAT(index_->FindTermsByPrefix(/*prefix=*/"f", /*namespace_ids=*/{0},
- /*num_to_return=*/10),
- IsOkAndHolds(UnorderedElementsAre(
- EqualsTermMetadata("foo", kMinSizePlApproxHits),
- EqualsTermMetadata("fool", kSecondSmallestPlApproxHits))));
+ EXPECT_THAT(
+ index_->FindTermsByPrefix(/*prefix=*/"f", /*num_to_return=*/10,
+ TermMatchType::PREFIX, &impl),
+ IsOkAndHolds(UnorderedElementsAre(EqualsTermMetadata("foo", 1),
+ EqualsTermMetadata("fool", 8))));
}
TEST_F(IndexTest, FindTermByPrefixShouldReturnCombinedHitCount) {
Index::Editor edit =
index_->Edit(kDocumentId0, kSectionId2, TermMatchType::EXACT_ONLY,
/*namespace_id=*/0);
+ AlwaysTrueNamespaceCheckerImpl impl;
EXPECT_THAT(edit.BufferTerm("foo"), IsOk());
EXPECT_THAT(edit.BufferTerm("fool"), IsOk());
EXPECT_THAT(edit.IndexAllBufferedTerms(), IsOk());
@@ -1343,19 +1274,18 @@ TEST_F(IndexTest, FindTermByPrefixShouldReturnCombinedHitCount) {
EXPECT_THAT(edit.BufferTerm("fool"), IsOk());
EXPECT_THAT(edit.IndexAllBufferedTerms(), IsOk());
- // 'foo' has 1 hit in the main index, 'fool' has 1 hit in the main index and
- // 1 hit in the lite index.
- EXPECT_THAT(index_->FindTermsByPrefix(/*prefix=*/"f", /*namespace_ids=*/{0},
- /*num_to_return=*/10),
- IsOkAndHolds(ElementsAre(
- EqualsTermMetadata("fool", kMinSizePlApproxHits + 1),
- EqualsTermMetadata("foo", kMinSizePlApproxHits))));
+ EXPECT_THAT(index_->FindTermsByPrefix(/*prefix=*/"f", /*num_to_return=*/10,
+ TermMatchType::PREFIX, &impl),
+ IsOkAndHolds(ElementsAre(EqualsTermMetadata("fool", 2),
+ EqualsTermMetadata("foo", 1))));
}
TEST_F(IndexTest, FindTermByPrefixShouldReturnTermsFromBothIndices) {
Index::Editor edit =
index_->Edit(kDocumentId0, kSectionId2, TermMatchType::EXACT_ONLY,
/*namespace_id=*/0);
+ AlwaysTrueNamespaceCheckerImpl impl;
+
EXPECT_THAT(edit.BufferTerm("foo"), IsOk());
EXPECT_THAT(edit.IndexAllBufferedTerms(), IsOk());
@@ -1368,10 +1298,10 @@ TEST_F(IndexTest, FindTermByPrefixShouldReturnTermsFromBothIndices) {
// 'foo' has 1 hit in the main index, 'fool' has 1 hit in the lite index.
EXPECT_THAT(
- index_->FindTermsByPrefix(/*prefix=*/"f", /*namespace_ids=*/{0},
- /*num_to_return=*/10),
- IsOkAndHolds(ElementsAre(EqualsTermMetadata("foo", kMinSizePlApproxHits),
- EqualsTermMetadata("fool", 1))));
+ index_->FindTermsByPrefix(/*prefix=*/"f", /*num_to_return=*/10,
+ TermMatchType::PREFIX, &impl),
+ IsOkAndHolds(UnorderedElementsAre(EqualsTermMetadata("foo", 1),
+ EqualsTermMetadata("fool", 1))));
}
TEST_F(IndexTest, GetElementsSize) {
@@ -1465,12 +1395,14 @@ TEST_F(IndexTest, GetDebugInfo) {
EXPECT_THAT(edit.IndexAllBufferedTerms(), IsOk());
edit = index_->Edit(kDocumentId1, kSectionId3, TermMatchType::PREFIX,
/*namespace_id=*/0);
+ index_->set_last_added_document_id(kDocumentId1);
ASSERT_THAT(edit.BufferTerm("foot"), IsOk());
EXPECT_THAT(edit.IndexAllBufferedTerms(), IsOk());
ICING_ASSERT_OK(index_->Merge());
edit = index_->Edit(kDocumentId2, kSectionId2, TermMatchType::EXACT_ONLY,
/*namespace_id=*/0);
+ index_->set_last_added_document_id(kDocumentId2);
ASSERT_THAT(edit.BufferTerm("footer"), IsOk());
EXPECT_THAT(edit.IndexAllBufferedTerms(), IsOk());
edit = index_->Edit(kDocumentId2, kSectionId3, TermMatchType::PREFIX,
@@ -1478,40 +1410,45 @@ TEST_F(IndexTest, GetDebugInfo) {
ASSERT_THAT(edit.BufferTerm("foo"), IsOk());
EXPECT_THAT(edit.IndexAllBufferedTerms(), IsOk());
- std::string out0;
- index_->GetDebugInfo(/*verbosity=*/0, &out0);
- EXPECT_THAT(out0, Not(IsEmpty()));
+ IndexDebugInfoProto out0 = index_->GetDebugInfo(/*verbosity=*/0);
+ EXPECT_FALSE(out0.main_index_info().has_flash_index_storage_info());
+ EXPECT_THAT(out0.main_index_info().last_added_document_id(),
+ Eq(kDocumentId1));
+ EXPECT_THAT(out0.lite_index_info().curr_size(), Eq(2));
+ EXPECT_THAT(out0.lite_index_info().last_added_document_id(),
+ Eq(kDocumentId2));
- std::string out1;
- index_->GetDebugInfo(/*verbosity=*/1, &out1);
- EXPECT_THAT(out1, SizeIs(Gt(out0.size())));
+ IndexDebugInfoProto out1 = index_->GetDebugInfo(/*verbosity=*/1);
+ EXPECT_THAT(out1.main_index_info().flash_index_storage_info(),
+ Not(IsEmpty()));
 // Add one more doc to the lite index. Debug info should change.
edit = index_->Edit(kDocumentId3, kSectionId2, TermMatchType::EXACT_ONLY,
/*namespace_id=*/0);
+ index_->set_last_added_document_id(kDocumentId3);
ASSERT_THAT(edit.BufferTerm("far"), IsOk());
EXPECT_THAT(edit.IndexAllBufferedTerms(), IsOk());
- std::string out2;
- index_->GetDebugInfo(/*verbosity=*/0, &out2);
- EXPECT_THAT(out2, Ne(out0));
-
- std::string out3;
- index_->GetDebugInfo(/*verbosity=*/1, &out3);
- EXPECT_THAT(out3, Ne(out1));
+ IndexDebugInfoProto out2 = index_->GetDebugInfo(/*verbosity=*/0);
+ EXPECT_THAT(out2.lite_index_info().curr_size(), Eq(3));
+ EXPECT_THAT(out2.lite_index_info().last_added_document_id(),
+ Eq(kDocumentId3));
 // Merge into the main index. Debug info should change again.
ICING_ASSERT_OK(index_->Merge());
- std::string out4;
- index_->GetDebugInfo(/*verbosity=*/0, &out4);
- EXPECT_THAT(out4, Ne(out0));
- EXPECT_THAT(out4, Ne(out2));
-
- std::string out5;
- index_->GetDebugInfo(/*verbosity=*/1, &out5);
- EXPECT_THAT(out5, Ne(out1));
- EXPECT_THAT(out5, Ne(out3));
+ IndexDebugInfoProto out3 = index_->GetDebugInfo(/*verbosity=*/0);
+ EXPECT_TRUE(out3.has_index_storage_info());
+ EXPECT_THAT(out3.main_index_info().lexicon_info(), Not(IsEmpty()));
+ EXPECT_THAT(out3.main_index_info().last_added_document_id(),
+ Eq(kDocumentId3));
+ EXPECT_THAT(out3.lite_index_info().curr_size(), Eq(0));
+ EXPECT_THAT(out3.lite_index_info().hit_buffer_size(), Gt(0));
+ EXPECT_THAT(out3.lite_index_info().last_added_document_id(),
+ Eq(kInvalidDocumentId));
+ EXPECT_THAT(out3.lite_index_info().searchable_end(), Eq(0));
+ EXPECT_THAT(out3.lite_index_info().index_crc(), Gt(0));
+ EXPECT_THAT(out3.lite_index_info().lexicon_info(), Not(IsEmpty()));
}
TEST_F(IndexTest, BackfillingMultipleTermsSucceeds) {
diff --git a/icing/index/iterator/doc-hit-info-iterator-section-restrict_test.cc b/icing/index/iterator/doc-hit-info-iterator-section-restrict_test.cc
index 43a846b..7c6d924 100644
--- a/icing/index/iterator/doc-hit-info-iterator-section-restrict_test.cc
+++ b/icing/index/iterator/doc-hit-info-iterator-section-restrict_test.cc
@@ -48,13 +48,13 @@ using ::testing::ElementsAreArray;
using ::testing::Eq;
using ::testing::IsEmpty;
-constexpr PropertyConfigProto_Cardinality_Code CARDINALITY_OPTIONAL =
- PropertyConfigProto_Cardinality_Code_OPTIONAL;
+constexpr PropertyConfigProto::Cardinality::Code CARDINALITY_OPTIONAL =
+ PropertyConfigProto::Cardinality::OPTIONAL;
-constexpr StringIndexingConfig_TokenizerType_Code TOKENIZER_PLAIN =
- StringIndexingConfig_TokenizerType_Code_PLAIN;
+constexpr StringIndexingConfig::TokenizerType::Code TOKENIZER_PLAIN =
+ StringIndexingConfig::TokenizerType::PLAIN;
-constexpr TermMatchType_Code MATCH_EXACT = TermMatchType_Code_EXACT_ONLY;
+constexpr TermMatchType::Code MATCH_EXACT = TermMatchType::EXACT_ONLY;
class DocHitInfoIteratorSectionRestrictTest : public ::testing::Test {
protected:
diff --git a/icing/index/lite/doc-hit-info-iterator-term-lite.cc b/icing/index/lite/doc-hit-info-iterator-term-lite.cc
index 08df4fc..f215d63 100644
--- a/icing/index/lite/doc-hit-info-iterator-term-lite.cc
+++ b/icing/index/lite/doc-hit-info-iterator-term-lite.cc
@@ -77,7 +77,8 @@ libtextclassifier3::Status DocHitInfoIteratorTermLiteExact::RetrieveMoreHits() {
ICING_ASSIGN_OR_RETURN(uint32_t term_id,
term_id_codec_->EncodeTvi(tvi, TviType::LITE));
lite_index_->AppendHits(term_id, section_restrict_mask_,
- /*only_from_prefix_sections=*/false, &cached_hits_);
+ /*only_from_prefix_sections=*/false,
+ /*namespace_checker=*/nullptr, &cached_hits_);
cached_hits_idx_ = 0;
return libtextclassifier3::Status::OK;
}
@@ -100,7 +101,7 @@ DocHitInfoIteratorTermLitePrefix::RetrieveMoreHits() {
term_id_codec_->EncodeTvi(it.GetValueIndex(), TviType::LITE));
lite_index_->AppendHits(term_id, section_restrict_mask_,
/*only_from_prefix_sections=*/!exact_match,
- &cached_hits_);
+ /*namespace_checker=*/nullptr, &cached_hits_);
++terms_matched;
}
if (terms_matched > 1) {
diff --git a/icing/index/lite/lite-index.cc b/icing/index/lite/lite-index.cc
index 9e4ac28..a5c6baf 100644
--- a/icing/index/lite/lite-index.cc
+++ b/icing/index/lite/lite-index.cc
@@ -336,9 +336,12 @@ libtextclassifier3::StatusOr<uint32_t> LiteIndex::GetTermId(
int LiteIndex::AppendHits(uint32_t term_id, SectionIdMask section_id_mask,
bool only_from_prefix_sections,
+ const NamespaceChecker* namespace_checker,
std::vector<DocHitInfo>* hits_out) {
int count = 0;
DocumentId last_document_id = kInvalidDocumentId;
+ // Record whether the last document belongs to the given namespaces.
+ bool last_document_in_namespace = false;
for (uint32_t idx = Seek(term_id); idx < header_->cur_size(); idx++) {
TermIdHitPair term_id_hit_pair(
hit_buffer_.array_cast<TermIdHitPair>()[idx]);
@@ -355,22 +358,31 @@ int LiteIndex::AppendHits(uint32_t term_id, SectionIdMask section_id_mask,
}
DocumentId document_id = hit.document_id();
if (document_id != last_document_id) {
+ last_document_id = document_id;
+ last_document_in_namespace =
+ namespace_checker == nullptr ||
+ namespace_checker->BelongsToTargetNamespaces(document_id);
+ if (!last_document_in_namespace) {
+        // The document is removed, expired, or does not belong to the target
+        // namespaces.
+ continue;
+ }
++count;
if (hits_out != nullptr) {
hits_out->push_back(DocHitInfo(document_id));
}
- last_document_id = document_id;
}
- if (hits_out != nullptr) {
+ if (hits_out != nullptr && last_document_in_namespace) {
hits_out->back().UpdateSection(hit.section_id(), hit.term_frequency());
}
}
return count;
}
-int LiteIndex::CountHits(uint32_t term_id) {
+libtextclassifier3::StatusOr<int> LiteIndex::CountHits(
+ uint32_t term_id, const NamespaceChecker* namespace_checker) {
return AppendHits(term_id, kSectionIdMaskAll,
- /*only_from_prefix_sections=*/false,
+ /*only_from_prefix_sections=*/false, namespace_checker,
/*hits_out=*/nullptr);
}
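
// A sketch of a filtering checker that CountHits/AppendHits could consult.
// SingleNamespaceChecker and ResolveNamespaceId are illustrative names, not
// part of this patch:
//
//   class SingleNamespaceChecker : public NamespaceChecker {
//    public:
//     explicit SingleNamespaceChecker(NamespaceId target) : target_(target) {}
//     bool BelongsToTargetNamespaces(DocumentId document_id) const override {
//       // A real checker would also treat deleted or expired documents as
//       // non-matching, which is what the suggestion tests rely on.
//       return ResolveNamespaceId(document_id) == target_;
//     }
//    private:
//     NamespaceId target_;
//   };
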
@@ -379,15 +391,16 @@ bool LiteIndex::is_full() const {
lexicon_.min_free_fraction() < (1.0 - kTrieFullFraction));
}
-void LiteIndex::GetDebugInfo(int verbosity, std::string* out) const {
- absl_ports::StrAppend(
- out, IcingStringUtil::StringPrintf("Lite Index\nHit buffer %u/%u\n",
- header_->cur_size(),
- options_.hit_buffer_size));
-
- // Lexicon.
- out->append("Lexicon stats:\n");
- lexicon_.GetDebugInfo(verbosity, out);
+IndexDebugInfoProto::LiteIndexDebugInfoProto LiteIndex::GetDebugInfo(
+ int verbosity) {
+ IndexDebugInfoProto::LiteIndexDebugInfoProto res;
+ res.set_curr_size(header_->cur_size());
+ res.set_hit_buffer_size(options_.hit_buffer_size);
+ res.set_last_added_document_id(header_->last_added_docid());
+ res.set_searchable_end(header_->searchable_end());
+ res.set_index_crc(ComputeChecksum().Get());
+ lexicon_.GetDebugInfo(verbosity, res.mutable_lexicon_info());
+ return res;
}
libtextclassifier3::StatusOr<int64_t> LiteIndex::GetElementsSize() const {
@@ -408,12 +421,8 @@ IndexStorageInfoProto LiteIndex::GetStorageInfo(
IndexStorageInfoProto storage_info) const {
int64_t header_and_hit_buffer_file_size =
filesystem_->GetFileSize(hit_buffer_fd_.get());
- if (header_and_hit_buffer_file_size != Filesystem::kBadFileSize) {
- storage_info.set_lite_index_hit_buffer_size(
- header_and_hit_buffer_file_size);
- } else {
- storage_info.set_lite_index_hit_buffer_size(-1);
- }
+ storage_info.set_lite_index_hit_buffer_size(
+ IcingFilesystem::SanitizeFileSize(header_and_hit_buffer_file_size));
int64_t lexicon_disk_usage = lexicon_.GetElementsSize();
if (lexicon_disk_usage != Filesystem::kBadFileSize) {
storage_info.set_lite_index_lexicon_size(lexicon_disk_usage);
diff --git a/icing/index/lite/lite-index.h b/icing/index/lite/lite-index.h
index b134aba..378fc94 100644
--- a/icing/index/lite/lite-index.h
+++ b/icing/index/lite/lite-index.h
@@ -37,10 +37,12 @@
#include "icing/legacy/index/icing-lite-index-header.h"
#include "icing/legacy/index/icing-lite-index-options.h"
#include "icing/legacy/index/icing-mmapper.h"
+#include "icing/proto/debug.pb.h"
#include "icing/proto/storage.pb.h"
#include "icing/proto/term.pb.h"
#include "icing/schema/section.h"
#include "icing/store/document-id.h"
+#include "icing/store/namespace-checker.h"
#include "icing/store/namespace-id.h"
#include "icing/util/bit-util.h"
#include "icing/util/crc32.h"
@@ -140,13 +142,19 @@ class LiteIndex {
// skipping hits in non-prefix sections if only_from_prefix_sections is true,
// to hits_out. If hits_out is nullptr, no hits will be added.
//
+  // Only hits that belong to the given namespaces will be counted and
+  // appended. A nullptr namespace checker disables this check.
+ //
// Returns the number of hits that would be added to hits_out.
int AppendHits(uint32_t term_id, SectionIdMask section_id_mask,
bool only_from_prefix_sections,
+ const NamespaceChecker* namespace_checker,
std::vector<DocHitInfo>* hits_out);
// Returns the hit count of the term.
- int CountHits(uint32_t term_id);
+  // Only hits that belong to the given namespaces will be counted.
+ libtextclassifier3::StatusOr<int> CountHits(
+ uint32_t term_id, const NamespaceChecker* namespace_checker);
// Check if buffer has reached its capacity.
bool is_full() const;
@@ -234,7 +242,7 @@ class LiteIndex {
// Returns debug information for the index.
// verbosity <= 0, simplest debug information - size of lexicon, hit buffer
// verbosity > 0, more detailed debug information from the lexicon.
- void GetDebugInfo(int verbosity, std::string* out) const;
+ IndexDebugInfoProto::LiteIndexDebugInfoProto GetDebugInfo(int verbosity);
// Returns the byte size of all the elements held in the index. This excludes
// the size of any internal metadata of the index, e.g. the index's header.
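Editorial sketch (not part of this change): a minimal example of the namespace-scoped hit counting added above. The AllowListChecker class and the lite_index/term_id variables are hypothetical; only the NamespaceChecker interface and the AppendHits/CountHits signatures come from this change.

#include <unordered_set>
#include <utility>

class AllowListChecker : public NamespaceChecker {
 public:
  explicit AllowListChecker(std::unordered_set<DocumentId> allowed)
      : allowed_(std::move(allowed)) {}
  bool BelongsToTargetNamespaces(DocumentId document_id) const override {
    return allowed_.count(document_id) > 0;
  }

 private:
  std::unordered_set<DocumentId> allowed_;
};

// Count hits for term_id, considering only documents 0 and 1.
AllowListChecker checker({0, 1});
ICING_ASSIGN_OR_RETURN(int count, lite_index->CountHits(term_id, &checker));
// Passing nullptr as the checker disables namespace filtering entirely.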
diff --git a/icing/index/lite/lite-index_test.cc b/icing/index/lite/lite-index_test.cc
new file mode 100644
index 0000000..825f830
--- /dev/null
+++ b/icing/index/lite/lite-index_test.cc
@@ -0,0 +1,110 @@
+// Copyright (C) 2019 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "icing/index/lite/lite-index.h"
+
+#include <vector>
+
+#include "gmock/gmock.h"
+#include "gtest/gtest.h"
+#include "icing/index/term-id-codec.h"
+#include "icing/legacy/index/icing-mock-filesystem.h"
+#include "icing/schema/section.h"
+#include "icing/store/namespace-checker.h"
+#include "icing/testing/common-matchers.h"
+#include "icing/testing/tmp-directory.h"
+
+namespace icing {
+namespace lib {
+
+namespace {
+
+using ::testing::Eq;
+using ::testing::IsEmpty;
+using ::testing::SizeIs;
+
+class AlwaysFalseNamespaceCheckerImpl : public NamespaceChecker {
+ public:
+ bool BelongsToTargetNamespaces(DocumentId document_id) const override {
+ return false;
+ }
+};
+
+class LiteIndexTest : public testing::Test {
+ protected:
+ void SetUp() override {
+ index_dir_ = GetTestTempDir() + "/test_dir";
+ ASSERT_TRUE(filesystem_.CreateDirectoryRecursively(index_dir_.c_str()));
+
+ std::string lite_index_file_name = index_dir_ + "/test_file.lite-idx.index";
+ LiteIndex::Options options(lite_index_file_name,
+ /*hit_buffer_want_merge_bytes=*/1024 * 1024);
+ ICING_ASSERT_OK_AND_ASSIGN(lite_index_,
+ LiteIndex::Create(options, &icing_filesystem_));
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ term_id_codec_,
+ TermIdCodec::Create(
+ IcingDynamicTrie::max_value_index(IcingDynamicTrie::Options()),
+ IcingDynamicTrie::max_value_index(options.lexicon_options)));
+ }
+
+ void TearDown() override {
+ ASSERT_TRUE(filesystem_.DeleteDirectoryRecursively(index_dir_.c_str()));
+ }
+
+ std::string index_dir_;
+ Filesystem filesystem_;
+ IcingFilesystem icing_filesystem_;
+ std::unique_ptr<LiteIndex> lite_index_;
+ std::unique_ptr<TermIdCodec> term_id_codec_;
+};
+
+constexpr NamespaceId kNamespace0 = 0;
+
+TEST_F(LiteIndexTest, LiteIndexAppendHits) {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ uint32_t tvi,
+ lite_index_->InsertTerm("foo", TermMatchType::PREFIX, kNamespace0));
+ ICING_ASSERT_OK_AND_ASSIGN(uint32_t foo_term_id,
+ term_id_codec_->EncodeTvi(tvi, TviType::LITE));
+ Hit doc_hit0(/*section_id=*/0, /*document_id=*/0, Hit::kDefaultTermFrequency,
+ /*is_in_prefix_section=*/false);
+ Hit doc_hit1(/*section_id=*/1, /*document_id=*/0, Hit::kDefaultTermFrequency,
+ /*is_in_prefix_section=*/false);
+ ICING_ASSERT_OK(lite_index_->AddHit(foo_term_id, doc_hit0));
+ ICING_ASSERT_OK(lite_index_->AddHit(foo_term_id, doc_hit1));
+
+ std::vector<DocHitInfo> hits1;
+ lite_index_->AppendHits(foo_term_id, kSectionIdMaskAll,
+ /*only_from_prefix_sections=*/false,
+ /*namespace_checker=*/nullptr, &hits1);
+ EXPECT_THAT(hits1, SizeIs(1));
+ EXPECT_THAT(hits1.back().document_id(), Eq(0));
+ // Check that the hits are coming from section 0 and section 1.
+ EXPECT_THAT(hits1.back().hit_section_ids_mask(), Eq(0b11));
+
+ std::vector<DocHitInfo> hits2;
+ AlwaysFalseNamespaceCheckerImpl always_false_namespace_checker;
+ lite_index_->AppendHits(foo_term_id, kSectionIdMaskAll,
+ /*only_from_prefix_sections=*/false,
+ &always_false_namespace_checker, &hits2);
+ // Check that no hits are returned because they get skipped by the namespace
+ // checker.
+ EXPECT_THAT(hits2, IsEmpty());
+}
+
+} // namespace
+} // namespace lib
+} // namespace icing
diff --git a/icing/index/main/flash-index-storage.h b/icing/index/main/flash-index-storage.h
index 8d5b50b..6c6fbb8 100644
--- a/icing/index/main/flash-index-storage.h
+++ b/icing/index/main/flash-index-storage.h
@@ -159,6 +159,7 @@ class FlashIndexStorage {
libtextclassifier3::Status Reset();
+ // TODO(b/222349894) Convert the string output to a protocol buffer instead.
void GetDebugInfo(int verbosity, std::string* out) const;
private:
diff --git a/icing/index/main/main-index.cc b/icing/index/main/main-index.cc
index b185138..2d6007b 100644
--- a/icing/index/main/main-index.cc
+++ b/icing/index/main/main-index.cc
@@ -133,18 +133,10 @@ libtextclassifier3::StatusOr<int64_t> MainIndex::GetElementsSize() const {
IndexStorageInfoProto MainIndex::GetStorageInfo(
IndexStorageInfoProto storage_info) const {
- int64_t lexicon_elt_size = main_lexicon_->GetElementsSize();
- if (lexicon_elt_size != IcingFilesystem::kBadFileSize) {
- storage_info.set_main_index_lexicon_size(lexicon_elt_size);
- } else {
- storage_info.set_main_index_lexicon_size(-1);
- }
- int64_t index_elt_size = flash_index_storage_->GetElementsSize();
- if (lexicon_elt_size != IcingFilesystem::kBadFileSize) {
- storage_info.set_main_index_storage_size(index_elt_size);
- } else {
- storage_info.set_main_index_storage_size(-1);
- }
+ storage_info.set_main_index_lexicon_size(
+ IcingFilesystem::SanitizeFileSize(main_lexicon_->GetElementsSize()));
+ storage_info.set_main_index_storage_size(
+ Filesystem::SanitizeFileSize(flash_index_storage_->GetElementsSize()));
storage_info.set_main_index_block_size(flash_index_storage_->block_size());
storage_info.set_num_blocks(flash_index_storage_->num_blocks());
storage_info.set_min_free_fraction(flash_index_storage_->min_free_fraction());
@@ -186,7 +178,7 @@ MainIndex::GetAccessorForPrefixTerm(const std::string& prefix) {
if (!exact && !hits_in_prefix_section.HasProperty(main_itr.GetValueIndex())) {
// Found it, but it doesn't have prefix hits. Exit early. No need to
// retrieve the posting list because there's nothing there for us.
- return libtextclassifier3::Status::OK;
+ return absl_ports::NotFoundError("The term doesn't have any prefix hits.");
}
PostingListIdentifier posting_list_id = PostingListIdentifier::kInvalid;
memcpy(&posting_list_id, main_itr.GetValue(), sizeof(posting_list_id));
@@ -217,35 +209,45 @@ bool IsTermInNamespaces(
libtextclassifier3::StatusOr<std::vector<TermMetadata>>
MainIndex::FindTermsByPrefix(const std::string& prefix,
- const std::vector<NamespaceId>& namespace_ids) {
+ TermMatchType::Code term_match_type,
+ const NamespaceChecker* namespace_checker) {
// Finds all the terms that start with the given prefix in the lexicon.
IcingDynamicTrie::Iterator term_iterator(*main_lexicon_, prefix.c_str());
- // A property reader to help check if a term has some property.
- IcingDynamicTrie::PropertyReadersAll property_reader(*main_lexicon_);
-
std::vector<TermMetadata> term_metadata_list;
while (term_iterator.IsValid()) {
- uint32_t term_value_index = term_iterator.GetValueIndex();
+ int count = 0;
+ DocumentId last_document_id = kInvalidDocumentId;
- // Skips the terms that don't exist in the given namespaces. We won't skip
- // any terms if namespace_ids is empty.
- if (!IsTermInNamespaces(property_reader, term_value_index, namespace_ids)) {
- term_iterator.Advance();
- continue;
- }
PostingListIdentifier posting_list_id = PostingListIdentifier::kInvalid;
memcpy(&posting_list_id, term_iterator.GetValue(), sizeof(posting_list_id));
- // Getting the actual hit count would require reading the entire posting
- // list chain. We take an approximation to avoid all of those IO ops.
- // Because we are not reading the posting lists, it is impossible to
- // differentiate between single max-size posting lists and chains of
- // max-size posting lists. We assume that the impact on scoring is not
- // significant.
- int approx_hit_count = IndexBlock::ApproximateFullPostingListHitsForBlock(
- flash_index_storage_->block_size(),
- posting_list_id.posting_list_index_bits());
- term_metadata_list.emplace_back(term_iterator.GetKey(), approx_hit_count);
+ ICING_ASSIGN_OR_RETURN(PostingListAccessor pl_accessor,
+ PostingListAccessor::CreateFromExisting(
+ flash_index_storage_.get(), posting_list_id));
+ ICING_ASSIGN_OR_RETURN(std::vector<Hit> hits,
+ pl_accessor.GetNextHitsBatch());
+ for (const Hit& hit : hits) {
+ DocumentId document_id = hit.document_id();
+ if (document_id != last_document_id) {
+ last_document_id = document_id;
+ if (term_match_type == TermMatchType::EXACT_ONLY &&
+ hit.is_prefix_hit()) {
+ continue;
+ }
+ if (!namespace_checker->BelongsToTargetNamespaces(document_id)) {
+ // The document is removed, expired, or does not belong to the target
+ // namespaces.
+ continue;
+ }
+ // TODO(b/152934343) Add a search type to SuggestionSpec so the user can
+ // specify prefix or exact search, and apply a different scoring strategy
+ // based on that.
+ ++count;
+ }
+ }
+ if (count > 0) {
+ term_metadata_list.push_back(TermMetadata(term_iterator.GetKey(), count));
+ }
term_iterator.Advance();
}
@@ -605,16 +607,22 @@ libtextclassifier3::Status MainIndex::AddPrefixBackfillHits(
return libtextclassifier3::Status::OK;
}
-void MainIndex::GetDebugInfo(int verbosity, std::string* out) const {
+IndexDebugInfoProto::MainIndexDebugInfoProto MainIndex::GetDebugInfo(
+ int verbosity) const {
+ IndexDebugInfoProto::MainIndexDebugInfoProto res;
+
// Lexicon.
- out->append("Main Lexicon stats:\n");
- main_lexicon_->GetDebugInfo(verbosity, out);
+ main_lexicon_->GetDebugInfo(verbosity, res.mutable_lexicon_info());
+
+ res.set_last_added_document_id(last_added_document_id());
if (verbosity <= 0) {
- return;
+ return res;
}
- flash_index_storage_->GetDebugInfo(verbosity, out);
+ flash_index_storage_->GetDebugInfo(verbosity,
+ res.mutable_flash_index_storage_info());
+ return res;
}
} // namespace lib
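Editorial sketch (not part of this change): how a caller might use the reworked FindTermsByPrefix above. AlwaysTrueNamespaceCheckerImpl is the test helper referenced later in this change; main_index is assumed to be an initialized MainIndex.

AlwaysTrueNamespaceCheckerImpl checker;  // accepts every document
ICING_ASSIGN_OR_RETURN(
    std::vector<TermMetadata> terms,
    main_index->FindTermsByPrefix("fo", TermMatchType::PREFIX, &checker));
// Each returned TermMetadata now carries an exact per-document hit count
// rather than the old posting-list-size approximation.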
diff --git a/icing/index/main/main-index.h b/icing/index/main/main-index.h
index 919a5c5..abb0418 100644
--- a/icing/index/main/main-index.h
+++ b/icing/index/main/main-index.h
@@ -27,7 +27,9 @@
#include "icing/index/term-metadata.h"
#include "icing/legacy/index/icing-dynamic-trie.h"
#include "icing/legacy/index/icing-filesystem.h"
+#include "icing/proto/debug.pb.h"
#include "icing/proto/storage.pb.h"
+#include "icing/store/namespace-checker.h"
#include "icing/store/namespace-id.h"
#include "icing/util/status-macros.h"
@@ -71,17 +73,17 @@ class MainIndex {
// Finds terms with the given prefix in the namespaces allowed by
// 'namespace_checker'. The
// input prefix must be normalized, otherwise inaccurate results may be
- // returned. Results are not sorted specifically and are in lexigraphical
- // order. Number of results are no more than 'num_to_return'.
- //
- // The hit count returned with each TermMetadata is an approximation based of
- // posting list size.
+ // returned. If term_match_type is EXACT_ONLY, only exact hits will be
+ // counted; if it is PREFIX, both prefix and exact hits will be counted.
+ // Results are not sorted specifically and are in lexicographical order. The
+ // number of results is no more than 'num_to_return'.
//
// Returns:
// A list of TermMetadata on success
// INTERNAL_ERROR if failed to access term data.
libtextclassifier3::StatusOr<std::vector<TermMetadata>> FindTermsByPrefix(
- const std::string& prefix, const std::vector<NamespaceId>& namespace_ids);
+ const std::string& prefix, TermMatchType::Code term_match_type,
+ const NamespaceChecker* namespace_checker);
struct LexiconMergeOutputs {
// Maps from main_lexicon tvi for new branching point to the main_lexicon
@@ -184,7 +186,8 @@ class MainIndex {
// verbosity <= 0, simplest debug information - just the lexicon
// verbosity > 0, more detailed debug information including raw postings
// lists.
- void GetDebugInfo(int verbosity, std::string* out) const;
+ IndexDebugInfoProto::MainIndexDebugInfoProto GetDebugInfo(
+ int verbosity) const;
private:
libtextclassifier3::Status Init(const std::string& index_directory,
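Editorial sketch (not part of this change): reading the new structured debug info. Field names follow the setters in main-index.cc above; main_index is assumed to exist.

IndexDebugInfoProto::MainIndexDebugInfoProto info =
    main_index->GetDebugInfo(/*verbosity=*/1);
DocumentId last_doc = info.last_added_document_id();
// lexicon_info and flash_index_storage_info remain free-form strings for now
// (see the TODO(b/222349894) notes about converting them to protos).
std::string lexicon_stats = info.lexicon_info();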
diff --git a/icing/index/main/main-index_test.cc b/icing/index/main/main-index_test.cc
index 74139be..fa83d68 100644
--- a/icing/index/main/main-index_test.cc
+++ b/icing/index/main/main-index_test.cc
@@ -162,6 +162,34 @@ TEST_F(MainIndexTest, MainIndexGetAccessorForPrefixReturnsValidAccessor) {
EXPECT_THAT(main_index->GetAccessorForPrefixTerm("foo"), IsOk());
}
+TEST_F(MainIndexTest, MainIndexGetAccessorForPrefixReturnsNotFound) {
+ // 1. Index one doc in the Lite Index:
+ // - Doc0 {"foot" is_in_prefix_section=false}
+ ICING_ASSERT_OK_AND_ASSIGN(
+ uint32_t tvi,
+ lite_index_->InsertTerm("foot", TermMatchType::EXACT_ONLY, kNamespace0));
+ ICING_ASSERT_OK_AND_ASSIGN(uint32_t foot_term_id,
+ term_id_codec_->EncodeTvi(tvi, TviType::LITE));
+
+ Hit doc0_hit(/*section_id=*/0, /*document_id=*/0, Hit::kDefaultTermFrequency,
+ /*is_in_prefix_section=*/false);
+ ICING_ASSERT_OK(lite_index_->AddHit(foot_term_id, doc0_hit));
+
+ // 2. Create the main index. It should have no entries in its lexicon.
+ std::string main_index_file_name = index_dir_ + "/test_file.idx.index";
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<MainIndex> main_index,
+ MainIndex::Create(main_index_file_name, &filesystem_,
+ &icing_filesystem_));
+
+ // 3. Merge the index. The main index should return NOT_FOUND when we search
+ // for the prefix "foo".
+ ICING_ASSERT_OK(Merge(*lite_index_, *term_id_codec_, main_index.get()));
+ // GetAccessorForPrefixTerm should return NOT_FOUND for "foo" because "foot"
+ // was never added to a prefix section.
+ EXPECT_THAT(main_index->GetAccessorForPrefixTerm("foo"),
+ StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
+}
+
TEST_F(MainIndexTest, MainIndexGetAccessorForExactTermNotFound) {
// Create the main index. It should have no entries in its lexicon.
std::string main_index_file_name = index_dir_ + "/test_file.idx.index";
diff --git a/icing/legacy/index/icing-dynamic-trie.cc b/icing/legacy/index/icing-dynamic-trie.cc
index baa043a..77876c4 100644
--- a/icing/legacy/index/icing-dynamic-trie.cc
+++ b/icing/legacy/index/icing-dynamic-trie.cc
@@ -70,6 +70,7 @@
#include <algorithm>
#include <cerrno>
#include <cinttypes>
+#include <cstdint>
#include <cstring>
#include <memory>
#include <utility>
@@ -397,6 +398,8 @@ class IcingDynamicTrie::IcingDynamicTrieStorage {
// storage.
IcingScopedFd array_fds_[NUM_ARRAY_TYPES];
std::vector<IcingArrayStorage> array_storage_;
+
+ // Legacy filesystem. Switch to the new Filesystem class instead.
const IcingFilesystem *filesystem_;
};
@@ -1364,10 +1367,12 @@ uint32_t IcingDynamicTrie::size() const {
return storage_->hdr().num_keys();
}
-void IcingDynamicTrie::CollectStatsRecursive(const Node &node,
- Stats *stats) const {
+void IcingDynamicTrie::CollectStatsRecursive(const Node &node, Stats *stats,
+ uint32_t depth) const {
if (node.is_leaf()) {
stats->num_leaves++;
+ stats->sum_depth += depth;
+ stats->max_depth = max(stats->max_depth, depth);
const char *suffix = storage_->GetSuffix(node.next_index());
stats->suffixes_used += strlen(suffix) + 1 + value_size();
if (!suffix[0]) {
@@ -1379,13 +1384,16 @@ void IcingDynamicTrie::CollectStatsRecursive(const Node &node,
for (; i < (1U << node.log2_num_children()); i++) {
const Next &next = *storage_->GetNext(node.next_index(), i);
if (next.node_index() == kInvalidNodeIndex) break;
- CollectStatsRecursive(*storage_->GetNode(next.node_index()), stats);
+ CollectStatsRecursive(*storage_->GetNode(next.node_index()), stats,
+ depth + 1);
}
// At least one valid node in each next array
if (i == 0) {
ICING_LOG(FATAL) << "No valid node in 'next' array";
}
+ stats->sum_children += i;
+ stats->max_children = max(stats->max_children, i);
stats->child_counts[i - 1]++;
stats->wasted[node.log2_num_children()] +=
@@ -1467,9 +1475,12 @@ std::string IcingDynamicTrie::Stats::DumpStats(int verbosity) const {
"Wasted total: %u\n"
"Num intermediates %u num leaves %u "
"suffixes used %u null %u\n"
+ "avg and max children for intermediates: %.3f, %u\n"
+ "avg and max depth for leaves: %.3f, %u\n"
"Total next frag: %.3f%%\n",
total_wasted, num_intermediates, num_leaves, suffixes_used,
- null_suffixes,
+ null_suffixes, 1. * sum_children / num_intermediates, max_children,
+ 1. * sum_depth / num_leaves, max_depth,
100. * math_util::SafeDivide((total_free + total_wasted), num_nexts));
}
IcingStringUtil::SStringAppendF(
diff --git a/icing/legacy/index/icing-dynamic-trie.h b/icing/legacy/index/icing-dynamic-trie.h
index 8821799..013b926 100644
--- a/icing/legacy/index/icing-dynamic-trie.h
+++ b/icing/legacy/index/icing-dynamic-trie.h
@@ -152,8 +152,13 @@ class IcingDynamicTrie : public IIcingStorage {
uint32_t max_nodes;
// Count of intermediate nodes.
uint32_t num_intermediates;
+ // Total and maximum number of children of intermediate nodes.
+ uint32_t sum_children, max_children;
+
// Count of leaf nodes.
uint32_t num_leaves;
+ // Total and maximum depth of leaf nodes.
+ uint32_t sum_depth, max_depth;
// Next stats
@@ -186,6 +191,7 @@ class IcingDynamicTrie : public IIcingStorage {
uint32_t dirty_pages_nexts;
uint32_t dirty_pages_suffixes;
+ // TODO(b/222349894) Convert the string output to a protocol buffer instead.
std::string DumpStats(int verbosity) const;
};
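Editorial note: the new sum/max counters make the averages printed by DumpStats simple ratios. A sketch, assuming a populated Stats instance named stats with non-zero num_intermediates and num_leaves:

double avg_children =
    static_cast<double>(stats.sum_children) / stats.num_intermediates;
double avg_depth = static_cast<double>(stats.sum_depth) / stats.num_leaves;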
@@ -601,7 +607,8 @@ class IcingDynamicTrie : public IIcingStorage {
static const uint32_t kInvalidSuffixIndex;
// Stats helpers.
- void CollectStatsRecursive(const Node &node, Stats *stats) const;
+ void CollectStatsRecursive(const Node &node, Stats *stats,
+ uint32_t depth = 0) const;
// Helpers for Find and Insert.
const Next *GetNextByChar(const Node *node, uint8_t key_char) const;
diff --git a/icing/legacy/index/icing-filesystem.h b/icing/legacy/index/icing-filesystem.h
index f645632..ce75a82 100644
--- a/icing/legacy/index/icing-filesystem.h
+++ b/icing/legacy/index/icing-filesystem.h
@@ -224,6 +224,11 @@ class IcingFilesystem {
// Increments to_increment by size if size is valid, or sets to_increment
// to kBadFileSize if either size or to_increment is kBadFileSize.
static void IncrementByOrSetInvalid(uint64_t size, uint64_t *to_increment);
+
+ // Returns -1 if file_size is invalid. Otherwise, returns file_size.
+ static int64_t SanitizeFileSize(int64_t file_size) {
+ return (file_size != kBadFileSize) ? file_size : -1;
+ }
};
} // namespace lib
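Editorial sketch: SanitizeFileSize collapses the repeated branch at each storage-info call site, as in the main-index.cc and schema-store.cc hunks in this change:

// Before:
int64_t size = lexicon.GetElementsSize();
if (size != IcingFilesystem::kBadFileSize) {
  storage_info.set_main_index_lexicon_size(size);
} else {
  storage_info.set_main_index_lexicon_size(-1);
}

// After:
storage_info.set_main_index_lexicon_size(
    IcingFilesystem::SanitizeFileSize(lexicon.GetElementsSize()));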
diff --git a/icing/legacy/index/icing-flash-bitmap.h b/icing/legacy/index/icing-flash-bitmap.h
index e3ba0e2..6bb9591 100644
--- a/icing/legacy/index/icing-flash-bitmap.h
+++ b/icing/legacy/index/icing-flash-bitmap.h
@@ -138,6 +138,7 @@ class IcingFlashBitmap {
// Upgrade for version 18.
bool UpgradeTo18();
+ // Legacy filesystem. Switch to the new Filesystem class instead.
const IcingFilesystem *const filesystem_;
std::string filename_;
OpenType open_type_;
diff --git a/icing/query/query-processor_benchmark.cc b/icing/query/query-processor_benchmark.cc
index bdd40aa..e48fe78 100644
--- a/icing/query/query-processor_benchmark.cc
+++ b/icing/query/query-processor_benchmark.cc
@@ -16,7 +16,6 @@
#include "gmock/gmock.h"
#include "third_party/absl/flags/flag.h"
#include "icing/document-builder.h"
-#include "icing/helpers/icu/icu-data-file-helper.h"
#include "icing/index/index.h"
#include "icing/proto/term.pb.h"
#include "icing/query/query-processor.h"
@@ -24,6 +23,7 @@
#include "icing/schema/section.h"
#include "icing/store/document-id.h"
#include "icing/testing/common-matchers.h"
+#include "icing/testing/icu-data-file-helper.h"
#include "icing/testing/test-data.h"
#include "icing/testing/tmp-directory.h"
#include "icing/tokenization/language-segmenter-factory.h"
diff --git a/icing/query/query-processor_test.cc b/icing/query/query-processor_test.cc
index daeb479..950f739 100644
--- a/icing/query/query-processor_test.cc
+++ b/icing/query/query-processor_test.cc
@@ -23,7 +23,6 @@
#include "gtest/gtest.h"
#include "icing/document-builder.h"
#include "icing/file/filesystem.h"
-#include "icing/helpers/icu/icu-data-file-helper.h"
#include "icing/index/hit/doc-hit-info.h"
#include "icing/index/index.h"
#include "icing/index/iterator/doc-hit-info-iterator-test-util.h"
@@ -40,6 +39,7 @@
#include "icing/store/document-store.h"
#include "icing/testing/common-matchers.h"
#include "icing/testing/fake-clock.h"
+#include "icing/testing/icu-data-file-helper.h"
#include "icing/testing/jni-test-helpers.h"
#include "icing/testing/test-data.h"
#include "icing/testing/tmp-directory.h"
@@ -61,16 +61,16 @@ using ::testing::SizeIs;
using ::testing::Test;
using ::testing::UnorderedElementsAre;
-constexpr PropertyConfigProto_DataType_Code TYPE_STRING =
- PropertyConfigProto_DataType_Code_STRING;
+constexpr PropertyConfigProto::DataType::Code TYPE_STRING =
+ PropertyConfigProto::DataType::STRING;
-constexpr PropertyConfigProto_Cardinality_Code CARDINALITY_OPTIONAL =
- PropertyConfigProto_Cardinality_Code_OPTIONAL;
+constexpr PropertyConfigProto::Cardinality::Code CARDINALITY_OPTIONAL =
+ PropertyConfigProto::Cardinality::OPTIONAL;
-constexpr StringIndexingConfig_TokenizerType_Code TOKENIZER_PLAIN =
- StringIndexingConfig_TokenizerType_Code_PLAIN;
+constexpr StringIndexingConfig::TokenizerType::Code TOKENIZER_PLAIN =
+ StringIndexingConfig::TokenizerType::PLAIN;
-constexpr TermMatchType_Code MATCH_EXACT = TermMatchType_Code_EXACT_ONLY;
+constexpr TermMatchType::Code MATCH_EXACT = TermMatchType::EXACT_ONLY;
class QueryProcessorTest : public Test {
protected:
diff --git a/icing/query/suggestion-processor.cc b/icing/query/suggestion-processor.cc
index 9c60810..cfa53f6 100644
--- a/icing/query/suggestion-processor.cc
+++ b/icing/query/suggestion-processor.cc
@@ -35,7 +35,7 @@ SuggestionProcessor::Create(Index* index,
libtextclassifier3::StatusOr<std::vector<TermMetadata>>
SuggestionProcessor::QuerySuggestions(
const icing::lib::SuggestionSpecProto& suggestion_spec,
- const std::vector<NamespaceId>& namespace_ids) {
+ const NamespaceChecker* namespace_checker) {
// We use the query tokenizer to tokenize the given prefix, and we only use
// the last token as the suggestion prefix.
ICING_ASSIGN_OR_RETURN(
@@ -73,8 +73,11 @@ SuggestionProcessor::QuerySuggestions(
// lowercase.
ICING_ASSIGN_OR_RETURN(
std::vector<TermMetadata> terms,
- index_.FindTermsByPrefix(normalizer_.NormalizeTerm(last_token),
- namespace_ids, suggestion_spec.num_to_return()));
+ index_.FindTermsByPrefix(
+ normalizer_.NormalizeTerm(last_token),
+ suggestion_spec.num_to_return(),
+ suggestion_spec.scoring_spec().scoring_match_type(),
+ namespace_checker));
for (TermMetadata& term : terms) {
term.content = query_prefix + term.content;
@@ -90,4 +93,4 @@ SuggestionProcessor::SuggestionProcessor(
normalizer_(*normalizer) {}
} // namespace lib
-} // namespace icing \ No newline at end of file
+} // namespace icing
diff --git a/icing/query/suggestion-processor.h b/icing/query/suggestion-processor.h
index b10dc84..088863e 100644
--- a/icing/query/suggestion-processor.h
+++ b/icing/query/suggestion-processor.h
@@ -48,7 +48,7 @@ class SuggestionProcessor {
// INTERNAL_ERROR on all other errors
libtextclassifier3::StatusOr<std::vector<TermMetadata>> QuerySuggestions(
const SuggestionSpecProto& suggestion_spec,
- const std::vector<NamespaceId>& namespace_ids);
+ const NamespaceChecker* namespace_checker);
private:
explicit SuggestionProcessor(Index* index,
diff --git a/icing/query/suggestion-processor_test.cc b/icing/query/suggestion-processor_test.cc
index 5e62277..ba4c90a 100644
--- a/icing/query/suggestion-processor_test.cc
+++ b/icing/query/suggestion-processor_test.cc
@@ -15,10 +15,11 @@
#include "icing/query/suggestion-processor.h"
#include "gmock/gmock.h"
-#include "icing/helpers/icu/icu-data-file-helper.h"
#include "icing/store/document-store.h"
+#include "icing/testing/always-true-namespace-checker-impl.h"
#include "icing/testing/common-matchers.h"
#include "icing/testing/fake-clock.h"
+#include "icing/testing/icu-data-file-helper.h"
#include "icing/testing/jni-test-helpers.h"
#include "icing/testing/test-data.h"
#include "icing/testing/tmp-directory.h"
@@ -80,7 +81,6 @@ class SuggestionProcessorTest : public Test {
DocumentStore::CreateResult create_result,
DocumentStore::Create(&filesystem_, store_dir_, &fake_clock_,
schema_store_.get()));
- document_store_ = std::move(create_result.document_store);
}
libtextclassifier3::Status AddTokenToIndex(
@@ -93,7 +93,6 @@ class SuggestionProcessorTest : public Test {
}
void TearDown() override {
- document_store_.reset();
filesystem_.DeleteDirectoryRecursively(test_dir_.c_str());
}
@@ -103,7 +102,6 @@ class SuggestionProcessorTest : public Test {
std::unique_ptr<Index> index_;
std::unique_ptr<LanguageSegmenter> language_segmenter_;
std::unique_ptr<Normalizer> normalizer_;
- std::unique_ptr<DocumentStore> document_store_;
std::unique_ptr<SchemaStore> schema_store_;
std::unique_ptr<const JniCache> jni_cache_ = GetTestJniCache();
FakeClock fake_clock_;
@@ -131,9 +129,10 @@ TEST_F(SuggestionProcessorTest, PrependedPrefixTokenTest) {
"prefix token should be prepended to the suggestion f");
suggestion_spec.set_num_to_return(10);
- ICING_ASSERT_OK_AND_ASSIGN(std::vector<TermMetadata> terms,
- suggestion_processor->QuerySuggestions(
- suggestion_spec, /*namespace_ids=*/{}));
+ AlwaysTrueNamespaceCheckerImpl impl;
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::vector<TermMetadata> terms,
+ suggestion_processor->QuerySuggestions(suggestion_spec, &impl));
EXPECT_THAT(terms.at(0).content,
"prefix token should be prepended to the suggestion foo");
}
@@ -152,9 +151,10 @@ TEST_F(SuggestionProcessorTest, NonExistentPrefixTest) {
suggestion_spec.set_prefix("nonExistTerm");
suggestion_spec.set_num_to_return(10);
- ICING_ASSERT_OK_AND_ASSIGN(std::vector<TermMetadata> terms,
- suggestion_processor->QuerySuggestions(
- suggestion_spec, /*namespace_ids=*/{}));
+ AlwaysTrueNamespaceCheckerImpl impl;
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::vector<TermMetadata> terms,
+ suggestion_processor->QuerySuggestions(suggestion_spec, &impl));
EXPECT_THAT(terms, IsEmpty());
}
@@ -173,9 +173,10 @@ TEST_F(SuggestionProcessorTest, PrefixTrailingSpaceTest) {
suggestion_spec.set_prefix("f ");
suggestion_spec.set_num_to_return(10);
- ICING_ASSERT_OK_AND_ASSIGN(std::vector<TermMetadata> terms,
- suggestion_processor->QuerySuggestions(
- suggestion_spec, /*namespace_ids=*/{}));
+ AlwaysTrueNamespaceCheckerImpl impl;
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::vector<TermMetadata> terms,
+ suggestion_processor->QuerySuggestions(suggestion_spec, &impl));
EXPECT_THAT(terms, IsEmpty());
}
@@ -193,28 +194,26 @@ TEST_F(SuggestionProcessorTest, NormalizePrefixTest) {
SuggestionSpecProto suggestion_spec;
suggestion_spec.set_prefix("F");
suggestion_spec.set_num_to_return(10);
+
+ AlwaysTrueNamespaceCheckerImpl impl;
ICING_ASSERT_OK_AND_ASSIGN(
std::vector<TermMetadata> terms,
- suggestion_processor->QuerySuggestions(suggestion_spec,
- /*namespace_ids=*/{}));
+ suggestion_processor->QuerySuggestions(suggestion_spec, &impl));
EXPECT_THAT(terms.at(0).content, "foo");
suggestion_spec.set_prefix("fO");
ICING_ASSERT_OK_AND_ASSIGN(
- terms, suggestion_processor->QuerySuggestions(suggestion_spec,
- /*namespace_ids=*/{}));
+ terms, suggestion_processor->QuerySuggestions(suggestion_spec, &impl));
EXPECT_THAT(terms.at(0).content, "foo");
suggestion_spec.set_prefix("Fo");
ICING_ASSERT_OK_AND_ASSIGN(
- terms, suggestion_processor->QuerySuggestions(suggestion_spec,
- /*namespace_ids=*/{}));
+ terms, suggestion_processor->QuerySuggestions(suggestion_spec, &impl));
EXPECT_THAT(terms.at(0).content, "foo");
suggestion_spec.set_prefix("FO");
ICING_ASSERT_OK_AND_ASSIGN(
- terms, suggestion_processor->QuerySuggestions(suggestion_spec,
- /*namespace_ids=*/{}));
+ terms, suggestion_processor->QuerySuggestions(suggestion_spec, &impl));
EXPECT_THAT(terms.at(0).content, "foo");
}
@@ -235,9 +234,10 @@ TEST_F(SuggestionProcessorTest, OrOperatorPrefixTest) {
suggestion_spec.set_prefix("f OR");
suggestion_spec.set_num_to_return(10);
- ICING_ASSERT_OK_AND_ASSIGN(std::vector<TermMetadata> terms,
- suggestion_processor->QuerySuggestions(
- suggestion_spec, /*namespace_ids=*/{}));
+ AlwaysTrueNamespaceCheckerImpl impl;
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::vector<TermMetadata> terms,
+ suggestion_processor->QuerySuggestions(suggestion_spec, &impl));
// The last operator token will be used to query suggestions.
EXPECT_THAT(terms.at(0).content, "f original");
@@ -256,19 +256,20 @@ TEST_F(SuggestionProcessorTest, ParenthesesOperatorPrefixTest) {
suggestion_spec.set_prefix("{f}");
suggestion_spec.set_num_to_return(10);
- ICING_ASSERT_OK_AND_ASSIGN(std::vector<TermMetadata> terms,
- suggestion_processor->QuerySuggestions(
- suggestion_spec, /*namespace_ids=*/{}));
+ AlwaysTrueNamespaceCheckerImpl impl;
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::vector<TermMetadata> terms,
+ suggestion_processor->QuerySuggestions(suggestion_spec, &impl));
EXPECT_THAT(terms, IsEmpty());
suggestion_spec.set_prefix("[f]");
- ICING_ASSERT_OK_AND_ASSIGN(terms, suggestion_processor->QuerySuggestions(
- suggestion_spec, /*namespace_ids=*/{}));
+ ICING_ASSERT_OK_AND_ASSIGN(
+ terms, suggestion_processor->QuerySuggestions(suggestion_spec, &impl));
EXPECT_THAT(terms, IsEmpty());
suggestion_spec.set_prefix("(f)");
- ICING_ASSERT_OK_AND_ASSIGN(terms, suggestion_processor->QuerySuggestions(
- suggestion_spec, /*namespace_ids=*/{}));
+ ICING_ASSERT_OK_AND_ASSIGN(
+ terms, suggestion_processor->QuerySuggestions(suggestion_spec, &impl));
EXPECT_THAT(terms, IsEmpty());
}
@@ -286,15 +287,15 @@ TEST_F(SuggestionProcessorTest, OtherSpecialPrefixTest) {
suggestion_spec.set_prefix("f:");
suggestion_spec.set_num_to_return(10);
- ICING_ASSERT_OK_AND_ASSIGN(std::vector<TermMetadata> terms,
- suggestion_processor->QuerySuggestions(
- suggestion_spec, /*namespace_ids=*/{}));
+ AlwaysTrueNamespaceCheckerImpl impl;
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::vector<TermMetadata> terms,
+ suggestion_processor->QuerySuggestions(suggestion_spec, &impl));
EXPECT_THAT(terms, IsEmpty());
suggestion_spec.set_prefix("f-");
ICING_ASSERT_OK_AND_ASSIGN(
- terms, suggestion_processor->QuerySuggestions(suggestion_spec,
- /*namespace_ids=*/{}));
+ terms, suggestion_processor->QuerySuggestions(suggestion_spec, &impl));
EXPECT_THAT(terms, IsEmpty());
}
@@ -312,9 +313,10 @@ TEST_F(SuggestionProcessorTest, InvalidPrefixTest) {
suggestion_spec.set_prefix("OR OR - :");
suggestion_spec.set_num_to_return(10);
- ICING_ASSERT_OK_AND_ASSIGN(std::vector<TermMetadata> terms,
- suggestion_processor->QuerySuggestions(
- suggestion_spec, /*namespace_ids=*/{}));
+ AlwaysTrueNamespaceCheckerImpl impl;
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::vector<TermMetadata> terms,
+ suggestion_processor->QuerySuggestions(suggestion_spec, &impl));
EXPECT_THAT(terms, IsEmpty());
}
diff --git a/icing/result/result-retriever_test.cc b/icing/result/result-retriever_test.cc
index 1c9684d..0d812e4 100644
--- a/icing/result/result-retriever_test.cc
+++ b/icing/result/result-retriever_test.cc
@@ -22,7 +22,6 @@
#include "gtest/gtest.h"
#include "icing/document-builder.h"
#include "icing/file/mock-filesystem.h"
-#include "icing/helpers/icu/icu-data-file-helper.h"
#include "icing/portable/equals-proto.h"
#include "icing/portable/platform.h"
#include "icing/proto/document.pb.h"
@@ -36,6 +35,7 @@
#include "icing/store/document-id.h"
#include "icing/testing/common-matchers.h"
#include "icing/testing/fake-clock.h"
+#include "icing/testing/icu-data-file-helper.h"
#include "icing/testing/snippet-helpers.h"
#include "icing/testing/test-data.h"
#include "icing/testing/tmp-directory.h"
@@ -55,14 +55,14 @@ using ::testing::IsEmpty;
using ::testing::Return;
using ::testing::SizeIs;
-constexpr PropertyConfigProto_Cardinality_Code CARDINALITY_OPTIONAL =
- PropertyConfigProto_Cardinality_Code_OPTIONAL;
+constexpr PropertyConfigProto::Cardinality::Code CARDINALITY_OPTIONAL =
+ PropertyConfigProto::Cardinality::OPTIONAL;
-constexpr StringIndexingConfig_TokenizerType_Code TOKENIZER_PLAIN =
- StringIndexingConfig_TokenizerType_Code_PLAIN;
+constexpr StringIndexingConfig::TokenizerType::Code TOKENIZER_PLAIN =
+ StringIndexingConfig::TokenizerType::PLAIN;
-constexpr TermMatchType_Code MATCH_EXACT = TermMatchType_Code_EXACT_ONLY;
-constexpr TermMatchType_Code MATCH_PREFIX = TermMatchType_Code_PREFIX;
+constexpr TermMatchType::Code MATCH_EXACT = TermMatchType::EXACT_ONLY;
+constexpr TermMatchType::Code MATCH_PREFIX = TermMatchType::PREFIX;
class ResultRetrieverTest : public testing::Test {
protected:
@@ -160,7 +160,7 @@ ResultSpecProto::SnippetSpecProto CreateSnippetSpec() {
ResultSpecProto::SnippetSpecProto snippet_spec;
snippet_spec.set_num_to_snippet(std::numeric_limits<int>::max());
snippet_spec.set_num_matches_per_property(std::numeric_limits<int>::max());
- snippet_spec.set_max_window_bytes(1024);
+ snippet_spec.set_max_window_utf32_length(1024);
return snippet_spec;
}
@@ -362,8 +362,8 @@ TEST_F(ResultRetrieverTest, NotIgnoreErrors) {
TEST_F(ResultRetrieverTest, IOErrorShouldReturnInternalError) {
MockFilesystem mock_filesystem;
- ON_CALL(mock_filesystem, OpenForRead(_)).WillByDefault(Return(false));
-
+ ON_CALL(mock_filesystem, PRead(A<int>(), A<void*>(), A<size_t>(), A<off_t>()))
+ .WillByDefault(Return(false));
ICING_ASSERT_OK_AND_ASSIGN(
DocumentStore::CreateResult create_result,
DocumentStore::Create(&mock_filesystem, test_dir_, &fake_clock_,
diff --git a/icing/result/result-state-manager_test.cc b/icing/result/result-state-manager_test.cc
index 32e45aa..8a9005d 100644
--- a/icing/result/result-state-manager_test.cc
+++ b/icing/result/result-state-manager_test.cc
@@ -849,7 +849,7 @@ TEST_F(ResultStateManagerTest, ShouldGetSnippetContext) {
ResultSpecProto result_spec = CreateResultSpec(/*num_per_page=*/1);
result_spec.mutable_snippet_spec()->set_num_to_snippet(5);
result_spec.mutable_snippet_spec()->set_num_matches_per_property(5);
- result_spec.mutable_snippet_spec()->set_max_window_bytes(5);
+ result_spec.mutable_snippet_spec()->set_max_window_utf32_length(5);
SearchSpecProto search_spec;
search_spec.set_term_match_type(TermMatchType::EXACT_ONLY);
@@ -884,7 +884,7 @@ TEST_F(ResultStateManagerTest, ShouldGetDefaultSnippetContext) {
// 0 indicates no snippeting
result_spec.mutable_snippet_spec()->set_num_to_snippet(0);
result_spec.mutable_snippet_spec()->set_num_matches_per_property(0);
- result_spec.mutable_snippet_spec()->set_max_window_bytes(0);
+ result_spec.mutable_snippet_spec()->set_max_window_utf32_length(0);
SearchSpecProto search_spec;
search_spec.set_term_match_type(TermMatchType::EXACT_ONLY);
diff --git a/icing/result/result-state_test.cc b/icing/result/result-state_test.cc
index f2121a5..d92fcfa 100644
--- a/icing/result/result-state_test.cc
+++ b/icing/result/result-state_test.cc
@@ -143,7 +143,7 @@ TEST_F(ResultStateTest, ShouldReturnSnippetContextAccordingToSpecs) {
ResultSpecProto result_spec = CreateResultSpec(/*num_per_page=*/2);
result_spec.mutable_snippet_spec()->set_num_to_snippet(5);
result_spec.mutable_snippet_spec()->set_num_matches_per_property(5);
- result_spec.mutable_snippet_spec()->set_max_window_bytes(5);
+ result_spec.mutable_snippet_spec()->set_max_window_utf32_length(5);
SectionRestrictQueryTermsMap query_terms_map;
query_terms_map.emplace("term1", std::unordered_set<std::string>());
@@ -178,7 +178,7 @@ TEST_F(ResultStateTest, NoSnippetingShouldReturnNull) {
// stored.
result_spec.mutable_snippet_spec()->set_num_to_snippet(0);
result_spec.mutable_snippet_spec()->set_num_matches_per_property(5);
- result_spec.mutable_snippet_spec()->set_max_window_bytes(5);
+ result_spec.mutable_snippet_spec()->set_max_window_utf32_length(5);
SectionRestrictQueryTermsMap query_terms_map;
query_terms_map.emplace("term1", std::unordered_set<std::string>());
diff --git a/icing/result/snippet-retriever.cc b/icing/result/snippet-retriever.cc
index c46762e..bd1524e 100644
--- a/icing/result/snippet-retriever.cc
+++ b/icing/result/snippet-retriever.cc
@@ -41,6 +41,7 @@
#include "icing/transform/normalizer.h"
#include "icing/util/character-iterator.h"
#include "icing/util/i18n-utils.h"
+#include "icing/util/logging.h"
#include "icing/util/status-macros.h"
namespace icing {
@@ -75,6 +76,67 @@ inline std::string AddIndexToPath(int values_size, int index,
kRBracket);
}
+// Returns a string of the normalized text of the input Token. Normalization
+// is applied based on the Token's type.
+std::string NormalizeToken(const Normalizer& normalizer, const Token& token) {
+ switch (token.type) {
+ case Token::Type::REGULAR:
+ return normalizer.NormalizeTerm(token.text);
+ case Token::Type::VERBATIM:
+ return std::string(token.text);
+ case Token::Type::QUERY_EXCLUSION:
+ [[fallthrough]];
+ case Token::Type::QUERY_LEFT_PARENTHESES:
+ [[fallthrough]];
+ case Token::Type::QUERY_RIGHT_PARENTHESES:
+ [[fallthrough]];
+ case Token::Type::QUERY_OR:
+ [[fallthrough]];
+ case Token::Type::QUERY_PROPERTY:
+ [[fallthrough]];
+ case Token::Type::INVALID:
+ ICING_LOG(WARNING) << "Unable to normalize token of type: "
+ << static_cast<int>(token.type);
+ return std::string(token.text);
+ }
+}
+
+// Returns a CharacterIterator into the token's text, advanced one past the
+// last character matched by the query term.
+CharacterIterator FindMatchEnd(const Normalizer& normalizer, const Token& token,
+ const std::string& match_query_term) {
+ switch (token.type) {
+ case Token::Type::VERBATIM: {
+ // VERBATIM tokens are not normalized. This means the non-normalized
+ // matched query term must be either equal to or a prefix of the token's
+ // text. Therefore, the match must end at the end of the matched query
+ // term.
+ CharacterIterator verbatim_match_end =
+ CharacterIterator(token.text, 0, 0, 0);
+ verbatim_match_end.AdvanceToUtf8(match_query_term.length());
+ return verbatim_match_end;
+ }
+ case Token::Type::QUERY_EXCLUSION:
+ [[fallthrough]];
+ case Token::Type::QUERY_LEFT_PARENTHESES:
+ [[fallthrough]];
+ case Token::Type::QUERY_RIGHT_PARENTHESES:
+ [[fallthrough]];
+ case Token::Type::QUERY_OR:
+ [[fallthrough]];
+ case Token::Type::QUERY_PROPERTY:
+ [[fallthrough]];
+ case Token::Type::INVALID:
+ ICING_LOG(WARNING)
+ << "Unexpected Token type " << static_cast<int>(token.type)
+ << " found when finding match end of query term and token.";
+ [[fallthrough]];
+ case Token::Type::REGULAR:
+ return normalizer.FindNormalizedMatchEndPosition(token.text,
+ match_query_term);
+ }
+}
+
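// Editorial sketch, not part of this change: for a VERBATIM token the matched
// query term is a raw, un-normalized prefix of the token text, so the match
// end is one character past the query term. Assuming Token's (type, text)
// constructor and a Normalizer instance `normalizer`:
//   Token token(Token::Type::VERBATIM, "Hello, world!");
//   CharacterIterator end = FindMatchEnd(normalizer, token, "Hello");
//   // end.utf8_index() == 5; no normalization was applied.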
class TokenMatcher {
public:
virtual ~TokenMatcher() = default;
@@ -102,15 +164,16 @@ class TokenMatcherExact : public TokenMatcher {
normalizer_(normalizer) {}
CharacterIterator Matches(Token token) const override {
- std::string s = normalizer_.NormalizeTerm(token.text);
+ std::string s = NormalizeToken(normalizer_, token);
auto itr = unrestricted_query_terms_.find(s);
if (itr == unrestricted_query_terms_.end()) {
itr = restricted_query_terms_.find(s);
}
if (itr != unrestricted_query_terms_.end() &&
itr != restricted_query_terms_.end()) {
- return normalizer_.FindNormalizedMatchEndPosition(token.text, *itr);
+ return FindMatchEnd(normalizer_, token, *itr);
}
+
return CharacterIterator(token.text, -1, -1, -1);
}
@@ -131,19 +194,17 @@ class TokenMatcherPrefix : public TokenMatcher {
normalizer_(normalizer) {}
CharacterIterator Matches(Token token) const override {
- std::string s = normalizer_.NormalizeTerm(token.text);
+ std::string s = NormalizeToken(normalizer_, token);
for (const std::string& query_term : unrestricted_query_terms_) {
if (query_term.length() <= s.length() &&
s.compare(0, query_term.length(), query_term) == 0) {
- return normalizer_.FindNormalizedMatchEndPosition(token.text,
- query_term);
+ return FindMatchEnd(normalizer_, token, query_term);
}
}
for (const std::string& query_term : restricted_query_terms_) {
if (query_term.length() <= s.length() &&
s.compare(0, query_term.length(), query_term) == 0) {
- return normalizer_.FindNormalizedMatchEndPosition(token.text,
- query_term);
+ return FindMatchEnd(normalizer_, token, query_term);
}
}
return CharacterIterator(token.text, -1, -1, -1);
@@ -184,7 +245,7 @@ libtextclassifier3::StatusOr<CharacterIterator> DetermineWindowStart(
const ResultSpecProto::SnippetSpecProto& snippet_spec,
std::string_view value, int window_start_min_exclusive_utf32,
Tokenizer::Iterator* iterator) {
- if (!iterator->ResetToTokenAfter(window_start_min_exclusive_utf32)) {
+ if (!iterator->ResetToTokenStartingAfter(window_start_min_exclusive_utf32)) {
return absl_ports::InternalError(
"Couldn't reset tokenizer to determine snippet window!");
}
@@ -219,7 +280,7 @@ libtextclassifier3::StatusOr<CharacterIterator> DetermineWindowEnd(
const ResultSpecProto::SnippetSpecProto& snippet_spec,
std::string_view value, int window_end_max_exclusive_utf32,
Tokenizer::Iterator* iterator) {
- if (!iterator->ResetToTokenBefore(window_end_max_exclusive_utf32)) {
+ if (!iterator->ResetToTokenEndingBefore(window_end_max_exclusive_utf32)) {
return absl_ports::InternalError(
"Couldn't reset tokenizer to determine snippet window!");
}
@@ -283,9 +344,9 @@ libtextclassifier3::StatusOr<SnippetMatchProto> RetrieveMatch(
int match_len_utf32 = end_itr.utf32_index() - match_pos_utf32;
int match_mid_utf32 = match_pos_utf32 + match_len_utf32 / 2;
int window_start_min_exclusive_utf32 =
- (match_mid_utf32 - snippet_spec.max_window_bytes() / 2) - 1;
+ (match_mid_utf32 - snippet_spec.max_window_utf32_length() / 2) - 1;
int window_end_max_exclusive_utf32 =
- match_mid_utf32 + (snippet_spec.max_window_bytes() + 1) / 2;
+ match_mid_utf32 + (snippet_spec.max_window_utf32_length() + 1) / 2;
snippet_match.set_exact_match_byte_position(start_itr.utf8_index());
snippet_match.set_exact_match_utf16_position(start_itr.utf16_index());
@@ -296,7 +357,7 @@ libtextclassifier3::StatusOr<SnippetMatchProto> RetrieveMatch(
// Only include windows if it'll at least include the matched text. Otherwise,
// it'll just be an empty string anyways.
- if (snippet_spec.max_window_bytes() >= match_len_utf32) {
+ if (snippet_spec.max_window_utf32_length() >= match_len_utf32) {
// Find the beginning of the window.
ICING_ASSIGN_OR_RETURN(
CharacterIterator window_start,
@@ -337,8 +398,13 @@ libtextclassifier3::StatusOr<SnippetMatchProto> RetrieveMatch(
// DetermineWindowStart/End may change the position of the iterator. So,
// reset the iterator back to the original position.
- bool success = (match_pos_utf32 > 0) ? iterator->ResetToTokenAfter(match_pos_utf32 - 1)
- : iterator->ResetToStart();
+ bool success = false;
+ if (match_pos_utf32 > 0) {
+ success = iterator->ResetToTokenStartingAfter(match_pos_utf32 - 1);
+ } else {
+ success = iterator->ResetToStart();
+ }
+
if (!success) {
return absl_ports::InternalError(
"Couldn't reset tokenizer to determine snippet window!");
diff --git a/icing/result/snippet-retriever_test.cc b/icing/result/snippet-retriever_test.cc
index f811941..0de2295 100644
--- a/icing/result/snippet-retriever_test.cc
+++ b/icing/result/snippet-retriever_test.cc
@@ -22,7 +22,6 @@
#include "gtest/gtest.h"
#include "icing/document-builder.h"
#include "icing/file/mock-filesystem.h"
-#include "icing/helpers/icu/icu-data-file-helper.h"
#include "icing/portable/equals-proto.h"
#include "icing/portable/platform.h"
#include "icing/proto/document.pb.h"
@@ -37,6 +36,7 @@
#include "icing/store/key-mapper.h"
#include "icing/testing/common-matchers.h"
#include "icing/testing/fake-clock.h"
+#include "icing/testing/icu-data-file-helper.h"
#include "icing/testing/jni-test-helpers.h"
#include "icing/testing/snippet-helpers.h"
#include "icing/testing/test-data.h"
@@ -58,16 +58,18 @@ using ::testing::Eq;
using ::testing::IsEmpty;
using ::testing::SizeIs;
-constexpr PropertyConfigProto_Cardinality_Code CARDINALITY_OPTIONAL =
- PropertyConfigProto_Cardinality_Code_OPTIONAL;
-constexpr PropertyConfigProto_Cardinality_Code CARDINALITY_REPEATED =
- PropertyConfigProto_Cardinality_Code_REPEATED;
+constexpr PropertyConfigProto::Cardinality::Code CARDINALITY_OPTIONAL =
+ PropertyConfigProto::Cardinality::OPTIONAL;
+constexpr PropertyConfigProto::Cardinality::Code CARDINALITY_REPEATED =
+ PropertyConfigProto::Cardinality::REPEATED;
-constexpr StringIndexingConfig_TokenizerType_Code TOKENIZER_PLAIN =
- StringIndexingConfig_TokenizerType_Code_PLAIN;
+constexpr StringIndexingConfig::TokenizerType::Code TOKENIZER_PLAIN =
+ StringIndexingConfig::TokenizerType::PLAIN;
+constexpr StringIndexingConfig::TokenizerType::Code TOKENIZER_VERBATIM =
+ StringIndexingConfig::TokenizerType::VERBATIM;
-constexpr TermMatchType_Code MATCH_EXACT = TermMatchType_Code_EXACT_ONLY;
-constexpr TermMatchType_Code MATCH_PREFIX = TermMatchType_Code_PREFIX;
+constexpr TermMatchType::Code MATCH_EXACT = TermMatchType::EXACT_ONLY;
+constexpr TermMatchType::Code MATCH_PREFIX = TermMatchType::PREFIX;
std::vector<std::string_view> GetPropertyPaths(const SnippetProto& snippet) {
std::vector<std::string_view> paths;
@@ -131,7 +133,7 @@ class SnippetRetrieverTest : public testing::Test {
snippet_spec_.set_num_to_snippet(std::numeric_limits<int32_t>::max());
snippet_spec_.set_num_matches_per_property(
std::numeric_limits<int32_t>::max());
- snippet_spec_.set_max_window_bytes(64);
+ snippet_spec_.set_max_window_utf32_length(64);
}
void TearDown() override {
@@ -178,7 +180,7 @@ TEST_F(SnippetRetrieverTest, SnippetingWindowMaxWindowSizeSmallerThanMatch) {
// Window starts at the beginning of "three" and ends in the middle of
// "three". len=4, orig_window= "thre"
- snippet_spec_.set_max_window_bytes(4);
+ snippet_spec_.set_max_window_utf32_length(4);
SnippetProto snippet = snippet_retriever_->RetrieveSnippet(
query_terms, MATCH_EXACT, snippet_spec_, document, section_mask);
@@ -204,7 +206,7 @@ TEST_F(SnippetRetrieverTest,
// Window starts at the beginning of "three" and at the exact end of
// "three". len=5, orig_window= "three"
- snippet_spec_.set_max_window_bytes(5);
+ snippet_spec_.set_max_window_utf32_length(5);
SnippetProto snippet = snippet_retriever_->RetrieveSnippet(
query_terms, MATCH_EXACT, snippet_spec_, document, section_mask);
@@ -230,7 +232,7 @@ TEST_F(SnippetRetrieverTest,
// Window starts at the beginning of "four" and at the exact end of
// "four". len=4, orig_window= "four"
- snippet_spec_.set_max_window_bytes(4);
+ snippet_spec_.set_max_window_utf32_length(4);
SnippetProto snippet = snippet_retriever_->RetrieveSnippet(
query_terms, MATCH_EXACT, snippet_spec_, document, section_mask);
@@ -262,7 +264,7 @@ TEST_F(SnippetRetrieverTest, SnippetingWindowMaxWindowStartsInWhitespace) {
// 1. untrimmed, no-shifting window will be (2,17).
// 2. trimmed, no-shifting window [4,13) "two three"
// 3. trimmed, shifted window [4,18) "two three four"
- snippet_spec_.set_max_window_bytes(14);
+ snippet_spec_.set_max_window_utf32_length(14);
SnippetProto snippet = snippet_retriever_->RetrieveSnippet(
query_terms, MATCH_EXACT, snippet_spec_, document, section_mask);
@@ -295,7 +297,7 @@ TEST_F(SnippetRetrieverTest, SnippetingWindowMaxWindowStartsMidToken) {
// 1. untrimmed, no-shifting window will be (1,18).
// 2. trimmed, no-shifting window [4,18) "two three four"
// 3. trimmed, shifted window [4,20) "two three four.."
- snippet_spec_.set_max_window_bytes(16);
+ snippet_spec_.set_max_window_utf32_length(16);
SnippetProto snippet = snippet_retriever_->RetrieveSnippet(
query_terms, MATCH_EXACT, snippet_spec_, document, section_mask);
@@ -321,7 +323,7 @@ TEST_F(SnippetRetrieverTest, SnippetingWindowMaxWindowEndsInPunctuation) {
// Window ends in the middle of all the punctuation and window starts at 0.
// len=20, orig_window="one two three four.."
- snippet_spec_.set_max_window_bytes(20);
+ snippet_spec_.set_max_window_utf32_length(20);
SnippetProto snippet = snippet_retriever_->RetrieveSnippet(
query_terms, MATCH_EXACT, snippet_spec_, document, section_mask);
@@ -349,7 +351,7 @@ TEST_F(SnippetRetrieverTest,
// Window ends in the middle of all the punctuation and window starts at 0.
// len=26, orig_window="pside down in Australia¿"
- snippet_spec_.set_max_window_bytes(24);
+ snippet_spec_.set_max_window_utf32_length(24);
SnippetProto snippet = snippet_retriever_->RetrieveSnippet(
query_terms, MATCH_EXACT, snippet_spec_, document, section_mask);
@@ -377,7 +379,7 @@ TEST_F(SnippetRetrieverTest,
// Window ends in the middle of all the punctuation and window starts at 0.
// len=26, orig_window="upside down in Australia¿ "
- snippet_spec_.set_max_window_bytes(26);
+ snippet_spec_.set_max_window_utf32_length(26);
SnippetProto snippet = snippet_retriever_->RetrieveSnippet(
query_terms, MATCH_EXACT, snippet_spec_, document, section_mask);
@@ -410,7 +412,7 @@ TEST_F(SnippetRetrieverTest, SnippetingWindowMaxWindowStartsBeforeValueStart) {
// 1. untrimmed, no-shifting window will be (-2,21).
// 2. trimmed, no-shifting window [0,21) "one two three four..."
// 3. trimmed, shifted window [0,22) "one two three four...."
- snippet_spec_.set_max_window_bytes(22);
+ snippet_spec_.set_max_window_utf32_length(22);
SnippetProto snippet = snippet_retriever_->RetrieveSnippet(
query_terms, MATCH_EXACT, snippet_spec_, document, section_mask);
@@ -436,7 +438,7 @@ TEST_F(SnippetRetrieverTest, SnippetingWindowMaxWindowEndsInWhitespace) {
// Window ends before "five" but after all the punctuation
// len=26, orig_window="one two three four.... "
- snippet_spec_.set_max_window_bytes(26);
+ snippet_spec_.set_max_window_utf32_length(26);
SnippetProto snippet = snippet_retriever_->RetrieveSnippet(
query_terms, MATCH_EXACT, snippet_spec_, document, section_mask);
@@ -469,7 +471,7 @@ TEST_F(SnippetRetrieverTest, SnippetingWindowMaxWindowEndsMidToken) {
// 1. untrimmed, no-shifting window will be ((-7,26).
// 2. trimmed, no-shifting window [0,26) "one two three four...."
// 3. trimmed, shifted window [0,27) "one two three four.... five"
- snippet_spec_.set_max_window_bytes(32);
+ snippet_spec_.set_max_window_utf32_length(32);
SnippetProto snippet = snippet_retriever_->RetrieveSnippet(
query_terms, MATCH_EXACT, snippet_spec_, document, section_mask);
@@ -495,7 +497,7 @@ TEST_F(SnippetRetrieverTest, SnippetingWindowMaxWindowSizeEqualToValueSize) {
// Max window size equals the size of the value.
// len=34, orig_window="one two three four.... five"
- snippet_spec_.set_max_window_bytes(34);
+ snippet_spec_.set_max_window_utf32_length(34);
SnippetProto snippet = snippet_retriever_->RetrieveSnippet(
query_terms, MATCH_EXACT, snippet_spec_, document, section_mask);
@@ -521,7 +523,7 @@ TEST_F(SnippetRetrieverTest, SnippetingWindowMaxWindowSizeLargerThanValueSize) {
// Max window size exceeds the size of the value.
// len=36, orig_window="one two three four.... five"
- snippet_spec_.set_max_window_bytes(36);
+ snippet_spec_.set_max_window_utf32_length(36);
SnippetProto snippet = snippet_retriever_->RetrieveSnippet(
query_terms, MATCH_EXACT, snippet_spec_, document, section_mask);
@@ -555,7 +557,7 @@ TEST_F(SnippetRetrieverTest, SnippetingWindowMatchAtTextStart) {
// 1. untrimmed, no-shifting window will be (-10,19).
// 2. trimmed, no-shifting window [0,19) "one two three four."
// 3. trimmed, shifted window [0,27) "one two three four.... five"
- snippet_spec_.set_max_window_bytes(28);
+ snippet_spec_.set_max_window_utf32_length(28);
SnippetProto snippet = snippet_retriever_->RetrieveSnippet(
query_terms, MATCH_EXACT, snippet_spec_, document, section_mask);
@@ -589,7 +591,7 @@ TEST_F(SnippetRetrieverTest, SnippetingWindowMatchAtTextEnd) {
// 1. untrimmed, no-shifting window will be (10,39).
// 2. trimmed, no-shifting window [14,31) "four.... five six"
// 3. trimmed, shifted window [4,31) "two three four.... five six"
- snippet_spec_.set_max_window_bytes(28);
+ snippet_spec_.set_max_window_utf32_length(28);
SnippetProto snippet = snippet_retriever_->RetrieveSnippet(
query_terms, MATCH_EXACT, snippet_spec_, document, section_mask);
@@ -623,7 +625,7 @@ TEST_F(SnippetRetrieverTest, SnippetingWindowMatchAtTextStartShortText) {
// 1. untrimmed, no-shifting window will be (-10,19).
// 2. trimmed, no-shifting window [0, 19) "one two three four."
// 3. trimmed, shifted window [0, 22) "one two three four...."
- snippet_spec_.set_max_window_bytes(28);
+ snippet_spec_.set_max_window_utf32_length(28);
SnippetProto snippet = snippet_retriever_->RetrieveSnippet(
query_terms, MATCH_EXACT, snippet_spec_, document, section_mask);
@@ -657,7 +659,7 @@ TEST_F(SnippetRetrieverTest, SnippetingWindowMatchAtTextEndShortText) {
// 1. untrimmed, no-shifting window will be (1,30).
// 2. trimmed, no-shifting window [4, 22) "two three four...."
// 3. trimmed, shifted window [0, 22) "one two three four...."
- snippet_spec_.set_max_window_bytes(28);
+ snippet_spec_.set_max_window_utf32_length(28);
SnippetProto snippet = snippet_retriever_->RetrieveSnippet(
query_terms, MATCH_EXACT, snippet_spec_, document, section_mask);
@@ -721,7 +723,7 @@ TEST_F(SnippetRetrieverTest, SimpleSnippetingNoWindowing) {
.AddStringProperty("body", "Only a fool would match this content.")
.Build();
- snippet_spec_.set_max_window_bytes(0);
+ snippet_spec_.set_max_window_utf32_length(0);
SectionIdMask section_mask = 0b00000011;
SectionRestrictQueryTermsMap query_terms{{"", {"foo"}}};
@@ -1473,7 +1475,7 @@ TEST_F(SnippetRetrieverTest, CJKSnippetWindowTest) {
// 1. untrimmed, no-shifting window will be (0,7).
// 2. trimmed, no-shifting window [1, 6) "每天走路去".
// 3. trimmed, shifted window [0, 6) "我每天走路去"
- snippet_spec_.set_max_window_bytes(6);
+ snippet_spec_.set_max_window_utf32_length(6);
SnippetProto snippet = snippet_retriever_->RetrieveSnippet(
query_terms, MATCH_PREFIX, snippet_spec_, document, section_mask);
@@ -1572,7 +1574,7 @@ TEST_F(SnippetRetrieverTest, Utf16MultiCodeUnitWindowTest) {
// UTF8 idx: 9 22
// UTF16 idx: 5 12
// UTF32 idx: 3 7
- snippet_spec_.set_max_window_bytes(6);
+ snippet_spec_.set_max_window_utf32_length(6);
SnippetProto snippet = snippet_retriever_->RetrieveSnippet(
query_terms, MATCH_PREFIX, snippet_spec_, document, section_mask);
@@ -1596,6 +1598,117 @@ TEST_F(SnippetRetrieverTest, Utf16MultiCodeUnitWindowTest) {
EXPECT_THAT(match_proto.window_utf16_length(), Eq(7));
}
+TEST_F(SnippetRetrieverTest, SnippettingVerbatimAscii) {
+ SchemaProto schema =
+ SchemaBuilder()
+ .AddType(SchemaTypeConfigBuilder()
+ .SetType("verbatimType")
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("verbatim")
+ .SetDataTypeString(MATCH_EXACT,
+ TOKENIZER_VERBATIM)
+ .SetCardinality(CARDINALITY_REPEATED)))
+ .Build();
+ ICING_ASSERT_OK(schema_store_->SetSchema(
+ schema, /*ignore_errors_and_delete_documents=*/true));
+ ICING_ASSERT_OK_AND_ASSIGN(
+ snippet_retriever_,
+ SnippetRetriever::Create(schema_store_.get(), language_segmenter_.get(),
+ normalizer_.get()));
+
+ DocumentProto document = DocumentBuilder()
+ .SetKey("icing", "verbatim/1")
+ .SetSchema("verbatimType")
+ .AddStringProperty("verbatim", "Hello, world!")
+ .Build();
+
+ SectionIdMask section_mask = 0b00000001;
+ SectionRestrictQueryTermsMap query_terms{{"", {"Hello, world!"}}};
+
+ snippet_spec_.set_max_window_utf32_length(13);
+ SnippetProto snippet = snippet_retriever_->RetrieveSnippet(
+ query_terms, MATCH_EXACT, snippet_spec_, document, section_mask);
+
+ // There should be only one snippet entry and match: the verbatim token in
+ // its entirety.
+ ASSERT_THAT(snippet.entries(), SizeIs(1));
+
+ const SnippetProto::EntryProto* entry = &snippet.entries(0);
+ ASSERT_THAT(entry->snippet_matches(), SizeIs(1));
+ ASSERT_THAT(entry->property_name(), "verbatim");
+
+ const SnippetMatchProto& match_proto = entry->snippet_matches(0);
+ // We expect the match to begin at position 0, and to span the entire token
+ // which contains 13 characters.
+ EXPECT_THAT(match_proto.window_byte_position(), Eq(0));
+ EXPECT_THAT(match_proto.window_utf16_length(), Eq(13));
+
+ // We expect the submatch to begin at position 0 of the verbatim token and
+ // span the length of our query term "Hello, world!", which has utf-16 length
+ // of 13. The submatch length is equal to the window length as the query the
+ // snippet is retrieved with an exact term match.
+ EXPECT_THAT(match_proto.exact_match_utf16_position(), Eq(0));
+ EXPECT_THAT(match_proto.submatch_utf16_length(), Eq(13));
+}
+
+TEST_F(SnippetRetrieverTest, SnippettingVerbatimCJK) {
+ SchemaProto schema =
+ SchemaBuilder()
+ .AddType(SchemaTypeConfigBuilder()
+ .SetType("verbatimType")
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("verbatim")
+ .SetDataTypeString(MATCH_PREFIX,
+ TOKENIZER_VERBATIM)
+ .SetCardinality(CARDINALITY_REPEATED)))
+ .Build();
+ ICING_ASSERT_OK(schema_store_->SetSchema(
+ schema, /*ignore_errors_and_delete_documents=*/true));
+ ICING_ASSERT_OK_AND_ASSIGN(
+ snippet_retriever_,
+ SnippetRetriever::Create(schema_store_.get(), language_segmenter_.get(),
+ normalizer_.get()));
+
+ // String: "我每天走路去上班。"
+ // ^ ^ ^ ^^
+ // UTF8 idx: 0 3 9 15 18
+ // UTF16 idx: 0 1 3 5 6
+ // UTF32 idx: 0 1 3 5 6
+ // Breaks into segments: "我", "每天", "走路", "去", "上班"
+ std::string chinese_string = "我每天走路去上班。";
+ DocumentProto document = DocumentBuilder()
+ .SetKey("icing", "verbatim/1")
+ .SetSchema("verbatimType")
+ .AddStringProperty("verbatim", chinese_string)
+ .Build();
+
+ SectionIdMask section_mask = 0b00000001;
+ SectionRestrictQueryTermsMap query_terms{{"", {"我每"}}};
+
+ snippet_spec_.set_max_window_utf32_length(9);
+ SnippetProto snippet = snippet_retriever_->RetrieveSnippet(
+ query_terms, MATCH_PREFIX, snippet_spec_, document, section_mask);
+
+  // There should be only one snippet entry and one match: the verbatim token
+  // in its entirety.
+ ASSERT_THAT(snippet.entries(), SizeIs(1));
+
+ const SnippetProto::EntryProto* entry = &snippet.entries(0);
+ ASSERT_THAT(entry->snippet_matches(), SizeIs(1));
+ ASSERT_THAT(entry->property_name(), "verbatim");
+
+ const SnippetMatchProto& match_proto = entry->snippet_matches(0);
+ // We expect the match to begin at position 0, and to span the entire token
+ // which has utf-16 length of 9.
+ EXPECT_THAT(match_proto.window_byte_position(), Eq(0));
+ EXPECT_THAT(match_proto.window_utf16_length(), Eq(9));
+
+ // We expect the submatch to begin at position 0 of the verbatim token and
+ // span the length of our query term "我每", which has utf-16 length of 2.
+ EXPECT_THAT(match_proto.exact_match_utf16_position(), Eq(0));
+ EXPECT_THAT(match_proto.submatch_utf16_length(), Eq(2));
+}
+
} // namespace
} // namespace lib
diff --git a/icing/schema/schema-store.cc b/icing/schema/schema-store.cc
index 67528ab..acc5030 100644
--- a/icing/schema/schema-store.cc
+++ b/icing/schema/schema-store.cc
@@ -268,7 +268,7 @@ libtextclassifier3::Status SchemaStore::UpdateHeader(const Crc32& checksum) {
libtextclassifier3::Status SchemaStore::ResetSchemaTypeMapper() {
// TODO(b/139734457): Replace ptr.reset()->Delete->Create flow with Reset().
schema_type_mapper_.reset();
- // TODO(b/144458732): Implement a more robust version of TC_RETURN_IF_ERROR
+ // TODO(b/216487496): Implement a more robust version of TC_RETURN_IF_ERROR
// that can support error logging.
libtextclassifier3::Status status = KeyMapper<SchemaTypeId>::Delete(
filesystem_, MakeSchemaTypeMapperFilename(base_dir_));
@@ -464,11 +464,8 @@ libtextclassifier3::Status SchemaStore::PersistToDisk() {
SchemaStoreStorageInfoProto SchemaStore::GetStorageInfo() const {
SchemaStoreStorageInfoProto storage_info;
int64_t directory_size = filesystem_.GetDiskUsage(base_dir_.c_str());
- if (directory_size != Filesystem::kBadFileSize) {
- storage_info.set_schema_store_size(directory_size);
- } else {
- storage_info.set_schema_store_size(-1);
- }
+ storage_info.set_schema_store_size(
+ Filesystem::SanitizeFileSize(directory_size));
ICING_ASSIGN_OR_RETURN(const SchemaProto* schema, GetSchema(), storage_info);
storage_info.set_num_schema_types(schema->types_size());
int total_sections = 0;
@@ -496,5 +493,17 @@ SchemaStore::GetSectionMetadata(const std::string& schema_type) const {
return section_manager_->GetMetadataList(schema_type);
}
+libtextclassifier3::StatusOr<SchemaDebugInfoProto> SchemaStore::GetDebugInfo()
+ const {
+ SchemaDebugInfoProto debug_info;
+ if (has_schema_successfully_set_) {
+ ICING_ASSIGN_OR_RETURN(const SchemaProto* schema, GetSchema());
+ *debug_info.mutable_schema() = *schema;
+ }
+ ICING_ASSIGN_OR_RETURN(Crc32 crc, ComputeChecksum());
+ debug_info.set_crc(crc.Get());
+ return debug_info;
+}
+
} // namespace lib
} // namespace icing
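A minimal caller-side sketch of the new GetDebugInfo() API (illustrative only, not part of this patch; assumes an already-created schema_store):

    libtextclassifier3::StatusOr<SchemaDebugInfoProto> debug_info_or =
        schema_store->GetDebugInfo();
    if (debug_info_or.ok()) {
      const SchemaDebugInfoProto& debug_info = debug_info_or.ValueOrDie();
      // crc is always set; schema is only populated once a schema has been set.
      ICING_LOG(INFO) << "SchemaStore crc=" << debug_info.crc()
                      << ", num types=" << debug_info.schema().types_size();
    }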
diff --git a/icing/schema/schema-store.h b/icing/schema/schema-store.h
index 6b6528d..2d3aca7 100644
--- a/icing/schema/schema-store.h
+++ b/icing/schema/schema-store.h
@@ -26,6 +26,7 @@
#include "icing/text_classifier/lib3/utils/base/statusor.h"
#include "icing/file/file-backed-proto.h"
#include "icing/file/filesystem.h"
+#include "icing/proto/debug.pb.h"
#include "icing/proto/document.pb.h"
#include "icing/proto/logging.pb.h"
#include "icing/proto/schema.pb.h"
@@ -137,9 +138,7 @@ class SchemaStore {
// Persists and updates checksum of subcomponents.
~SchemaStore();
- // Retrieve the current schema if it exists. Caller does not get ownership of
- // the schema proto and modifying the returned pointer does not affect the
- // underlying schema proto.
+ // Retrieve the current schema if it exists.
//
// Returns:
// SchemaProto* if exists
@@ -258,6 +257,13 @@ class SchemaStore {
// that field will be set to -1.
SchemaStoreStorageInfoProto GetStorageInfo() const;
+ // Get debug information for the schema store.
+ //
+ // Returns:
+ // SchemaDebugInfoProto on success
+ // INTERNAL_ERROR on IO errors, crc compute error
+ libtextclassifier3::StatusOr<SchemaDebugInfoProto> GetDebugInfo() const;
+
private:
// Use SchemaStore::Create instead.
explicit SchemaStore(const Filesystem* filesystem, std::string base_dir,
diff --git a/icing/schema/schema-store_test.cc b/icing/schema/schema-store_test.cc
index be7170f..113084e 100644
--- a/icing/schema/schema-store_test.cc
+++ b/icing/schema/schema-store_test.cc
@@ -44,23 +44,24 @@ using ::icing::lib::portable_equals_proto::EqualsProto;
using ::testing::ElementsAre;
using ::testing::Eq;
using ::testing::Ge;
+using ::testing::Gt;
using ::testing::Not;
using ::testing::Pointee;
-constexpr PropertyConfigProto_Cardinality_Code CARDINALITY_OPTIONAL =
- PropertyConfigProto_Cardinality_Code_OPTIONAL;
-constexpr PropertyConfigProto_Cardinality_Code CARDINALITY_REPEATED =
- PropertyConfigProto_Cardinality_Code_REPEATED;
+constexpr PropertyConfigProto::Cardinality::Code CARDINALITY_OPTIONAL =
+ PropertyConfigProto::Cardinality::OPTIONAL;
+constexpr PropertyConfigProto::Cardinality::Code CARDINALITY_REPEATED =
+ PropertyConfigProto::Cardinality::REPEATED;
-constexpr StringIndexingConfig_TokenizerType_Code TOKENIZER_PLAIN =
- StringIndexingConfig_TokenizerType_Code_PLAIN;
+constexpr StringIndexingConfig::TokenizerType::Code TOKENIZER_PLAIN =
+ StringIndexingConfig::TokenizerType::PLAIN;
-constexpr TermMatchType_Code MATCH_EXACT = TermMatchType_Code_EXACT_ONLY;
+constexpr TermMatchType::Code MATCH_EXACT = TermMatchType::EXACT_ONLY;
-constexpr PropertyConfigProto_DataType_Code TYPE_STRING =
- PropertyConfigProto_DataType_Code_STRING;
-constexpr PropertyConfigProto_DataType_Code TYPE_DOUBLE =
- PropertyConfigProto_DataType_Code_DOUBLE;
+constexpr PropertyConfigProto::DataType::Code TYPE_STRING =
+ PropertyConfigProto::DataType::STRING;
+constexpr PropertyConfigProto::DataType::Code TYPE_DOUBLE =
+ PropertyConfigProto::DataType::DOUBLE;
class SchemaStoreTest : public ::testing::Test {
protected:
@@ -868,6 +869,38 @@ TEST_F(SchemaStoreTest, SchemaStoreStorageInfoProto) {
EXPECT_THAT(storage_info.num_schema_types_sections_exhausted(), Eq(1));
}
+TEST_F(SchemaStoreTest, GetDebugInfo) {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<SchemaStore> schema_store,
+ SchemaStore::Create(&filesystem_, test_dir_, &fake_clock_));
+
+ // Set schema
+ ASSERT_THAT(
+ schema_store->SetSchema(schema_),
+ IsOkAndHolds(EqualsSetSchemaResult(SchemaStore::SetSchemaResult{
+ .success = true,
+ .schema_types_new_by_name = {schema_.types(0).schema_type()}})));
+
+ // Check debug info
+ ICING_ASSERT_OK_AND_ASSIGN(SchemaDebugInfoProto out,
+ schema_store->GetDebugInfo());
+ EXPECT_THAT(out.schema(), EqualsProto(schema_));
+ EXPECT_THAT(out.crc(), Gt(0));
+}
+
+TEST_F(SchemaStoreTest, GetDebugInfoForEmptySchemaStore) {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<SchemaStore> schema_store,
+ SchemaStore::Create(&filesystem_, test_dir_, &fake_clock_));
+
+ // Check debug info before setting a schema
+ ICING_ASSERT_OK_AND_ASSIGN(SchemaDebugInfoProto out,
+ schema_store->GetDebugInfo());
+ SchemaDebugInfoProto expected_out;
+ expected_out.set_crc(0);
+ EXPECT_THAT(out, EqualsProto(expected_out));
+}
+
} // namespace
} // namespace lib
diff --git a/icing/schema/schema-util_test.cc b/icing/schema/schema-util_test.cc
index 26ef4c7..f28a2f8 100644
--- a/icing/schema/schema-util_test.cc
+++ b/icing/schema/schema-util_test.cc
@@ -38,32 +38,32 @@ constexpr char kEmailType[] = "EmailMessage";
constexpr char kMessageType[] = "Text";
constexpr char kPersonType[] = "Person";
-constexpr PropertyConfigProto_DataType_Code TYPE_DOCUMENT =
- PropertyConfigProto_DataType_Code_DOCUMENT;
-constexpr PropertyConfigProto_DataType_Code TYPE_STRING =
- PropertyConfigProto_DataType_Code_STRING;
-constexpr PropertyConfigProto_DataType_Code TYPE_INT =
- PropertyConfigProto_DataType_Code_INT64;
-constexpr PropertyConfigProto_DataType_Code TYPE_DOUBLE =
- PropertyConfigProto_DataType_Code_DOUBLE;
-
-constexpr PropertyConfigProto_Cardinality_Code CARDINALITY_UNKNOWN =
- PropertyConfigProto_Cardinality_Code_UNKNOWN;
-constexpr PropertyConfigProto_Cardinality_Code CARDINALITY_REQUIRED =
- PropertyConfigProto_Cardinality_Code_REQUIRED;
-constexpr PropertyConfigProto_Cardinality_Code CARDINALITY_OPTIONAL =
- PropertyConfigProto_Cardinality_Code_OPTIONAL;
-constexpr PropertyConfigProto_Cardinality_Code CARDINALITY_REPEATED =
- PropertyConfigProto_Cardinality_Code_REPEATED;
-
-constexpr StringIndexingConfig_TokenizerType_Code TOKENIZER_NONE =
- StringIndexingConfig_TokenizerType_Code_NONE;
-constexpr StringIndexingConfig_TokenizerType_Code TOKENIZER_PLAIN =
- StringIndexingConfig_TokenizerType_Code_PLAIN;
-
-constexpr TermMatchType_Code MATCH_UNKNOWN = TermMatchType_Code_UNKNOWN;
-constexpr TermMatchType_Code MATCH_EXACT = TermMatchType_Code_EXACT_ONLY;
-constexpr TermMatchType_Code MATCH_PREFIX = TermMatchType_Code_PREFIX;
+constexpr PropertyConfigProto::DataType::Code TYPE_DOCUMENT =
+ PropertyConfigProto::DataType::DOCUMENT;
+constexpr PropertyConfigProto::DataType::Code TYPE_STRING =
+ PropertyConfigProto::DataType::STRING;
+constexpr PropertyConfigProto::DataType::Code TYPE_INT =
+ PropertyConfigProto::DataType::INT64;
+constexpr PropertyConfigProto::DataType::Code TYPE_DOUBLE =
+ PropertyConfigProto::DataType::DOUBLE;
+
+constexpr PropertyConfigProto::Cardinality::Code CARDINALITY_UNKNOWN =
+ PropertyConfigProto::Cardinality::UNKNOWN;
+constexpr PropertyConfigProto::Cardinality::Code CARDINALITY_REQUIRED =
+ PropertyConfigProto::Cardinality::REQUIRED;
+constexpr PropertyConfigProto::Cardinality::Code CARDINALITY_OPTIONAL =
+ PropertyConfigProto::Cardinality::OPTIONAL;
+constexpr PropertyConfigProto::Cardinality::Code CARDINALITY_REPEATED =
+ PropertyConfigProto::Cardinality::REPEATED;
+
+constexpr StringIndexingConfig::TokenizerType::Code TOKENIZER_NONE =
+ StringIndexingConfig::TokenizerType::NONE;
+constexpr StringIndexingConfig::TokenizerType::Code TOKENIZER_PLAIN =
+ StringIndexingConfig::TokenizerType::PLAIN;
+
+constexpr TermMatchType::Code MATCH_UNKNOWN = TermMatchType::UNKNOWN;
+constexpr TermMatchType::Code MATCH_EXACT = TermMatchType::EXACT_ONLY;
+constexpr TermMatchType::Code MATCH_PREFIX = TermMatchType::PREFIX;
TEST(SchemaUtilTest, DependencyGraphAlphabeticalOrder) {
// Create a schema with the following dependencies:
diff --git a/icing/scoring/scorer_test.cc b/icing/scoring/scorer_test.cc
index f22a31a..fef612d 100644
--- a/icing/scoring/scorer_test.cc
+++ b/icing/scoring/scorer_test.cc
@@ -40,11 +40,11 @@ namespace lib {
namespace {
using ::testing::Eq;
-constexpr PropertyConfigProto_DataType_Code TYPE_STRING =
- PropertyConfigProto_DataType_Code_STRING;
+constexpr PropertyConfigProto::DataType::Code TYPE_STRING =
+ PropertyConfigProto::DataType::STRING;
-constexpr PropertyConfigProto_Cardinality_Code CARDINALITY_REQUIRED =
- PropertyConfigProto_Cardinality_Code_REQUIRED;
+constexpr PropertyConfigProto::Cardinality::Code CARDINALITY_REQUIRED =
+ PropertyConfigProto::Cardinality::REQUIRED;
class ScorerTest : public testing::Test {
protected:
diff --git a/icing/scoring/scoring-processor_test.cc b/icing/scoring/scoring-processor_test.cc
index 7e5cb0f..f169039 100644
--- a/icing/scoring/scoring-processor_test.cc
+++ b/icing/scoring/scoring-processor_test.cc
@@ -34,14 +34,16 @@ namespace lib {
namespace {
using ::testing::ElementsAre;
+using ::testing::Eq;
+using ::testing::Gt;
using ::testing::IsEmpty;
using ::testing::SizeIs;
-constexpr PropertyConfigProto_DataType_Code TYPE_STRING =
- PropertyConfigProto_DataType_Code_STRING;
+constexpr PropertyConfigProto::DataType::Code TYPE_STRING =
+ PropertyConfigProto::DataType::STRING;
-constexpr PropertyConfigProto_Cardinality_Code CARDINALITY_OPTIONAL =
- PropertyConfigProto_Cardinality_Code_OPTIONAL;
+constexpr PropertyConfigProto::Cardinality::Code CARDINALITY_OPTIONAL =
+ PropertyConfigProto::Cardinality::OPTIONAL;
class ScoringProcessorTest : public testing::Test {
protected:
@@ -789,6 +791,77 @@ TEST_F(ScoringProcessorTest,
ElementsAre(EqualsScoredDocumentHit(expected_scored_doc_hit)));
}
+TEST_F(ScoringProcessorTest,
+ ShouldScoreByRelevanceScore_WithZeroPropertyWeight) {
+ DocumentProto document1 =
+ CreateDocument("icing", "email/1", kDefaultScore,
+ /*creation_timestamp_ms=*/kDefaultCreationTimestampMs);
+ DocumentProto document2 =
+ CreateDocument("icing", "email/2", kDefaultScore,
+ /*creation_timestamp_ms=*/kDefaultCreationTimestampMs);
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ DocumentId document_id1,
+ document_store()->Put(document1, /*num_tokens=*/1));
+ ICING_ASSERT_OK_AND_ASSIGN(
+ DocumentId document_id2,
+ document_store()->Put(document2, /*num_tokens=*/1));
+
+ // Document 1 contains the term "foo" 1 time in the "body" property
+ SectionId body_section_id = 0;
+ DocHitInfo doc_hit_info1(document_id1);
+ doc_hit_info1.UpdateSection(body_section_id, /*hit_term_frequency=*/1);
+
+ // Document 2 contains the term "foo" 1 time in the "subject" property
+ SectionId subject_section_id = 1;
+ DocHitInfo doc_hit_info2(document_id2);
+ doc_hit_info2.UpdateSection(subject_section_id, /*hit_term_frequency=*/1);
+
+  // Creates the input doc_hit_infos for the query.
+ std::vector<DocHitInfo> doc_hit_infos = {doc_hit_info1, doc_hit_info2};
+
+ // Creates a dummy DocHitInfoIterator with 2 results for the query "foo"
+ std::unique_ptr<DocHitInfoIterator> doc_hit_info_iterator =
+ std::make_unique<DocHitInfoIteratorDummy>(doc_hit_infos, "foo");
+
+ ScoringSpecProto spec_proto;
+ spec_proto.set_rank_by(ScoringSpecProto::RankingStrategy::RELEVANCE_SCORE);
+
+ // Sets property weight for "body" to 0.0.
+ PropertyWeight body_property_weight =
+ CreatePropertyWeight(/*path=*/"body", /*weight=*/0.0);
+ // Sets property weight for "subject" to 1.0.
+ PropertyWeight subject_property_weight =
+ CreatePropertyWeight(/*path=*/"subject", /*weight=*/1.0);
+ *spec_proto.add_type_property_weights() = CreateTypePropertyWeights(
+ /*schema_type=*/"email", {body_property_weight, subject_property_weight});
+
+ // Creates a ScoringProcessor
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<ScoringProcessor> scoring_processor,
+ ScoringProcessor::Create(spec_proto, document_store(), schema_store()));
+
+ std::unordered_map<std::string, std::unique_ptr<DocHitInfoIterator>>
+ query_term_iterators;
+ query_term_iterators["foo"] =
+ std::make_unique<DocHitInfoIteratorDummy>(doc_hit_infos, "foo");
+
+ std::vector<ScoredDocumentHit> scored_document_hits =
+ scoring_processor->Score(std::move(doc_hit_info_iterator),
+ /*num_to_score=*/2, &query_term_iterators);
+
+ // We expect document1 to have a score of 0.0 as the query term "foo" matches
+ // in the "body" property which has a weight of 0.0. This is a result of the
+ // weighted term frequency being scaled down to 0.0 for the hit. We expect
+ // document2 to have a positive score as the query term "foo" matches in the
+ // "subject" property which has a weight of 1.0.
+ EXPECT_THAT(scored_document_hits, SizeIs(2));
+ EXPECT_THAT(scored_document_hits.at(0).document_id(), Eq(document_id1));
+ EXPECT_THAT(scored_document_hits.at(0).score(), Eq(0.0));
+ EXPECT_THAT(scored_document_hits.at(1).document_id(), Eq(document_id2));
+ EXPECT_THAT(scored_document_hits.at(1).score(), Gt(0.0));
+}
+
TEST_F(ScoringProcessorTest, ShouldScoreByCreationTimestamp) {
DocumentProto document1 =
CreateDocument("icing", "email/1", kDefaultScore,
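To make the zero-weight expectation above concrete, here is a simplified, hypothetical sketch of the scaling idea the test comments describe (this is not Icing's actual BM25F relevance computation):

    // Hypothetical: a hit's term frequency is scaled by the normalized weight
    // of the section it occurred in before it contributes to the score.
    double WeightedTermFrequency(int term_frequency, double normalized_weight) {
      return term_frequency * normalized_weight;
    }
    // document1: "body" hit, weight 0.0 -> 1 * 0.0 == 0.0 -> score 0.0.
    // document2: "subject" hit, weight 1.0 -> 1 * 1.0 == 1.0 -> score > 0.0.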
diff --git a/icing/scoring/section-weights.cc b/icing/scoring/section-weights.cc
index c4afe7f..ed7cd5e 100644
--- a/icing/scoring/section-weights.cc
+++ b/icing/scoring/section-weights.cc
@@ -27,10 +27,14 @@ namespace lib {
namespace {
-// Normalizes all weights in the map to be in range (0.0, 1.0], where the max
-// weight is normalized to 1.0.
+// Normalizes all weights in the map to be in range [0.0, 1.0], where the max
+// weight is normalized to 1.0. In the case that all weights are equal to 0.0,
+// the normalized weight for each will be 0.0.
inline void NormalizeSectionWeights(
double max_weight, std::unordered_map<SectionId, double>& section_weights) {
+ if (max_weight == 0.0) {
+ return;
+ }
for (auto& raw_weight : section_weights) {
raw_weight.second = raw_weight.second / max_weight;
}
@@ -70,11 +74,11 @@ SectionWeights::Create(const SchemaStore* schema_store,
type_property_weights.property_weights()) {
double property_path_weight = property_weight.weight();
- // Return error on negative and zero weights.
- if (property_path_weight <= 0.0) {
+ // Return error on negative weights.
+ if (property_path_weight < 0.0) {
return absl_ports::InvalidArgumentError(IcingStringUtil::StringPrintf(
- "Property weight for property path \"%s\" is negative or zero. "
- "Negative and zero weights are invalid.",
+ "Property weight for property path \"%s\" is negative. Negative "
+ "weights are invalid.",
property_weight.path().c_str()));
}
property_paths_weights.insert(
@@ -116,7 +120,7 @@ inline SectionWeights::NormalizedSectionWeights
SectionWeights::ExtractNormalizedSectionWeights(
const std::unordered_map<std::string, double>& raw_weights,
const std::vector<SectionMetadata>& metadata_list) {
- double max_weight = 0.0;
+ double max_weight = -std::numeric_limits<double>::infinity();
std::unordered_map<SectionId, double> section_weights;
for (const SectionMetadata& section_metadata : metadata_list) {
std::string_view metadata_path = section_metadata.path;
@@ -132,10 +136,11 @@ SectionWeights::ExtractNormalizedSectionWeights(
NormalizeSectionWeights(max_weight, section_weights);
// Set normalized default weight to 1.0 in case there is no section
- // metadata and max_weight is 0.0 (we should not see this case).
- double normalized_default_weight = max_weight == 0.0
- ? kDefaultSectionWeight
- : kDefaultSectionWeight / max_weight;
+ // metadata and max_weight is -INF (we should not see this case).
+ double normalized_default_weight =
+ max_weight == -std::numeric_limits<double>::infinity()
+ ? kDefaultSectionWeight
+ : kDefaultSectionWeight / max_weight;
SectionWeights::NormalizedSectionWeights normalized_section_weights =
SectionWeights::NormalizedSectionWeights();
normalized_section_weights.section_weights = std::move(section_weights);
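A standalone sketch of the normalization rule above (illustrative, with SectionId simplified to int): each weight is divided by the maximum raw weight, and an all-zero map is returned unchanged so that no division by zero occurs:

    #include <unordered_map>

    void NormalizeForIllustration(double max_weight,
                                  std::unordered_map<int, double>& weights) {
      if (max_weight == 0.0) {
        return;  // All weights are 0.0; they stay normalized to 0.0.
      }
      for (auto& entry : weights) {
        entry.second /= max_weight;
      }
    }
    // Example: {body: 2.0, subject: 0.5} with max_weight=2.0 normalizes to
    // {body: 1.0, subject: 0.25}; an all-zero map stays all zeros.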
diff --git a/icing/scoring/section-weights_test.cc b/icing/scoring/section-weights_test.cc
index b90c3d5..330faee 100644
--- a/icing/scoring/section-weights_test.cc
+++ b/icing/scoring/section-weights_test.cc
@@ -48,13 +48,13 @@ class SectionWeightsTest : public testing::Test {
SchemaTypeConfigProto sender_schema =
SchemaTypeConfigBuilder()
.SetType("sender")
- .AddProperty(PropertyConfigBuilder()
- .SetName("name")
- .SetDataTypeString(
- TermMatchType::PREFIX,
- StringIndexingConfig::TokenizerType::PLAIN)
- .SetCardinality(
- PropertyConfigProto_Cardinality_Code_OPTIONAL))
+ .AddProperty(
+ PropertyConfigBuilder()
+ .SetName("name")
+ .SetDataTypeString(
+ TermMatchType::PREFIX,
+ StringIndexingConfig::TokenizerType::PLAIN)
+ .SetCardinality(PropertyConfigProto::Cardinality::OPTIONAL))
.Build();
SchemaTypeConfigProto email_schema =
SchemaTypeConfigBuilder()
@@ -65,24 +65,22 @@ class SectionWeightsTest : public testing::Test {
.SetDataTypeString(
TermMatchType::PREFIX,
StringIndexingConfig::TokenizerType::PLAIN)
- .SetDataType(PropertyConfigProto_DataType_Code_STRING)
- .SetCardinality(
- PropertyConfigProto_Cardinality_Code_OPTIONAL))
+ .SetDataType(PropertyConfigProto::DataType::STRING)
+ .SetCardinality(PropertyConfigProto::Cardinality::OPTIONAL))
.AddProperty(
PropertyConfigBuilder()
.SetName("body")
.SetDataTypeString(
TermMatchType::PREFIX,
StringIndexingConfig::TokenizerType::PLAIN)
- .SetDataType(PropertyConfigProto_DataType_Code_STRING)
- .SetCardinality(
- PropertyConfigProto_Cardinality_Code_OPTIONAL))
- .AddProperty(PropertyConfigBuilder()
- .SetName("sender")
- .SetDataTypeDocument(
- "sender", /*index_nested_properties=*/true)
- .SetCardinality(
- PropertyConfigProto_Cardinality_Code_OPTIONAL))
+ .SetDataType(PropertyConfigProto::DataType::STRING)
+ .SetCardinality(PropertyConfigProto::Cardinality::OPTIONAL))
+ .AddProperty(
+ PropertyConfigBuilder()
+ .SetName("sender")
+ .SetDataTypeDocument("sender",
+ /*index_nested_properties=*/true)
+ .SetCardinality(PropertyConfigProto::Cardinality::OPTIONAL))
.Build();
SchemaProto schema =
SchemaBuilder().AddType(sender_schema).AddType(email_schema).Build();
@@ -171,20 +169,79 @@ TEST_F(SectionWeightsTest, ShouldFailWithNegativeWeights) {
StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
}
-TEST_F(SectionWeightsTest, ShouldFailWithZeroWeight) {
+TEST_F(SectionWeightsTest, ShouldAcceptZeroWeight) {
ScoringSpecProto spec_proto;
TypePropertyWeights *type_property_weights =
spec_proto.add_type_property_weights();
- type_property_weights->set_schema_type("sender");
+ type_property_weights->set_schema_type("email");
- PropertyWeight *property_weight =
+ PropertyWeight *body_property_weight =
type_property_weights->add_property_weights();
- property_weight->set_weight(0.0);
- property_weight->set_path("name");
+ body_property_weight->set_weight(2.0);
+ body_property_weight->set_path("body");
- EXPECT_THAT(SectionWeights::Create(schema_store(), spec_proto).status(),
- StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
+ PropertyWeight *subject_property_weight =
+ type_property_weights->add_property_weights();
+ subject_property_weight->set_weight(0.0);
+ subject_property_weight->set_path("subject");
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<SectionWeights> section_weights,
+ SectionWeights::Create(schema_store(), spec_proto));
+ ICING_ASSERT_OK_AND_ASSIGN(SchemaTypeId email_schema_type_id,
+ schema_store()->GetSchemaTypeId("email"));
+
+ // Normalized weight for "body" property.
+ EXPECT_THAT(section_weights->GetNormalizedSectionWeight(email_schema_type_id,
+ /*section_id=*/0),
+ Eq(1.0));
+ // Normalized weight for "subject" property.
+ EXPECT_THAT(section_weights->GetNormalizedSectionWeight(email_schema_type_id,
+ /*section_id=*/2),
+ Eq(0.0));
+}
+
+TEST_F(SectionWeightsTest, ShouldNormalizeToZeroWhenAllWeightsZero) {
+ ScoringSpecProto spec_proto;
+
+ TypePropertyWeights *type_property_weights =
+ spec_proto.add_type_property_weights();
+ type_property_weights->set_schema_type("email");
+
+ PropertyWeight *body_property_weight =
+ type_property_weights->add_property_weights();
+ body_property_weight->set_weight(0.0);
+ body_property_weight->set_path("body");
+
+ PropertyWeight *sender_property_weight =
+ type_property_weights->add_property_weights();
+ sender_property_weight->set_weight(0.0);
+ sender_property_weight->set_path("sender.name");
+
+ PropertyWeight *subject_property_weight =
+ type_property_weights->add_property_weights();
+ subject_property_weight->set_weight(0.0);
+ subject_property_weight->set_path("subject");
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<SectionWeights> section_weights,
+ SectionWeights::Create(schema_store(), spec_proto));
+ ICING_ASSERT_OK_AND_ASSIGN(SchemaTypeId email_schema_type_id,
+ schema_store()->GetSchemaTypeId("email"));
+
+ // Normalized weight for "body" property.
+ EXPECT_THAT(section_weights->GetNormalizedSectionWeight(email_schema_type_id,
+ /*section_id=*/0),
+ Eq(0.0));
+ // Normalized weight for "sender.name" property (the nested property).
+ EXPECT_THAT(section_weights->GetNormalizedSectionWeight(email_schema_type_id,
+ /*section_id=*/1),
+ Eq(0.0));
+ // Normalized weight for "subject" property.
+ EXPECT_THAT(section_weights->GetNormalizedSectionWeight(email_schema_type_id,
+ /*section_id=*/2),
+ Eq(0.0));
}
TEST_F(SectionWeightsTest, ShouldReturnDefaultIfTypePropertyWeightsNotSet) {
diff --git a/icing/store/document-log-creator.cc b/icing/store/document-log-creator.cc
index 5e0426e..5e23a8e 100644
--- a/icing/store/document-log-creator.cc
+++ b/icing/store/document-log-creator.cc
@@ -72,19 +72,20 @@ DocumentLogCreator::Create(const Filesystem* filesystem,
bool v1_exists =
filesystem->FileExists(MakeDocumentLogFilenameV1(base_dir).c_str());
- bool regen_derived_files = false;
+ bool new_file = false;
+ int preexisting_file_version = kCurrentVersion;
if (v0_exists && !v1_exists) {
ICING_RETURN_IF_ERROR(MigrateFromV0ToV1(filesystem, base_dir));
// Need to regenerate derived files since documents may be written to a
// different file offset in the log.
- regen_derived_files = true;
+ preexisting_file_version = 0;
} else if (!v1_exists) {
// First time initializing a v1 log. There are no existing derived files at
// this point, so we should generate some. "regenerate" here also means
// "generate for the first time", i.e. we shouldn't expect there to be any
// existing derived files.
- regen_derived_files = true;
+ new_file = true;
}
ICING_ASSIGN_OR_RETURN(
@@ -96,7 +97,7 @@ DocumentLogCreator::Create(const Filesystem* filesystem,
/*compress_in=*/true)));
CreateResult create_result = {std::move(log_create_result),
- regen_derived_files};
+ preexisting_file_version, new_file};
return create_result;
}
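For orientation, a sketch (not actual patch code) of how a caller can combine the two new CreateResult fields; it mirrors the GetRecoveryCause logic added to DocumentStore later in this change:

    bool NeedsDerivedFileRegen(const DocumentLogCreator::CreateResult& result) {
      if (result.new_file) {
        return true;  // Brand-new log: derived files must be generated.
      }
      if (result.preexisting_file_version != DocumentLogCreator::kCurrentVersion) {
        return true;  // Old log format was migrated; offsets may have shifted.
      }
      // Data loss invalidates any existing derived files.
      return result.log_create_result.has_data_loss();
    }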
diff --git a/icing/store/document-log-creator.h b/icing/store/document-log-creator.h
index 51cf497..be8feed 100644
--- a/icing/store/document-log-creator.h
+++ b/icing/store/document-log-creator.h
@@ -30,14 +30,20 @@ namespace lib {
// be necessary.
class DocumentLogCreator {
public:
+  // Version 0 refers to FileBackedProtoLog.
+  // Version 1 refers to PortableFileBackedProtoLog with kFileFormatVersion = 0.
+ static constexpr int32_t kCurrentVersion = 1;
struct CreateResult {
// The create result passed up from the PortableFileBackedProtoLog::Create.
// Contains the document log.
PortableFileBackedProtoLog<DocumentWrapper>::CreateResult log_create_result;
- // Whether the caller needs to also regenerate/generate any derived files
- // based off of the initialized document log.
- bool regen_derived_files;
+ // The version number of the pre-existing document log file.
+ // If there is no document log file, it will be set to kCurrentVersion.
+ int preexisting_file_version;
+
+ // Whether the created file is new.
+ bool new_file;
};
// Creates the document log in the base_dir. Will create one if it doesn't
diff --git a/icing/store/document-store.cc b/icing/store/document-store.cc
index 226a96b..8c8369c 100644
--- a/icing/store/document-store.cc
+++ b/icing/store/document-store.cc
@@ -164,6 +164,32 @@ int64_t CalculateExpirationTimestampMs(int64_t creation_timestamp_ms,
return expiration_timestamp_ms;
}
+InitializeStatsProto::RecoveryCause GetRecoveryCause(
+ const DocumentLogCreator::CreateResult& create_result,
+ bool force_recovery_and_revalidate_documents) {
+ if (force_recovery_and_revalidate_documents) {
+ return InitializeStatsProto::SCHEMA_CHANGES_OUT_OF_SYNC;
+ } else if (create_result.log_create_result.has_data_loss()) {
+ return InitializeStatsProto::DATA_LOSS;
+ } else if (create_result.preexisting_file_version !=
+ DocumentLogCreator::kCurrentVersion) {
+ return InitializeStatsProto::LEGACY_DOCUMENT_LOG_FORMAT;
+ }
+ return InitializeStatsProto::NONE;
+}
+
+InitializeStatsProto::DocumentStoreDataStatus GetDataStatus(
+ DataLoss data_loss) {
+ switch (data_loss) {
+ case DataLoss::PARTIAL:
+ return InitializeStatsProto::PARTIAL_LOSS;
+ case DataLoss::COMPLETE:
+ return InitializeStatsProto::COMPLETE_LOSS;
+ case DataLoss::NONE:
+ return InitializeStatsProto::NO_DATA_LOSS;
+ }
+}
+
} // namespace
DocumentStore::DocumentStore(const Filesystem* filesystem,
@@ -236,44 +262,34 @@ libtextclassifier3::StatusOr<DataLoss> DocumentStore::Initialize(
std::move(create_result_or).ValueOrDie();
document_log_ = std::move(create_result.log_create_result.proto_log);
-
- if (create_result.regen_derived_files ||
- force_recovery_and_revalidate_documents ||
- create_result.log_create_result.has_data_loss()) {
+ InitializeStatsProto::RecoveryCause recovery_cause =
+ GetRecoveryCause(create_result, force_recovery_and_revalidate_documents);
+
+ if (recovery_cause != InitializeStatsProto::NONE || create_result.new_file) {
+ ICING_LOG(WARNING) << "Starting Document Store Recovery with cause="
+ << recovery_cause << ", and create result { new_file="
+                       << create_result.new_file << ", preexisting_file_version="
+ << create_result.preexisting_file_version << ", data_loss="
+ << create_result.log_create_result.data_loss << "} and kCurrentVersion="
+ << DocumentLogCreator::kCurrentVersion;
// We can't rely on any existing derived files. Recreate them from scratch.
// Currently happens if:
// 1) This is a new log and we don't have derived files yet
// 2) Client wanted us to force a regeneration.
// 3) Log has some data loss, can't rely on existing derived data.
- if (create_result.log_create_result.has_data_loss() &&
- initialize_stats != nullptr) {
- ICING_LOG(WARNING)
- << "Data loss in document log, regenerating derived files.";
- initialize_stats->set_document_store_recovery_cause(
- InitializeStatsProto::DATA_LOSS);
-
- if (create_result.log_create_result.data_loss == DataLoss::PARTIAL) {
- // Ground truth is partially lost.
- initialize_stats->set_document_store_data_status(
- InitializeStatsProto::PARTIAL_LOSS);
- } else {
- // Ground truth is completely lost.
- initialize_stats->set_document_store_data_status(
- InitializeStatsProto::COMPLETE_LOSS);
- }
- }
-
std::unique_ptr<Timer> document_recovery_timer = clock_.GetNewTimer();
libtextclassifier3::Status status =
RegenerateDerivedFiles(force_recovery_and_revalidate_documents);
if (initialize_stats != nullptr &&
- (force_recovery_and_revalidate_documents ||
- create_result.log_create_result.has_data_loss())) {
+ recovery_cause != InitializeStatsProto::NONE) {
// Only consider it a recovery if the client forced a recovery or there
// was data loss. Otherwise, this could just be the first time we're
// initializing and generating derived files.
initialize_stats->set_document_store_recovery_latency_ms(
document_recovery_timer->GetElapsedMilliseconds());
+ initialize_stats->set_document_store_recovery_cause(recovery_cause);
+ initialize_stats->set_document_store_data_status(
+ GetDataStatus(create_result.log_create_result.data_loss));
}
if (!status.ok()) {
ICING_LOG(ERROR)
@@ -282,13 +298,13 @@ libtextclassifier3::StatusOr<DataLoss> DocumentStore::Initialize(
}
} else {
if (!InitializeExistingDerivedFiles().ok()) {
- ICING_VLOG(1)
+ ICING_LOG(WARNING)
<< "Couldn't find derived files or failed to initialize them, "
"regenerating derived files for DocumentStore.";
std::unique_ptr<Timer> document_recovery_timer = clock_.GetNewTimer();
libtextclassifier3::Status status = RegenerateDerivedFiles(
- /*force_recovery_and_revalidate_documents*/ false);
- if (initialize_stats != nullptr && num_documents() > 0) {
+ /*force_recovery_and_revalidate_documents=*/false);
+ if (initialize_stats != nullptr) {
initialize_stats->set_document_store_recovery_cause(
InitializeStatsProto::IO_ERROR);
initialize_stats->set_document_store_recovery_latency_ms(
@@ -415,7 +431,19 @@ libtextclassifier3::Status DocumentStore::RegenerateDerivedFiles(
// Iterates through document log
auto iterator = document_log_->GetIterator();
auto iterator_status = iterator.Advance();
+ libtextclassifier3::StatusOr<int64_t> element_size =
+ document_log_->GetElementsFileSize();
+ libtextclassifier3::StatusOr<int64_t> disk_usage =
+ document_log_->GetDiskUsage();
+ if (element_size.ok() && disk_usage.ok()) {
+ ICING_VLOG(1) << "Starting recovery of document store. Document store "
+                     "elements file size="
+ << element_size.ValueOrDie()
+ << ", disk usage=" << disk_usage.ValueOrDie();
+ }
while (iterator_status.ok()) {
+ ICING_VLOG(2) << "Attempting to read document at offset="
+ << iterator.GetOffset();
libtextclassifier3::StatusOr<DocumentWrapper> document_wrapper_or =
document_log_->ReadProto(iterator.GetOffset());
@@ -530,7 +558,7 @@ libtextclassifier3::Status DocumentStore::RegenerateDerivedFiles(
libtextclassifier3::Status DocumentStore::ResetDocumentKeyMapper() {
// TODO(b/139734457): Replace ptr.reset()->Delete->Create flow with Reset().
document_key_mapper_.reset();
- // TODO(b/144458732): Implement a more robust version of TC_RETURN_IF_ERROR
+ // TODO(b/216487496): Implement a more robust version of TC_RETURN_IF_ERROR
// that can support error logging.
libtextclassifier3::Status status =
KeyMapper<DocumentId>::Delete(*filesystem_, base_dir_);
@@ -540,7 +568,7 @@ libtextclassifier3::Status DocumentStore::ResetDocumentKeyMapper() {
return status;
}
- // TODO(b/144458732): Implement a more robust version of TC_ASSIGN_OR_RETURN
+ // TODO(b/216487496): Implement a more robust version of TC_ASSIGN_OR_RETURN
// that can support error logging.
auto document_key_mapper_or =
KeyMapper<DocumentId>::Create(*filesystem_, base_dir_, kUriMapperMaxSize);
@@ -556,7 +584,7 @@ libtextclassifier3::Status DocumentStore::ResetDocumentKeyMapper() {
libtextclassifier3::Status DocumentStore::ResetDocumentIdMapper() {
// TODO(b/139734457): Replace ptr.reset()->Delete->Create flow with Reset().
document_id_mapper_.reset();
- // TODO(b/144458732): Implement a more robust version of TC_RETURN_IF_ERROR
+ // TODO(b/216487496): Implement a more robust version of TC_RETURN_IF_ERROR
// that can support error logging.
libtextclassifier3::Status status = FileBackedVector<int64_t>::Delete(
*filesystem_, MakeDocumentIdMapperFilename(base_dir_));
@@ -565,7 +593,7 @@ libtextclassifier3::Status DocumentStore::ResetDocumentIdMapper() {
<< "Failed to delete old document_id mapper";
return status;
}
- // TODO(b/144458732): Implement a more robust version of TC_ASSIGN_OR_RETURN
+ // TODO(b/216487496): Implement a more robust version of TC_ASSIGN_OR_RETURN
// that can support error logging.
auto document_id_mapper_or = FileBackedVector<int64_t>::Create(
*filesystem_, MakeDocumentIdMapperFilename(base_dir_),
@@ -618,7 +646,7 @@ libtextclassifier3::Status DocumentStore::ResetFilterCache() {
libtextclassifier3::Status DocumentStore::ResetNamespaceMapper() {
// TODO(b/139734457): Replace ptr.reset()->Delete->Create flow with Reset().
namespace_mapper_.reset();
- // TODO(b/144458732): Implement a more robust version of TC_RETURN_IF_ERROR
+ // TODO(b/216487496): Implement a more robust version of TC_RETURN_IF_ERROR
// that can support error logging.
libtextclassifier3::Status status = KeyMapper<NamespaceId>::Delete(
*filesystem_, MakeNamespaceMapperFilename(base_dir_));
@@ -638,7 +666,7 @@ libtextclassifier3::Status DocumentStore::ResetNamespaceMapper() {
libtextclassifier3::Status DocumentStore::ResetCorpusMapper() {
// TODO(b/139734457): Replace ptr.reset()->Delete->Create flow with Reset().
corpus_mapper_.reset();
- // TODO(b/144458732): Implement a more robust version of TC_RETURN_IF_ERROR
+ // TODO(b/216487496): Implement a more robust version of TC_RETURN_IF_ERROR
// that can support error logging.
libtextclassifier3::Status status = KeyMapper<CorpusId>::Delete(
*filesystem_, MakeCorpusMapperFilename(base_dir_));
@@ -1749,5 +1777,63 @@ libtextclassifier3::Status DocumentStore::SetUsageScores(
return usage_store_->SetUsageScores(document_id, usage_scores);
}
+libtextclassifier3::StatusOr<
+ google::protobuf::RepeatedPtrField<DocumentDebugInfoProto::CorpusInfo>>
+DocumentStore::CollectCorpusInfo() const {
+ google::protobuf::RepeatedPtrField<DocumentDebugInfoProto::CorpusInfo>
+ corpus_info;
+ libtextclassifier3::StatusOr<const SchemaProto*> schema_proto_or =
+ schema_store_->GetSchema();
+ if (!schema_proto_or.ok()) {
+ return corpus_info;
+ }
+ // Maps from CorpusId to the corresponding protocol buffer in the result.
+ std::unordered_map<CorpusId, DocumentDebugInfoProto::CorpusInfo*> info_map;
+ std::unordered_map<NamespaceId, std::string> namespace_id_to_namespace =
+ namespace_mapper_->GetValuesToKeys();
+ const SchemaProto* schema_proto = schema_proto_or.ValueOrDie();
+ for (DocumentId document_id = 0; document_id < filter_cache_->num_elements();
+ ++document_id) {
+ if (!InternalDoesDocumentExist(document_id)) {
+ continue;
+ }
+ ICING_ASSIGN_OR_RETURN(const DocumentFilterData* filter_data,
+ filter_cache_->Get(document_id));
+ ICING_ASSIGN_OR_RETURN(const DocumentAssociatedScoreData* score_data,
+ score_cache_->Get(document_id));
+ const std::string& name_space =
+ namespace_id_to_namespace[filter_data->namespace_id()];
+ const std::string& schema =
+ schema_proto->types()[filter_data->schema_type_id()].schema_type();
+ auto iter = info_map.find(score_data->corpus_id());
+ if (iter == info_map.end()) {
+ DocumentDebugInfoProto::CorpusInfo* entry = corpus_info.Add();
+ entry->set_namespace_(name_space);
+ entry->set_schema(schema);
+ iter = info_map.insert({score_data->corpus_id(), entry}).first;
+ }
+ iter->second->set_total_documents(iter->second->total_documents() + 1);
+ iter->second->set_total_token(iter->second->total_token() +
+ score_data->length_in_tokens());
+ }
+ return corpus_info;
+}
+
+libtextclassifier3::StatusOr<DocumentDebugInfoProto>
+DocumentStore::GetDebugInfo(int verbosity) const {
+ DocumentDebugInfoProto debug_info;
+ *debug_info.mutable_document_storage_info() = GetStorageInfo();
+ ICING_ASSIGN_OR_RETURN(Crc32 crc, ComputeChecksum());
+ debug_info.set_crc(crc.Get());
+ if (verbosity > 0) {
+ ICING_ASSIGN_OR_RETURN(google::protobuf::RepeatedPtrField<
+ DocumentDebugInfoProto::CorpusInfo>
+ corpus_info,
+ CollectCorpusInfo());
+ *debug_info.mutable_corpus_info() = std::move(corpus_info);
+ }
+ return debug_info;
+}
+
} // namespace lib
} // namespace icing
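A hypothetical caller-side sketch of the new DocumentStore::GetDebugInfo() (illustrative only; assumes an initialized document_store). Verbosity 0 returns just the storage info and the crc; verbosity 1 additionally returns the per-corpus counts:

    libtextclassifier3::StatusOr<DocumentDebugInfoProto> info_or =
        document_store->GetDebugInfo(/*verbosity=*/1);
    if (info_or.ok()) {
      for (const DocumentDebugInfoProto::CorpusInfo& corpus :
           info_or.ValueOrDie().corpus_info()) {
        // One entry per live (namespace, schema type) pair.
        ICING_LOG(INFO) << corpus.namespace_() << "/" << corpus.schema()
                        << ": documents=" << corpus.total_documents()
                        << ", tokens=" << corpus.total_token();
      }
    }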
diff --git a/icing/store/document-store.h b/icing/store/document-store.h
index c85c989..e6d2e5c 100644
--- a/icing/store/document-store.h
+++ b/icing/store/document-store.h
@@ -27,6 +27,7 @@
#include "icing/file/file-backed-vector.h"
#include "icing/file/filesystem.h"
#include "icing/file/portable-file-backed-proto-log.h"
+#include "icing/proto/debug.pb.h"
#include "icing/proto/document.pb.h"
#include "icing/proto/document_wrapper.pb.h"
#include "icing/proto/logging.pb.h"
@@ -422,6 +423,17 @@ class DocumentStore {
// INTERNAL_ERROR on compute error
libtextclassifier3::StatusOr<Crc32> ComputeChecksum() const;
+ // Get debug information for the document store.
+  // verbosity <= 0: returns the simplest debug information.
+  // verbosity > 0: also returns the total number of documents and tokens in
+  // each (namespace, schema type) pair.
+ //
+ // Returns:
+ // DocumentDebugInfoProto on success
+ // INTERNAL_ERROR on IO errors, crc compute error
+ libtextclassifier3::StatusOr<DocumentDebugInfoProto> GetDebugInfo(
+ int verbosity) const;
+
private:
// Use DocumentStore::Create() to instantiate.
DocumentStore(const Filesystem* filesystem, std::string_view base_dir,
@@ -696,6 +708,13 @@ class DocumentStore {
// the document_id_mapper somehow became larger than the filter cache.
DocumentStorageInfoProto CalculateDocumentStatusCounts(
DocumentStorageInfoProto storage_info) const;
+
+ // Returns:
+ // - on success, a RepeatedPtrField for CorpusInfo collected.
+ // - OUT_OF_RANGE, this should never happen.
+ libtextclassifier3::StatusOr<google::protobuf::RepeatedPtrField<
+ DocumentDebugInfoProto::CorpusInfo>>
+ CollectCorpusInfo() const;
};
} // namespace lib
diff --git a/icing/store/document-store_benchmark.cc b/icing/store/document-store_benchmark.cc
index 77da928..fc3fd9d 100644
--- a/icing/store/document-store_benchmark.cc
+++ b/icing/store/document-store_benchmark.cc
@@ -64,13 +64,13 @@ namespace lib {
namespace {
-constexpr PropertyConfigProto_Cardinality_Code CARDINALITY_OPTIONAL =
- PropertyConfigProto_Cardinality_Code_OPTIONAL;
+constexpr PropertyConfigProto::Cardinality::Code CARDINALITY_OPTIONAL =
+ PropertyConfigProto::Cardinality::OPTIONAL;
-constexpr StringIndexingConfig_TokenizerType_Code TOKENIZER_PLAIN =
- StringIndexingConfig_TokenizerType_Code_PLAIN;
+constexpr StringIndexingConfig::TokenizerType::Code TOKENIZER_PLAIN =
+ StringIndexingConfig::TokenizerType::PLAIN;
-constexpr TermMatchType_Code MATCH_EXACT = TermMatchType_Code_EXACT_ONLY;
+constexpr TermMatchType::Code MATCH_EXACT = TermMatchType::EXACT_ONLY;
class DestructibleDirectory {
public:
diff --git a/icing/store/document-store_test.cc b/icing/store/document-store_test.cc
index a506eea..a30b4e4 100644
--- a/icing/store/document-store_test.cc
+++ b/icing/store/document-store_test.cc
@@ -29,7 +29,6 @@
#include "icing/file/filesystem.h"
#include "icing/file/memory-mapped-file.h"
#include "icing/file/mock-filesystem.h"
-#include "icing/helpers/icu/icu-data-file-helper.h"
#include "icing/portable/equals-proto.h"
#include "icing/portable/platform.h"
#include "icing/proto/document.pb.h"
@@ -45,6 +44,7 @@
#include "icing/store/namespace-id.h"
#include "icing/testing/common-matchers.h"
#include "icing/testing/fake-clock.h"
+#include "icing/testing/icu-data-file-helper.h"
#include "icing/testing/test-data.h"
#include "icing/testing/tmp-directory.h"
#include "icing/tokenization/language-segmenter-factory.h"
@@ -85,16 +85,16 @@ const NamespaceStorageInfoProto& GetNamespaceStorageInfo(
return std::move(NamespaceStorageInfoProto());
}
-constexpr PropertyConfigProto_Cardinality_Code CARDINALITY_OPTIONAL =
- PropertyConfigProto_Cardinality_Code_OPTIONAL;
+constexpr PropertyConfigProto::Cardinality::Code CARDINALITY_OPTIONAL =
+ PropertyConfigProto::Cardinality::OPTIONAL;
-constexpr StringIndexingConfig_TokenizerType_Code TOKENIZER_PLAIN =
- StringIndexingConfig_TokenizerType_Code_PLAIN;
+constexpr StringIndexingConfig::TokenizerType::Code TOKENIZER_PLAIN =
+ StringIndexingConfig::TokenizerType::PLAIN;
-constexpr TermMatchType_Code MATCH_EXACT = TermMatchType_Code_EXACT_ONLY;
+constexpr TermMatchType::Code MATCH_EXACT = TermMatchType::EXACT_ONLY;
-constexpr PropertyConfigProto_DataType_Code TYPE_INT =
- PropertyConfigProto_DataType_Code_INT64;
+constexpr PropertyConfigProto::DataType::Code TYPE_INT =
+ PropertyConfigProto::DataType::INT64;
UsageReport CreateUsageReport(std::string name_space, std::string uri,
int64 timestamp_ms,
@@ -3170,15 +3170,6 @@ TEST_F(DocumentStoreTest, DetectCompleteDataLoss) {
ASSERT_THAT(create_result.data_loss, Eq(DataLoss::COMPLETE));
}
-// TODO(b/185845269) Re-enable this test by copying over a full valid set of
-// document store files. Right now this test only includes the score_cache and
-// the document store header.
-//
-// This causes a problem now because this cl changes behavior to not consider an
-// InitializeExistingDerivedFiles failure to be a recovery if there is nothing
-// to recover because the doocument store is empty.
-#define DISABLE_BACKWARDS_COMPAT_TEST
-#ifndef DISABLE_BACKWARDS_COMPAT_TEST
TEST_F(DocumentStoreTest, LoadScoreCacheAndInitializeSuccessfully) {
// The directory testdata/score_cache_without_length_in_tokens/document_store
// contains only the scoring_cache and the document_store_header (holding the
@@ -3194,29 +3185,26 @@ TEST_F(DocumentStoreTest, LoadScoreCacheAndInitializeSuccessfully) {
// Get src files
std::string document_store_without_length_in_tokens;
- if (IsAndroidPlatform() || IsIosPlatform()) {
+ if (IsAndroidArm() || IsIosPlatform()) {
document_store_without_length_in_tokens = GetTestFilePath(
"icing/testdata/score_cache_without_length_in_tokens/"
"document_store_android_ios_compatible");
+ } else if (IsAndroidX86()) {
+ document_store_without_length_in_tokens = GetTestFilePath(
+ "icing/testdata/score_cache_without_length_in_tokens/"
+ "document_store_android_x86");
} else {
document_store_without_length_in_tokens = GetTestFilePath(
"icing/testdata/score_cache_without_length_in_tokens/"
"document_store");
}
- std::vector<std::string> document_store_files;
Filesystem filesystem;
- filesystem.ListDirectory(document_store_without_length_in_tokens.c_str(),
- &document_store_files);
-
- ICING_LOG(INFO) << "Copying files " << document_store_without_length_in_tokens
- << ' ' << document_store_files.size();
- for (size_t i = 0; i != document_store_files.size(); i++) {
- std::string src = absl_ports::StrCat(
- document_store_without_length_in_tokens, "/", document_store_files[i]);
- std::string dst =
- absl_ports::StrCat(document_store_dir_, "/", document_store_files[i]);
- ASSERT_THAT(filesystem_.CopyFile(src.c_str(), dst.c_str()), true);
- }
+ ICING_LOG(INFO) << "Copying files "
+ << document_store_without_length_in_tokens;
+ ASSERT_THAT(
+ filesystem.CopyDirectory(document_store_without_length_in_tokens.c_str(),
+ document_store_dir_.c_str(), /*recursive=*/true),
+ true);
InitializeStatsProto initialize_stats;
ICING_ASSERT_OK_AND_ASSIGN(
@@ -3227,12 +3215,11 @@ TEST_F(DocumentStoreTest, LoadScoreCacheAndInitializeSuccessfully) {
&initialize_stats));
std::unique_ptr<DocumentStore> doc_store =
std::move(create_result.document_store);
- // The store_cache trigger regeneration because its element size is
- // inconsistent: expected 20 (current new size), actual 12 (as per the v0
- // score_cache).
- EXPECT_TRUE(initialize_stats.has_document_store_recovery_cause());
+  // The document log is using the legacy v0 format, so a migration is needed,
+  // which will also trigger regeneration.
+ EXPECT_EQ(initialize_stats.document_store_recovery_cause(),
+ InitializeStatsProto::LEGACY_DOCUMENT_LOG_FORMAT);
}
-#endif // DISABLE_BACKWARDS_COMPAT_TEST
TEST_F(DocumentStoreTest, DocumentStoreStorageInfo) {
ICING_ASSERT_OK_AND_ASSIGN(
@@ -3422,18 +3409,22 @@ TEST_F(DocumentStoreTest, InitializeForceRecoveryUpdatesTypeIds) {
{
// Create the document store the second time and force recovery
+ InitializeStatsProto initialize_stats;
ICING_ASSERT_OK_AND_ASSIGN(
DocumentStore::CreateResult create_result,
- DocumentStore::Create(
- &filesystem_, document_store_dir_, &fake_clock_, schema_store.get(),
- /*force_recovery_and_revalidate_documents=*/true));
+ DocumentStore::Create(&filesystem_, document_store_dir_, &fake_clock_,
+ schema_store.get(),
+ /*force_recovery_and_revalidate_documents=*/true,
+ &initialize_stats));
std::unique_ptr<DocumentStore> doc_store =
std::move(create_result.document_store);
// Ensure that the type id of the email document has been correctly updated.
ICING_ASSERT_OK_AND_ASSIGN(DocumentFilterData filter_data,
doc_store->GetDocumentFilterData(docid));
- ASSERT_THAT(filter_data.schema_type_id(), Eq(1));
+ EXPECT_THAT(filter_data.schema_type_id(), Eq(1));
+ EXPECT_THAT(initialize_stats.document_store_recovery_cause(),
+ Eq(InitializeStatsProto::SCHEMA_CHANGES_OUT_OF_SYNC));
}
}
@@ -3841,7 +3832,8 @@ TEST_F(DocumentStoreTest, MigrateToPortableFileBackedProtoLog) {
// Check that we didn't lose anything. A migration also doesn't technically
// count as a recovery.
EXPECT_THAT(create_result.data_loss, Eq(DataLoss::NONE));
- EXPECT_FALSE(initialize_stats.has_document_store_recovery_cause());
+ EXPECT_EQ(initialize_stats.document_store_recovery_cause(),
+ InitializeStatsProto::LEGACY_DOCUMENT_LOG_FORMAT);
// Document 1 and 3 were put normally, and document 2 was deleted in our
// testdata files.
@@ -3864,6 +3856,164 @@ TEST_F(DocumentStoreTest, MigrateToPortableFileBackedProtoLog) {
}
#endif // DISABLE_BACKWARDS_COMPAT_TEST
+TEST_F(DocumentStoreTest, GetDebugInfo) {
+ SchemaProto schema =
+ SchemaBuilder()
+ .AddType(SchemaTypeConfigBuilder()
+ .SetType("email")
+ .AddProperty(
+ PropertyConfigBuilder()
+ .SetName("subject")
+ .SetDataTypeString(MATCH_EXACT, TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_OPTIONAL))
+ .AddProperty(
+ PropertyConfigBuilder()
+ .SetName("body")
+ .SetDataTypeString(MATCH_EXACT, TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_OPTIONAL)))
+ .AddType(SchemaTypeConfigBuilder().SetType("person").AddProperty(
+ PropertyConfigBuilder()
+ .SetName("name")
+ .SetDataTypeString(MATCH_EXACT, TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_OPTIONAL)))
+ .Build();
+ std::string schema_store_dir = schema_store_dir_ + "_custom";
+ filesystem_.DeleteDirectoryRecursively(schema_store_dir.c_str());
+ filesystem_.CreateDirectoryRecursively(schema_store_dir.c_str());
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<SchemaStore> schema_store,
+ SchemaStore::Create(&filesystem_, schema_store_dir, &fake_clock_));
+
+ ICING_ASSERT_OK(schema_store->SetSchema(schema));
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ DocumentStore::CreateResult create_result,
+ DocumentStore::Create(&filesystem_, document_store_dir_, &fake_clock_,
+ schema_store.get()));
+ std::unique_ptr<DocumentStore> document_store =
+ std::move(create_result.document_store);
+
+ DocumentProto document1 = DocumentBuilder()
+ .SetKey("namespace1", "email/1")
+ .SetSchema("email")
+ .AddStringProperty("subject", "aa bb cc")
+ .AddStringProperty("body", "dd ee")
+ .SetCreationTimestampMs(1)
+ .Build();
+ ICING_ASSERT_OK(document_store->Put(document1, 5));
+
+ DocumentProto document2 = DocumentBuilder()
+ .SetKey("namespace2", "email/2")
+ .SetSchema("email")
+ .AddStringProperty("subject", "aa bb")
+ .AddStringProperty("body", "cc")
+ .SetCreationTimestampMs(1)
+ .Build();
+ ICING_ASSERT_OK(document_store->Put(document2, 3));
+
+ DocumentProto document3 = DocumentBuilder()
+ .SetKey("namespace2", "email/3")
+ .SetSchema("email")
+ .AddStringProperty("subject", "aa")
+ .AddStringProperty("body", "")
+ .SetCreationTimestampMs(1)
+ .Build();
+ ICING_ASSERT_OK(document_store->Put(document3, 1));
+
+ DocumentProto document4 = DocumentBuilder()
+ .SetKey("namespace1", "person/1")
+ .SetSchema("person")
+ .AddStringProperty("name", "test test")
+ .SetCreationTimestampMs(1)
+ .Build();
+ ICING_ASSERT_OK(document_store->Put(document4, 2));
+
+ ICING_ASSERT_OK_AND_ASSIGN(DocumentDebugInfoProto out1,
+ document_store->GetDebugInfo(/*verbosity=*/1));
+ EXPECT_THAT(out1.crc(), Gt(0));
+ EXPECT_THAT(out1.document_storage_info().num_alive_documents(), Eq(4));
+ EXPECT_THAT(out1.document_storage_info().num_deleted_documents(), Eq(0));
+ EXPECT_THAT(out1.document_storage_info().num_expired_documents(), Eq(0));
+
+ DocumentDebugInfoProto::CorpusInfo info1, info2, info3;
+ info1.set_namespace_("namespace1");
+ info1.set_schema("email");
+ info1.set_total_documents(1); // document1
+ info1.set_total_token(5);
+
+ info2.set_namespace_("namespace2");
+ info2.set_schema("email");
+ info2.set_total_documents(2); // document2 and document3
+ info2.set_total_token(4); // 3 + 1
+
+ info3.set_namespace_("namespace1");
+ info3.set_schema("person");
+ info3.set_total_documents(1); // document4
+ info3.set_total_token(2);
+
+ EXPECT_THAT(out1.corpus_info(),
+ UnorderedElementsAre(EqualsProto(info1), EqualsProto(info2),
+ EqualsProto(info3)));
+
+ // Delete document3.
+ ICING_ASSERT_OK(document_store->Delete("namespace2", "email/3"));
+ ICING_ASSERT_OK_AND_ASSIGN(DocumentDebugInfoProto out2,
+ document_store->GetDebugInfo(/*verbosity=*/1));
+ EXPECT_THAT(out2.crc(), Gt(0));
+ EXPECT_THAT(out2.crc(), Not(Eq(out1.crc())));
+ EXPECT_THAT(out2.document_storage_info().num_alive_documents(), Eq(3));
+ EXPECT_THAT(out2.document_storage_info().num_deleted_documents(), Eq(1));
+ EXPECT_THAT(out2.document_storage_info().num_expired_documents(), Eq(0));
+ info2.set_total_documents(1); // document2
+ info2.set_total_token(3);
+ EXPECT_THAT(out2.corpus_info(),
+ UnorderedElementsAre(EqualsProto(info1), EqualsProto(info2),
+ EqualsProto(info3)));
+
+ ICING_ASSERT_OK_AND_ASSIGN(DocumentDebugInfoProto out3,
+ document_store->GetDebugInfo(/*verbosity=*/0));
+ EXPECT_THAT(out3.corpus_info(), IsEmpty());
+}
+
+TEST_F(DocumentStoreTest, GetDebugInfoWithoutSchema) {
+ std::string schema_store_dir = schema_store_dir_ + "_custom";
+ filesystem_.DeleteDirectoryRecursively(schema_store_dir.c_str());
+ filesystem_.CreateDirectoryRecursively(schema_store_dir.c_str());
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<SchemaStore> schema_store,
+ SchemaStore::Create(&filesystem_, schema_store_dir, &fake_clock_));
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ DocumentStore::CreateResult create_result,
+ DocumentStore::Create(&filesystem_, document_store_dir_, &fake_clock_,
+ schema_store.get()));
+ std::unique_ptr<DocumentStore> document_store =
+ std::move(create_result.document_store);
+ ICING_ASSERT_OK_AND_ASSIGN(DocumentDebugInfoProto out,
+ document_store->GetDebugInfo(/*verbosity=*/1));
+ EXPECT_THAT(out.crc(), Gt(0));
+ EXPECT_THAT(out.document_storage_info().num_alive_documents(), Eq(0));
+ EXPECT_THAT(out.document_storage_info().num_deleted_documents(), Eq(0));
+ EXPECT_THAT(out.document_storage_info().num_expired_documents(), Eq(0));
+ EXPECT_THAT(out.corpus_info(), IsEmpty());
+}
+
+TEST_F(DocumentStoreTest, GetDebugInfoForEmptyDocumentStore) {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ DocumentStore::CreateResult create_result,
+ DocumentStore::Create(&filesystem_, document_store_dir_, &fake_clock_,
+ schema_store_.get()));
+ std::unique_ptr<DocumentStore> document_store =
+ std::move(create_result.document_store);
+ ICING_ASSERT_OK_AND_ASSIGN(DocumentDebugInfoProto out,
+ document_store->GetDebugInfo(/*verbosity=*/1));
+ EXPECT_THAT(out.crc(), Gt(0));
+ EXPECT_THAT(out.document_storage_info().num_alive_documents(), Eq(0));
+ EXPECT_THAT(out.document_storage_info().num_deleted_documents(), Eq(0));
+ EXPECT_THAT(out.document_storage_info().num_expired_documents(), Eq(0));
+ EXPECT_THAT(out.corpus_info(), IsEmpty());
+}
+
} // namespace
} // namespace lib
diff --git a/icing/store/namespace-checker-impl.h b/icing/store/namespace-checker-impl.h
new file mode 100644
index 0000000..bcd0643
--- /dev/null
+++ b/icing/store/namespace-checker-impl.h
@@ -0,0 +1,51 @@
+// Copyright (C) 2021 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef ICING_STORE_NAMESPACE_CHECKER_IMPL_H_
+#define ICING_STORE_NAMESPACE_CHECKER_IMPL_H_
+
+#include <unordered_set>
+#include <utility>
+
+#include "icing/store/document-id.h"
+#include "icing/store/document-store.h"
+#include "icing/store/namespace-checker.h"
+#include "icing/store/namespace-id.h"
+
+namespace icing {
+namespace lib {
+
+class NamespaceCheckerImpl : public NamespaceChecker {
+ public:
+ explicit NamespaceCheckerImpl(
+ const DocumentStore* document_store,
+ std::unordered_set<NamespaceId> target_namespace_ids)
+ : document_store_(*document_store),
+ target_namespace_ids_(std::move(target_namespace_ids)) {}
+
+ bool BelongsToTargetNamespaces(DocumentId document_id) const override {
+ if (target_namespace_ids_.empty()) {
+ return true;
+ }
+    auto document_filter_data_or =
+        document_store_.GetDocumentFilterData(document_id);
+    return document_filter_data_or.ok() &&
+           target_namespace_ids_.count(
+               document_filter_data_or.ValueOrDie().namespace_id()) > 0;
+  }
+
+ private:
+  const DocumentStore& document_store_;
+  std::unordered_set<NamespaceId> target_namespace_ids_;
+};
+
+} // namespace lib
+} // namespace icing
+
+#endif // ICING_STORE_NAMESPACE_CHECKER_IMPL_H_ \ No newline at end of file
diff --git a/icing/store/namespace-checker.h b/icing/store/namespace-checker.h
new file mode 100644
index 0000000..8812ab1
--- /dev/null
+++ b/icing/store/namespace-checker.h
@@ -0,0 +1,42 @@
+// Copyright (C) 2021 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef ICING_STORE_NAMESPACE_CHECKER_H_
+#define ICING_STORE_NAMESPACE_CHECKER_H_
+
+#include "icing/store/document-id.h"
+
+namespace icing {
+namespace lib {
+
+class NamespaceChecker {
+ public:
+ virtual ~NamespaceChecker() = default;
+
+  // Checks whether the given document id belongs to the target namespaces.
+  // Returns:
+  //   - true: the given document id belongs to the target namespaces
+  //   - false: the given document id doesn't belong to the target namespaces,
+  //     or its filter data could not be retrieved (e.g. the id is out of
+  //     range or the document was not found)
+ virtual bool BelongsToTargetNamespaces(DocumentId document_id) const = 0;
+};
+
+} // namespace lib
+} // namespace icing
+
+#endif // ICING_STORE_NAMESPACE_CHECKER_H_
diff --git a/icing/testing/always-true-namespace-checker-impl.h b/icing/testing/always-true-namespace-checker-impl.h
new file mode 100644
index 0000000..f7744b6
--- /dev/null
+++ b/icing/testing/always-true-namespace-checker-impl.h
@@ -0,0 +1,34 @@
+// Copyright (C) 2021 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef ICING_TESTING_ALWAYS_TRUE_NAMESPACE_CHECKER_IMPL_H_
+#define ICING_TESTING_ALWAYS_TRUE_NAMESPACE_CHECKER_IMPL_H_
+
+#include "icing/store/document-id.h"
+#include "icing/store/namespace-checker.h"
+
+namespace icing {
+namespace lib {
+
+class AlwaysTrueNamespaceCheckerImpl : public NamespaceChecker {
+ public:
+ bool BelongsToTargetNamespaces(DocumentId document_id) const override {
+ return true;
+ }
+};
+
+} // namespace lib
+} // namespace icing
+
+#endif // ICING_TESTING_ALWAYS_TRUE_NAMESPACE_CHECKER_IMPL_H_
\ No newline at end of file
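
In tests, this stub lets code paths that require a NamespaceChecker run with
namespace filtering effectively disabled; a minimal sketch inside a gtest body:

// Sketch: every document id passes the check unconditionally.
AlwaysTrueNamespaceCheckerImpl always_true_checker;
EXPECT_TRUE(always_true_checker.BelongsToTargetNamespaces(/*document_id=*/42));
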
diff --git a/icing/helpers/icu/icu-data-file-helper.cc b/icing/testing/icu-data-file-helper.cc
index 6607c40..aaeb738 100644
--- a/icing/helpers/icu/icu-data-file-helper.cc
+++ b/icing/testing/icu-data-file-helper.cc
@@ -12,7 +12,7 @@
// See the License for the specific language governing permissions and
// limitations under the License.
-#include "icing/helpers/icu/icu-data-file-helper.h"
+#include "icing/testing/icu-data-file-helper.h"
#include <sys/mman.h>
diff --git a/icing/helpers/icu/icu-data-file-helper.h b/icing/testing/icu-data-file-helper.h
index 90f5bc7..d0276e7 100644
--- a/icing/helpers/icu/icu-data-file-helper.h
+++ b/icing/testing/icu-data-file-helper.h
@@ -12,8 +12,8 @@
// See the License for the specific language governing permissions and
// limitations under the License.
-#ifndef ICING_HELPERS_ICU_ICU_DATA_FILE_HELPER
-#define ICING_HELPERS_ICU_ICU_DATA_FILE_HELPER
+#ifndef ICING_TESTING_ICU_DATA_FILE_HELPER
+#define ICING_TESTING_ICU_DATA_FILE_HELPER
#include "icing/text_classifier/lib3/utils/base/status.h"
@@ -40,4 +40,4 @@ libtextclassifier3::Status SetUpICUDataFile(
} // namespace lib
} // namespace icing
-#endif // ICING_HELPERS_ICU_ICU_DATA_FILE_HELPER
+#endif // ICING_TESTING_ICU_DATA_FILE_HELPER
diff --git a/icing/testing/random-string.h b/icing/testing/random-string.h
index 3165bf6..fd8d87b 100644
--- a/icing/testing/random-string.h
+++ b/icing/testing/random-string.h
@@ -15,6 +15,7 @@
#ifndef ICING_TESTING_RANDOM_STRING_H_
#define ICING_TESTING_RANDOM_STRING_H_
+#include <algorithm>
#include <random>
#include <string>
diff --git a/icing/tokenization/icu/icu-language-segmenter.cc b/icing/tokenization/icu/icu-language-segmenter.cc
index 598ede7..8e0f789 100644
--- a/icing/tokenization/icu/icu-language-segmenter.cc
+++ b/icing/tokenization/icu/icu-language-segmenter.cc
@@ -59,34 +59,35 @@ class IcuLanguageSegmenterIterator : public LanguageSegmenter::Iterator {
~IcuLanguageSegmenterIterator() {
ubrk_close(break_iterator_);
- utext_close(&u_text_);
+ utext_close(u_text_);
}
// Advances to the next term. Returns false if it has reached the end.
bool Advance() override {
- // Prerequisite check
- if (term_end_index_exclusive_ == UBRK_DONE) {
- return false;
- }
+ while (true) {
+ // Prerequisite check
+ if (term_end_index_exclusive_ == UBRK_DONE) {
+ return false;
+ }
- if (term_end_index_exclusive_ == 0) {
- // First Advance() call
- term_start_index_ = ubrk_first(break_iterator_);
- } else {
- term_start_index_ = term_end_index_exclusive_;
- }
- term_end_index_exclusive_ = ubrk_next(break_iterator_);
+ if (term_end_index_exclusive_ == 0) {
+ // First Advance() call
+ term_start_index_ = ubrk_first(break_iterator_);
+ } else {
+ term_start_index_ = term_end_index_exclusive_;
+ }
+ term_end_index_exclusive_ = ubrk_next(break_iterator_);
- // Reached the end
- if (term_end_index_exclusive_ == UBRK_DONE) {
- MarkAsDone();
- return false;
- }
+ // Reached the end
+ if (term_end_index_exclusive_ == UBRK_DONE) {
+ MarkAsDone();
+ return false;
+ }
- if (!IsValidSegment()) {
- return Advance();
+ if (IsValidSegment()) {
+ return true;
+ }
}
- return true;
}
// Returns the current term. It can be called only when Advance() returns
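
The loop above performs the same traversal as the old recursive version, skipping
invalid segments until a valid one is found or the input is exhausted, but in
constant stack space; the recursive form grew the stack once per skipped segment.
Reduced to its shape, with hypothetical MoveNext()/IsValid() helpers:

// Shape of the rewrite: iterate instead of self-recursing on invalid segments.
bool Advance() {
  while (true) {
    if (!MoveNext()) {  // hypothetical: step the underlying break iterator
      return false;     // input exhausted
    }
    if (IsValid()) {    // hypothetical: e.g. segment is not pure whitespace
      return true;
    }
    // Invalid segment: loop again. The old code called Advance() here.
  }
}
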
@@ -253,7 +254,7 @@ class IcuLanguageSegmenterIterator : public LanguageSegmenter::Iterator {
: break_iterator_(nullptr),
text_(text),
locale_(locale),
- u_text_(UTEXT_INITIALIZER),
+ u_text_(nullptr),
offset_iterator_(text),
term_start_index_(0),
term_end_index_exclusive_(0) {}
@@ -261,10 +262,13 @@ class IcuLanguageSegmenterIterator : public LanguageSegmenter::Iterator {
// Returns true on success
bool Initialize() {
UErrorCode status = U_ZERO_ERROR;
- utext_openUTF8(&u_text_, text_.data(), text_.length(), &status);
+ u_text_ = utext_openUTF8(nullptr, text_.data(), text_.length(), &status);
+ if (u_text_ == nullptr) {
+ return false;
+ }
break_iterator_ = ubrk_open(UBRK_WORD, locale_.data(), /*text=*/nullptr,
/*textLength=*/0, &status);
- ubrk_setUText(break_iterator_, &u_text_, &status);
+ ubrk_setUText(break_iterator_, u_text_, &status);
return !U_FAILURE(status);
}
@@ -322,8 +326,8 @@ class IcuLanguageSegmenterIterator : public LanguageSegmenter::Iterator {
std::string_view locale_;
// A thin wrapper around the input UTF8 text, needed by break_iterator_.
- // utext_close() must be called after using.
- UText u_text_;
+  // Allocated by calling utext_openUTF8() and freed by calling utext_close().
+ UText* u_text_;
// Offset iterator. This iterator is not guaranteed to point to any particular
// character, but is guaranteed to point to a valid UTF character sequence.
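
The ownership contract relied on above, sketched in isolation: passing nullptr
as the first argument to utext_openUTF8() asks ICU to heap-allocate the UText,
and utext_close() both closes and frees such an instance. (Error handling
trimmed; `text` stands in for any std::string_view.)

UErrorCode status = U_ZERO_ERROR;
// nullptr => ICU allocates the UText on the heap and returns it.
UText* ut = utext_openUTF8(/*ut=*/nullptr, text.data(), text.length(), &status);
if (ut == nullptr || U_FAILURE(status)) {
  if (ut != nullptr) utext_close(ut);  // close if allocation partially succeeded
  return false;
}
// ... hand ut to ubrk_setUText() and run the break iterator ...
utext_close(ut);  // closes and frees the heap-allocated UText
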
diff --git a/icing/tokenization/icu/icu-language-segmenter_test.cc b/icing/tokenization/icu/icu-language-segmenter_test.cc
index 3090087..fe0b96e 100644
--- a/icing/tokenization/icu/icu-language-segmenter_test.cc
+++ b/icing/tokenization/icu/icu-language-segmenter_test.cc
@@ -21,8 +21,8 @@
#include "gmock/gmock.h"
#include "gtest/gtest.h"
#include "icing/absl_ports/str_cat.h"
-#include "icing/helpers/icu/icu-data-file-helper.h"
#include "icing/testing/common-matchers.h"
+#include "icing/testing/icu-data-file-helper.h"
#include "icing/testing/icu-i18n-test-utils.h"
#include "icing/testing/jni-test-helpers.h"
#include "icing/testing/test-data.h"
diff --git a/icing/tokenization/language-segmenter-iterator_test.cc b/icing/tokenization/language-segmenter-iterator_test.cc
index d293581..3aff45c 100644
--- a/icing/tokenization/language-segmenter-iterator_test.cc
+++ b/icing/tokenization/language-segmenter-iterator_test.cc
@@ -15,9 +15,9 @@
#include "gmock/gmock.h"
#include "gtest/gtest.h"
#include "icing/absl_ports/str_cat.h"
-#include "icing/helpers/icu/icu-data-file-helper.h"
#include "icing/portable/platform.h"
#include "icing/testing/common-matchers.h"
+#include "icing/testing/icu-data-file-helper.h"
#include "icing/testing/jni-test-helpers.h"
#include "icing/testing/test-data.h"
#include "icing/tokenization/language-segmenter-factory.h"
diff --git a/icing/tokenization/language-segmenter_benchmark.cc b/icing/tokenization/language-segmenter_benchmark.cc
index bd86169..6f7d4df 100644
--- a/icing/tokenization/language-segmenter_benchmark.cc
+++ b/icing/tokenization/language-segmenter_benchmark.cc
@@ -14,8 +14,8 @@
#include "testing/base/public/benchmark.h"
#include "gmock/gmock.h"
-#include "icing/helpers/icu/icu-data-file-helper.h"
#include "icing/testing/common-matchers.h"
+#include "icing/testing/icu-data-file-helper.h"
#include "icing/testing/test-data.h"
#include "icing/tokenization/language-segmenter-factory.h"
#include "icing/tokenization/language-segmenter.h"
diff --git a/icing/tokenization/plain-tokenizer.cc b/icing/tokenization/plain-tokenizer.cc
index 13fe550..7a1949f 100644
--- a/icing/tokenization/plain-tokenizer.cc
+++ b/icing/tokenization/plain-tokenizer.cc
@@ -66,9 +66,9 @@ class PlainTokenIterator : public Tokenizer::Iterator {
Token GetToken() const override {
if (current_term_.empty()) {
- return Token(Token::INVALID);
+ return Token(Token::Type::INVALID);
}
- return Token(Token::REGULAR, current_term_);
+ return Token(Token::Type::REGULAR, current_term_);
}
libtextclassifier3::StatusOr<CharacterIterator> CalculateTokenStart()
@@ -81,8 +81,8 @@ class PlainTokenIterator : public Tokenizer::Iterator {
return base_iterator_->CalculateTermEndExclusive();
}
- bool ResetToTokenAfter(int32_t offset) override {
- if (!base_iterator_->ResetToTermStartingAfterUtf32(offset).ok()) {
+ bool ResetToTokenStartingAfter(int32_t utf32_offset) override {
+ if (!base_iterator_->ResetToTermStartingAfterUtf32(utf32_offset).ok()) {
return false;
}
current_term_ = base_iterator_->GetTerm();
@@ -93,15 +93,17 @@ class PlainTokenIterator : public Tokenizer::Iterator {
return true;
}
- bool ResetToTokenBefore(int32_t offset) override {
+ bool ResetToTokenEndingBefore(int32_t utf32_offset) override {
ICING_ASSIGN_OR_RETURN(
- offset, base_iterator_->ResetToTermEndingBeforeUtf32(offset), false);
+ utf32_offset,
+ base_iterator_->ResetToTermEndingBeforeUtf32(utf32_offset), false);
current_term_ = base_iterator_->GetTerm();
while (!IsValidTerm(current_term_)) {
// Haven't found a valid term yet. Retrieve the term prior to this one
// from the segmenter.
ICING_ASSIGN_OR_RETURN(
- offset, base_iterator_->ResetToTermEndingBeforeUtf32(offset), false);
+ utf32_offset,
+ base_iterator_->ResetToTermEndingBeforeUtf32(utf32_offset), false);
current_term_ = base_iterator_->GetTerm();
}
return true;
diff --git a/icing/tokenization/plain-tokenizer_test.cc b/icing/tokenization/plain-tokenizer_test.cc
index 7490bfa..c48b51e 100644
--- a/icing/tokenization/plain-tokenizer_test.cc
+++ b/icing/tokenization/plain-tokenizer_test.cc
@@ -18,9 +18,9 @@
#include "gmock/gmock.h"
#include "icing/absl_ports/str_cat.h"
-#include "icing/helpers/icu/icu-data-file-helper.h"
#include "icing/portable/platform.h"
#include "icing/testing/common-matchers.h"
+#include "icing/testing/icu-data-file-helper.h"
#include "icing/testing/icu-i18n-test-utils.h"
#include "icing/testing/jni-test-helpers.h"
#include "icing/testing/test-data.h"
@@ -68,26 +68,27 @@ TEST_F(PlainTokenizerTest, Simple) {
EXPECT_THAT(plain_tokenizer->TokenizeAll(""), IsOkAndHolds(IsEmpty()));
- EXPECT_THAT(plain_tokenizer->TokenizeAll("Hello World"),
- IsOkAndHolds(ElementsAre(EqualsToken(Token::REGULAR, "Hello"),
- EqualsToken(Token::REGULAR, "World"))));
+ EXPECT_THAT(
+ plain_tokenizer->TokenizeAll("Hello World"),
+ IsOkAndHolds(ElementsAre(EqualsToken(Token::Type::REGULAR, "Hello"),
+ EqualsToken(Token::Type::REGULAR, "World"))));
EXPECT_THAT(
plain_tokenizer->TokenizeAll(
"Lorem ipsum dolor sit amet, consectetur adipiscing elit. "
"Duis efficitur iaculis auctor."),
- IsOkAndHolds(ElementsAre(EqualsToken(Token::REGULAR, "Lorem"),
- EqualsToken(Token::REGULAR, "ipsum"),
- EqualsToken(Token::REGULAR, "dolor"),
- EqualsToken(Token::REGULAR, "sit"),
- EqualsToken(Token::REGULAR, "amet"),
- EqualsToken(Token::REGULAR, "consectetur"),
- EqualsToken(Token::REGULAR, "adipiscing"),
- EqualsToken(Token::REGULAR, "elit"),
- EqualsToken(Token::REGULAR, "Duis"),
- EqualsToken(Token::REGULAR, "efficitur"),
- EqualsToken(Token::REGULAR, "iaculis"),
- EqualsToken(Token::REGULAR, "auctor"))));
+ IsOkAndHolds(ElementsAre(EqualsToken(Token::Type::REGULAR, "Lorem"),
+ EqualsToken(Token::Type::REGULAR, "ipsum"),
+ EqualsToken(Token::Type::REGULAR, "dolor"),
+ EqualsToken(Token::Type::REGULAR, "sit"),
+ EqualsToken(Token::Type::REGULAR, "amet"),
+ EqualsToken(Token::Type::REGULAR, "consectetur"),
+ EqualsToken(Token::Type::REGULAR, "adipiscing"),
+ EqualsToken(Token::Type::REGULAR, "elit"),
+ EqualsToken(Token::Type::REGULAR, "Duis"),
+ EqualsToken(Token::Type::REGULAR, "efficitur"),
+ EqualsToken(Token::Type::REGULAR, "iaculis"),
+ EqualsToken(Token::Type::REGULAR, "auctor"))));
}
TEST_F(PlainTokenizerTest, Whitespace) {
@@ -107,16 +108,18 @@ TEST_F(PlainTokenizerTest, Whitespace) {
// 0x0009 is horizontal tab, considered as a whitespace
std::string text_with_horizontal_tab =
absl_ports::StrCat("Hello", UCharToString(0x0009), "World");
- EXPECT_THAT(plain_tokenizer->TokenizeAll(text_with_horizontal_tab),
- IsOkAndHolds(ElementsAre(EqualsToken(Token::REGULAR, "Hello"),
- EqualsToken(Token::REGULAR, "World"))));
+ EXPECT_THAT(
+ plain_tokenizer->TokenizeAll(text_with_horizontal_tab),
+ IsOkAndHolds(ElementsAre(EqualsToken(Token::Type::REGULAR, "Hello"),
+ EqualsToken(Token::Type::REGULAR, "World"))));
// 0x000B is vertical tab, considered as a whitespace
std::string text_with_vertical_tab =
absl_ports::StrCat("Hello", UCharToString(0x000B), "World");
- EXPECT_THAT(plain_tokenizer->TokenizeAll(text_with_vertical_tab),
- IsOkAndHolds(ElementsAre(EqualsToken(Token::REGULAR, "Hello"),
- EqualsToken(Token::REGULAR, "World"))));
+ EXPECT_THAT(
+ plain_tokenizer->TokenizeAll(text_with_vertical_tab),
+ IsOkAndHolds(ElementsAre(EqualsToken(Token::Type::REGULAR, "Hello"),
+ EqualsToken(Token::Type::REGULAR, "World"))));
}
TEST_F(PlainTokenizerTest, Punctuation) {
@@ -131,38 +134,39 @@ TEST_F(PlainTokenizerTest, Punctuation) {
language_segmenter.get()));
// Half-width punctuation marks are filtered out.
- EXPECT_THAT(plain_tokenizer->TokenizeAll(
- "Hello, World! Hello: World. \"Hello\" World?"),
- IsOkAndHolds(ElementsAre(EqualsToken(Token::REGULAR, "Hello"),
- EqualsToken(Token::REGULAR, "World"),
- EqualsToken(Token::REGULAR, "Hello"),
- EqualsToken(Token::REGULAR, "World"),
- EqualsToken(Token::REGULAR, "Hello"),
- EqualsToken(Token::REGULAR, "World"))));
+ EXPECT_THAT(
+ plain_tokenizer->TokenizeAll(
+ "Hello, World! Hello: World. \"Hello\" World?"),
+ IsOkAndHolds(ElementsAre(EqualsToken(Token::Type::REGULAR, "Hello"),
+ EqualsToken(Token::Type::REGULAR, "World"),
+ EqualsToken(Token::Type::REGULAR, "Hello"),
+ EqualsToken(Token::Type::REGULAR, "World"),
+ EqualsToken(Token::Type::REGULAR, "Hello"),
+ EqualsToken(Token::Type::REGULAR, "World"))));
// Full-width punctuation marks are filtered out.
std::vector<std::string_view> exp_tokens;
if (IsCfStringTokenization()) {
EXPECT_THAT(
plain_tokenizer->TokenizeAll("你好,世界!你好:世界。“你好”世界?"),
- IsOkAndHolds(ElementsAre(EqualsToken(Token::REGULAR, "你"),
- EqualsToken(Token::REGULAR, "好"),
- EqualsToken(Token::REGULAR, "世界"),
- EqualsToken(Token::REGULAR, "你"),
- EqualsToken(Token::REGULAR, "好"),
- EqualsToken(Token::REGULAR, "世界"),
- EqualsToken(Token::REGULAR, "你"),
- EqualsToken(Token::REGULAR, "好"),
- EqualsToken(Token::REGULAR, "世界"))));
+ IsOkAndHolds(ElementsAre(EqualsToken(Token::Type::REGULAR, "你"),
+ EqualsToken(Token::Type::REGULAR, "好"),
+ EqualsToken(Token::Type::REGULAR, "世界"),
+ EqualsToken(Token::Type::REGULAR, "你"),
+ EqualsToken(Token::Type::REGULAR, "好"),
+ EqualsToken(Token::Type::REGULAR, "世界"),
+ EqualsToken(Token::Type::REGULAR, "你"),
+ EqualsToken(Token::Type::REGULAR, "好"),
+ EqualsToken(Token::Type::REGULAR, "世界"))));
} else {
EXPECT_THAT(
plain_tokenizer->TokenizeAll("你好,世界!你好:世界。“你好”世界?"),
- IsOkAndHolds(ElementsAre(EqualsToken(Token::REGULAR, "你好"),
- EqualsToken(Token::REGULAR, "世界"),
- EqualsToken(Token::REGULAR, "你好"),
- EqualsToken(Token::REGULAR, "世界"),
- EqualsToken(Token::REGULAR, "你好"),
- EqualsToken(Token::REGULAR, "世界"))));
+ IsOkAndHolds(ElementsAre(EqualsToken(Token::Type::REGULAR, "你好"),
+ EqualsToken(Token::Type::REGULAR, "世界"),
+ EqualsToken(Token::Type::REGULAR, "你好"),
+ EqualsToken(Token::Type::REGULAR, "世界"),
+ EqualsToken(Token::Type::REGULAR, "你好"),
+ EqualsToken(Token::Type::REGULAR, "世界"))));
}
}
@@ -180,14 +184,16 @@ TEST_F(PlainTokenizerTest, SpecialCharacters) {
// Right now we don't have special logic for these characters, just output
// them as tokens.
- EXPECT_THAT(plain_tokenizer->TokenizeAll("1+1"),
- IsOkAndHolds(ElementsAre(EqualsToken(Token::REGULAR, "1"),
- EqualsToken(Token::REGULAR, "+"),
- EqualsToken(Token::REGULAR, "1"))));
+ EXPECT_THAT(
+ plain_tokenizer->TokenizeAll("1+1"),
+ IsOkAndHolds(ElementsAre(EqualsToken(Token::Type::REGULAR, "1"),
+ EqualsToken(Token::Type::REGULAR, "+"),
+ EqualsToken(Token::Type::REGULAR, "1"))));
- EXPECT_THAT(plain_tokenizer->TokenizeAll("$50"),
- IsOkAndHolds(ElementsAre(EqualsToken(Token::REGULAR, "$"),
- EqualsToken(Token::REGULAR, "50"))));
+ EXPECT_THAT(
+ plain_tokenizer->TokenizeAll("$50"),
+ IsOkAndHolds(ElementsAre(EqualsToken(Token::Type::REGULAR, "$"),
+ EqualsToken(Token::Type::REGULAR, "50"))));
}
TEST_F(PlainTokenizerTest, CJKT) {
@@ -203,12 +209,13 @@ TEST_F(PlainTokenizerTest, CJKT) {
tokenizer_factory::CreateIndexingTokenizer(
StringIndexingConfig::TokenizerType::PLAIN,
language_segmenter.get()));
- EXPECT_THAT(plain_tokenizer->TokenizeAll("我每天走路去上班。"),
- IsOkAndHolds(ElementsAre(EqualsToken(Token::REGULAR, "我"),
- EqualsToken(Token::REGULAR, "每天"),
- EqualsToken(Token::REGULAR, "走路"),
- EqualsToken(Token::REGULAR, "去"),
- EqualsToken(Token::REGULAR, "上班"))));
+ EXPECT_THAT(
+ plain_tokenizer->TokenizeAll("我每天走路去上班。"),
+ IsOkAndHolds(ElementsAre(EqualsToken(Token::Type::REGULAR, "我"),
+ EqualsToken(Token::Type::REGULAR, "每天"),
+ EqualsToken(Token::Type::REGULAR, "走路"),
+ EqualsToken(Token::Type::REGULAR, "去"),
+ EqualsToken(Token::Type::REGULAR, "上班"))));
// Japanese
options = language_segmenter_factory::SegmenterOptions(ULOC_JAPANESE,
jni_cache_.get());
@@ -220,41 +227,44 @@ TEST_F(PlainTokenizerTest, CJKT) {
StringIndexingConfig::TokenizerType::PLAIN,
language_segmenter.get()));
if (IsCfStringTokenization()) {
- EXPECT_THAT(plain_tokenizer->TokenizeAll("私は毎日仕事に歩いています。"),
- IsOkAndHolds(ElementsAre(EqualsToken(Token::REGULAR, "私"),
- EqualsToken(Token::REGULAR, "は"),
- EqualsToken(Token::REGULAR, "毎日"),
- EqualsToken(Token::REGULAR, "仕事"),
- EqualsToken(Token::REGULAR, "に"),
- EqualsToken(Token::REGULAR, "歩い"),
- EqualsToken(Token::REGULAR, "て"),
- EqualsToken(Token::REGULAR, "い"),
- EqualsToken(Token::REGULAR, "ます"))));
+ EXPECT_THAT(
+ plain_tokenizer->TokenizeAll("私は毎日仕事に歩いています。"),
+ IsOkAndHolds(ElementsAre(EqualsToken(Token::Type::REGULAR, "私"),
+ EqualsToken(Token::Type::REGULAR, "は"),
+ EqualsToken(Token::Type::REGULAR, "毎日"),
+ EqualsToken(Token::Type::REGULAR, "仕事"),
+ EqualsToken(Token::Type::REGULAR, "に"),
+ EqualsToken(Token::Type::REGULAR, "歩い"),
+ EqualsToken(Token::Type::REGULAR, "て"),
+ EqualsToken(Token::Type::REGULAR, "い"),
+ EqualsToken(Token::Type::REGULAR, "ます"))));
} else {
- EXPECT_THAT(plain_tokenizer->TokenizeAll("私は毎日仕事に歩いています。"),
- IsOkAndHolds(ElementsAre(EqualsToken(Token::REGULAR, "私"),
- EqualsToken(Token::REGULAR, "は"),
- EqualsToken(Token::REGULAR, "毎日"),
- EqualsToken(Token::REGULAR, "仕事"),
- EqualsToken(Token::REGULAR, "に"),
- EqualsToken(Token::REGULAR, "歩"),
- EqualsToken(Token::REGULAR, "い"),
- EqualsToken(Token::REGULAR, "てい"),
- EqualsToken(Token::REGULAR, "ます"))));
+ EXPECT_THAT(
+ plain_tokenizer->TokenizeAll("私は毎日仕事に歩いています。"),
+ IsOkAndHolds(ElementsAre(EqualsToken(Token::Type::REGULAR, "私"),
+ EqualsToken(Token::Type::REGULAR, "は"),
+ EqualsToken(Token::Type::REGULAR, "毎日"),
+ EqualsToken(Token::Type::REGULAR, "仕事"),
+ EqualsToken(Token::Type::REGULAR, "に"),
+ EqualsToken(Token::Type::REGULAR, "歩"),
+ EqualsToken(Token::Type::REGULAR, "い"),
+ EqualsToken(Token::Type::REGULAR, "てい"),
+ EqualsToken(Token::Type::REGULAR, "ます"))));
}
// Khmer
- EXPECT_THAT(plain_tokenizer->TokenizeAll("ញុំដើរទៅធ្វើការរាល់ថ្ងៃ។"),
- IsOkAndHolds(ElementsAre(EqualsToken(Token::REGULAR, "ញុំ"),
- EqualsToken(Token::REGULAR, "ដើរទៅ"),
- EqualsToken(Token::REGULAR, "ធ្វើការ"),
- EqualsToken(Token::REGULAR, "រាល់ថ្ងៃ"))));
- // Korean
EXPECT_THAT(
- plain_tokenizer->TokenizeAll("나는 매일 출근합니다."),
- IsOkAndHolds(ElementsAre(EqualsToken(Token::REGULAR, "나는"),
- EqualsToken(Token::REGULAR, "매일"),
- EqualsToken(Token::REGULAR, "출근합니다"))));
+ plain_tokenizer->TokenizeAll("ញុំដើរទៅធ្វើការរាល់ថ្ងៃ។"),
+ IsOkAndHolds(ElementsAre(EqualsToken(Token::Type::REGULAR, "ញុំ"),
+ EqualsToken(Token::Type::REGULAR, "ដើរទៅ"),
+ EqualsToken(Token::Type::REGULAR, "ធ្វើការ"),
+ EqualsToken(Token::Type::REGULAR, "រាល់ថ្ងៃ"))));
+ // Korean
+ EXPECT_THAT(plain_tokenizer->TokenizeAll("나는 매일 출근합니다."),
+ IsOkAndHolds(ElementsAre(
+ EqualsToken(Token::Type::REGULAR, "나는"),
+ EqualsToken(Token::Type::REGULAR, "매일"),
+ EqualsToken(Token::Type::REGULAR, "출근합니다"))));
// Thai
// DIFFERENCE!! Disagreement over how to segment "ทุกวัน" (iOS groups).
@@ -264,23 +274,24 @@ TEST_F(PlainTokenizerTest, CJKT) {
std::vector<Token> tokens,
plain_tokenizer->TokenizeAll("ฉันเดินไปทำงานทุกวัน"));
- EXPECT_THAT(tokens, ElementsAre(EqualsToken(Token::REGULAR, "ฉัน"),
- EqualsToken(Token::REGULAR, "เดิน"),
- EqualsToken(Token::REGULAR, "ไป"),
- EqualsToken(Token::REGULAR, "ทำงาน"),
- EqualsToken(Token::REGULAR, "ทุกวัน")));
+ EXPECT_THAT(tokens, ElementsAre(EqualsToken(Token::Type::REGULAR, "ฉัน"),
+ EqualsToken(Token::Type::REGULAR, "เดิน"),
+ EqualsToken(Token::Type::REGULAR, "ไป"),
+ EqualsToken(Token::Type::REGULAR, "ทำงาน"),
+ EqualsToken(Token::Type::REGULAR, "ทุกวัน")));
} else {
- EXPECT_THAT(plain_tokenizer->TokenizeAll("ฉันเดินไปทำงานทุกวัน"),
- IsOkAndHolds(ElementsAre(EqualsToken(Token::REGULAR, "ฉัน"),
- EqualsToken(Token::REGULAR, "เดิน"),
- EqualsToken(Token::REGULAR, "ไป"),
- EqualsToken(Token::REGULAR, "ทำงาน"),
- EqualsToken(Token::REGULAR, "ทุก"),
- EqualsToken(Token::REGULAR, "วัน"))));
+ EXPECT_THAT(
+ plain_tokenizer->TokenizeAll("ฉันเดินไปทำงานทุกวัน"),
+ IsOkAndHolds(ElementsAre(EqualsToken(Token::Type::REGULAR, "ฉัน"),
+ EqualsToken(Token::Type::REGULAR, "เดิน"),
+ EqualsToken(Token::Type::REGULAR, "ไป"),
+ EqualsToken(Token::Type::REGULAR, "ทำงาน"),
+ EqualsToken(Token::Type::REGULAR, "ทุก"),
+ EqualsToken(Token::Type::REGULAR, "วัน"))));
}
}
-TEST_F(PlainTokenizerTest, ResetToTokenAfterSimple) {
+TEST_F(PlainTokenizerTest, ResetToTokenStartingAfterSimple) {
language_segmenter_factory::SegmenterOptions options(ULOC_US,
jni_cache_.get());
ICING_ASSERT_OK_AND_ASSIGN(
@@ -294,13 +305,13 @@ TEST_F(PlainTokenizerTest, ResetToTokenAfterSimple) {
constexpr std::string_view kText = "f b";
auto iterator = plain_tokenizer->Tokenize(kText).ValueOrDie();
- EXPECT_TRUE(iterator->ResetToTokenAfter(0));
- EXPECT_THAT(iterator->GetToken(), EqualsToken(Token::REGULAR, "b"));
+ EXPECT_TRUE(iterator->ResetToTokenStartingAfter(0));
+ EXPECT_THAT(iterator->GetToken(), EqualsToken(Token::Type::REGULAR, "b"));
- EXPECT_FALSE(iterator->ResetToTokenAfter(2));
+ EXPECT_FALSE(iterator->ResetToTokenStartingAfter(2));
}
-TEST_F(PlainTokenizerTest, ResetToTokenBeforeSimple) {
+TEST_F(PlainTokenizerTest, ResetToTokenEndingBeforeSimple) {
language_segmenter_factory::SegmenterOptions options(ULOC_US,
jni_cache_.get());
ICING_ASSERT_OK_AND_ASSIGN(
@@ -314,13 +325,13 @@ TEST_F(PlainTokenizerTest, ResetToTokenBeforeSimple) {
constexpr std::string_view kText = "f b";
auto iterator = plain_tokenizer->Tokenize(kText).ValueOrDie();
- EXPECT_TRUE(iterator->ResetToTokenBefore(2));
- EXPECT_THAT(iterator->GetToken(), EqualsToken(Token::REGULAR, "f"));
+ EXPECT_TRUE(iterator->ResetToTokenEndingBefore(2));
+ EXPECT_THAT(iterator->GetToken(), EqualsToken(Token::Type::REGULAR, "f"));
- EXPECT_FALSE(iterator->ResetToTokenBefore(0));
+ EXPECT_FALSE(iterator->ResetToTokenEndingBefore(0));
}
-TEST_F(PlainTokenizerTest, ResetToTokenAfter) {
+TEST_F(PlainTokenizerTest, ResetToTokenStartingAfter) {
language_segmenter_factory::SegmenterOptions options(ULOC_US,
jni_cache_.get());
ICING_ASSERT_OK_AND_ASSIGN(
@@ -332,11 +343,12 @@ TEST_F(PlainTokenizerTest, ResetToTokenAfter) {
language_segmenter.get()));
constexpr std::string_view kText = " foo . bar baz.. bat ";
- EXPECT_THAT(plain_tokenizer->TokenizeAll(kText),
- IsOkAndHolds(ElementsAre(EqualsToken(Token::REGULAR, "foo"),
- EqualsToken(Token::REGULAR, "bar"),
- EqualsToken(Token::REGULAR, "baz"),
- EqualsToken(Token::REGULAR, "bat"))));
+ EXPECT_THAT(
+ plain_tokenizer->TokenizeAll(kText),
+ IsOkAndHolds(ElementsAre(EqualsToken(Token::Type::REGULAR, "foo"),
+ EqualsToken(Token::Type::REGULAR, "bar"),
+ EqualsToken(Token::Type::REGULAR, "baz"),
+ EqualsToken(Token::Type::REGULAR, "bat"))));
std::vector<std::string> expected_text = {
"foo", // 0: " foo . bar"
"bar", // 1: "foo . bar "
@@ -359,19 +371,19 @@ TEST_F(PlainTokenizerTest, ResetToTokenAfter) {
auto iterator = plain_tokenizer->Tokenize(kText).ValueOrDie();
EXPECT_TRUE(iterator->Advance());
- EXPECT_THAT(iterator->GetToken(), EqualsToken(Token::REGULAR, "foo"));
+ EXPECT_THAT(iterator->GetToken(), EqualsToken(Token::Type::REGULAR, "foo"));
for (int i = 0; i < kText.length(); ++i) {
if (i < expected_text.size()) {
- EXPECT_TRUE(iterator->ResetToTokenAfter(i));
+ EXPECT_TRUE(iterator->ResetToTokenStartingAfter(i));
EXPECT_THAT(iterator->GetToken(),
- EqualsToken(Token::REGULAR, expected_text[i]));
+ EqualsToken(Token::Type::REGULAR, expected_text[i]));
} else {
- EXPECT_FALSE(iterator->ResetToTokenAfter(i));
+ EXPECT_FALSE(iterator->ResetToTokenStartingAfter(i));
}
}
}
-TEST_F(PlainTokenizerTest, ResetToTokenBefore) {
+TEST_F(PlainTokenizerTest, ResetToTokenEndingBefore) {
language_segmenter_factory::SegmenterOptions options(ULOC_US,
jni_cache_.get());
ICING_ASSERT_OK_AND_ASSIGN(
@@ -383,11 +395,12 @@ TEST_F(PlainTokenizerTest, ResetToTokenBefore) {
language_segmenter.get()));
constexpr std::string_view kText = " foo . bar baz.. bat ";
- EXPECT_THAT(plain_tokenizer->TokenizeAll(kText),
- IsOkAndHolds(ElementsAre(EqualsToken(Token::REGULAR, "foo"),
- EqualsToken(Token::REGULAR, "bar"),
- EqualsToken(Token::REGULAR, "baz"),
- EqualsToken(Token::REGULAR, "bat"))));
+ EXPECT_THAT(
+ plain_tokenizer->TokenizeAll(kText),
+ IsOkAndHolds(ElementsAre(EqualsToken(Token::Type::REGULAR, "foo"),
+ EqualsToken(Token::Type::REGULAR, "bar"),
+ EqualsToken(Token::Type::REGULAR, "baz"),
+ EqualsToken(Token::Type::REGULAR, "bat"))));
std::vector<std::string> expected_text = {
"bat", // 20: "baz.. bat "
"baz", // 19: " baz.. bat"
@@ -410,15 +423,16 @@ TEST_F(PlainTokenizerTest, ResetToTokenBefore) {
auto iterator = plain_tokenizer->Tokenize(kText).ValueOrDie();
EXPECT_TRUE(iterator->Advance());
- EXPECT_THAT(iterator->GetToken(), EqualsToken(Token::REGULAR, "foo"));
+ EXPECT_THAT(iterator->GetToken(), EqualsToken(Token::Type::REGULAR, "foo"));
for (int i = kText.length() - 1; i >= 0; --i) {
int expected_index = kText.length() - 1 - i;
if (expected_index < expected_text.size()) {
- EXPECT_TRUE(iterator->ResetToTokenBefore(i));
- EXPECT_THAT(iterator->GetToken(),
- EqualsToken(Token::REGULAR, expected_text[expected_index]));
+ EXPECT_TRUE(iterator->ResetToTokenEndingBefore(i));
+ EXPECT_THAT(
+ iterator->GetToken(),
+ EqualsToken(Token::Type::REGULAR, expected_text[expected_index]));
} else {
- EXPECT_FALSE(iterator->ResetToTokenBefore(i));
+ EXPECT_FALSE(iterator->ResetToTokenEndingBefore(i));
}
}
}
diff --git a/icing/tokenization/raw-query-tokenizer.cc b/icing/tokenization/raw-query-tokenizer.cc
index 2d461ee..8a27103 100644
--- a/icing/tokenization/raw-query-tokenizer.cc
+++ b/icing/tokenization/raw-query-tokenizer.cc
@@ -422,7 +422,7 @@ std::pair<TermType, std::string_view> GetTerm(std::string_view text,
// and [(cat OR)]. This helps assert extra rule 3: "OR" is ignored if there's no
// valid token on its right.
void RemoveLastTokenIfOrOperator(std::vector<Token>* tokens) {
- if (!tokens->empty() && tokens->back().type == Token::QUERY_OR) {
+ if (!tokens->empty() && tokens->back().type == Token::Type::QUERY_OR) {
tokens->pop_back();
}
}
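
The REGULAR-to-Type::REGULAR churn throughout this change is the cost of turning
Token::Type into an enum class: scoped enumerators no longer leak into the
enclosing scope and no longer convert implicitly to int, so cross-enum and
enum-to-int comparisons fail to compile. A standalone sketch of the difference:

enum Unscoped { INVALID, REGULAR };             // old style
enum class Scoped { INVALID, REGULAR };         // new style

bool a = (REGULAR == 1);                        // compiles: converts to int
// bool b = (Scoped::REGULAR == 1);             // error: no implicit conversion
bool c = (Scoped::REGULAR == Scoped::REGULAR);  // only same-type comparisons
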
@@ -436,11 +436,11 @@ libtextclassifier3::Status OutputOrOperatorToken(std::vector<Token>* tokens) {
}
Token::Type last_token_type = tokens->back().type;
switch (last_token_type) {
- case Token::REGULAR:
- case Token::QUERY_RIGHT_PARENTHESES:
- tokens->emplace_back(Token::QUERY_OR);
+ case Token::Type::REGULAR:
+ case Token::Type::QUERY_RIGHT_PARENTHESES:
+ tokens->emplace_back(Token::Type::QUERY_OR);
break;
- case Token::QUERY_OR:
+ case Token::Type::QUERY_OR:
// Ignores "OR" because there's already an "OR", e.g. "term1 OR OR term2"
break;
default:
@@ -481,21 +481,21 @@ libtextclassifier3::Status OutputToken(State new_state,
GetErrorMessage(ERROR_NON_ASCII_AS_PROPERTY_NAME));
}
}
- tokens->emplace_back(Token::QUERY_PROPERTY, current_term);
+ tokens->emplace_back(Token::Type::QUERY_PROPERTY, current_term);
} else {
- tokens->emplace_back(Token::REGULAR, current_term);
+ tokens->emplace_back(Token::Type::REGULAR, current_term);
}
break;
case LEFT_PARENTHESES:
- tokens->emplace_back(Token::QUERY_LEFT_PARENTHESES);
+ tokens->emplace_back(Token::Type::QUERY_LEFT_PARENTHESES);
break;
case RIGHT_PARENTHESES:
// Ignores "OR" if it's followed by right parentheses.
RemoveLastTokenIfOrOperator(tokens);
- tokens->emplace_back(Token::QUERY_RIGHT_PARENTHESES);
+ tokens->emplace_back(Token::Type::QUERY_RIGHT_PARENTHESES);
break;
case EXCLUSION_OPERATOR:
- tokens->emplace_back(Token::QUERY_EXCLUSION);
+ tokens->emplace_back(Token::Type::QUERY_EXCLUSION);
break;
case OR_OPERATOR:
return OutputOrOperatorToken(tokens);
@@ -648,7 +648,7 @@ class RawQueryTokenIterator : public Tokenizer::Iterator {
Token GetToken() const override {
if (current_ < 0 || current_ >= tokens_.size()) {
- return Token(Token::INVALID);
+ return Token(Token::Type::INVALID);
}
return tokens_.at(current_);
}
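
An end-to-end sketch of the API touched here, assuming a raw_query_tokenizer
obtained from tokenizer_factory::CreateQueryTokenizer as in the tests below:

ICING_ASSIGN_OR_RETURN(std::vector<Token> tokens,
                       raw_query_tokenizer->TokenizeAll("foo OR (bar -baz)"));
for (const Token& token : tokens) {
  // Operator tokens such as Token::Type::QUERY_OR carry empty text; search
  // terms arrive as Token::Type::REGULAR with the term in token.text.
  bool is_operator = token.type != Token::Type::REGULAR;
  (void)is_operator;  // placeholder for real handling
}
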
diff --git a/icing/tokenization/raw-query-tokenizer_test.cc b/icing/tokenization/raw-query-tokenizer_test.cc
index 500efa0..c6d981d 100644
--- a/icing/tokenization/raw-query-tokenizer_test.cc
+++ b/icing/tokenization/raw-query-tokenizer_test.cc
@@ -16,9 +16,9 @@
#include "gmock/gmock.h"
#include "gtest/gtest.h"
-#include "icing/helpers/icu/icu-data-file-helper.h"
#include "icing/portable/platform.h"
#include "icing/testing/common-matchers.h"
+#include "icing/testing/icu-data-file-helper.h"
#include "icing/testing/test-data.h"
#include "icing/tokenization/language-segmenter-factory.h"
#include "icing/tokenization/tokenizer-factory.h"
@@ -59,13 +59,15 @@ TEST_F(RawQueryTokenizerTest, Simple) {
tokenizer_factory::CreateQueryTokenizer(tokenizer_factory::RAW_QUERY,
language_segmenter.get()));
- EXPECT_THAT(raw_query_tokenizer->TokenizeAll("Hello World!"),
- IsOkAndHolds(ElementsAre(EqualsToken(Token::REGULAR, "Hello"),
- EqualsToken(Token::REGULAR, "World"))));
+ EXPECT_THAT(
+ raw_query_tokenizer->TokenizeAll("Hello World!"),
+ IsOkAndHolds(ElementsAre(EqualsToken(Token::Type::REGULAR, "Hello"),
+ EqualsToken(Token::Type::REGULAR, "World"))));
- EXPECT_THAT(raw_query_tokenizer->TokenizeAll("hElLo WORLD"),
- IsOkAndHolds(ElementsAre(EqualsToken(Token::REGULAR, "hElLo"),
- EqualsToken(Token::REGULAR, "WORLD"))));
+ EXPECT_THAT(
+ raw_query_tokenizer->TokenizeAll("hElLo WORLD"),
+ IsOkAndHolds(ElementsAre(EqualsToken(Token::Type::REGULAR, "hElLo"),
+ EqualsToken(Token::Type::REGULAR, "WORLD"))));
}
TEST_F(RawQueryTokenizerTest, Parentheses) {
@@ -80,82 +82,82 @@ TEST_F(RawQueryTokenizerTest, Parentheses) {
EXPECT_THAT(raw_query_tokenizer->TokenizeAll("()"),
IsOkAndHolds(ElementsAre(
- EqualsToken(Token::QUERY_LEFT_PARENTHESES, ""),
- EqualsToken(Token::QUERY_RIGHT_PARENTHESES, ""))));
+ EqualsToken(Token::Type::QUERY_LEFT_PARENTHESES, ""),
+ EqualsToken(Token::Type::QUERY_RIGHT_PARENTHESES, ""))));
EXPECT_THAT(raw_query_tokenizer->TokenizeAll("( )"),
IsOkAndHolds(ElementsAre(
- EqualsToken(Token::QUERY_LEFT_PARENTHESES, ""),
- EqualsToken(Token::QUERY_RIGHT_PARENTHESES, ""))));
+ EqualsToken(Token::Type::QUERY_LEFT_PARENTHESES, ""),
+ EqualsToken(Token::Type::QUERY_RIGHT_PARENTHESES, ""))));
EXPECT_THAT(raw_query_tokenizer->TokenizeAll("(term1 term2)"),
IsOkAndHolds(ElementsAre(
- EqualsToken(Token::QUERY_LEFT_PARENTHESES, ""),
- EqualsToken(Token::REGULAR, "term1"),
- EqualsToken(Token::REGULAR, "term2"),
- EqualsToken(Token::QUERY_RIGHT_PARENTHESES, ""))));
+ EqualsToken(Token::Type::QUERY_LEFT_PARENTHESES, ""),
+ EqualsToken(Token::Type::REGULAR, "term1"),
+ EqualsToken(Token::Type::REGULAR, "term2"),
+ EqualsToken(Token::Type::QUERY_RIGHT_PARENTHESES, ""))));
EXPECT_THAT(raw_query_tokenizer->TokenizeAll("((term1 term2) (term3 term4))"),
IsOkAndHolds(ElementsAre(
- EqualsToken(Token::QUERY_LEFT_PARENTHESES, ""),
- EqualsToken(Token::QUERY_LEFT_PARENTHESES, ""),
- EqualsToken(Token::REGULAR, "term1"),
- EqualsToken(Token::REGULAR, "term2"),
- EqualsToken(Token::QUERY_RIGHT_PARENTHESES, ""),
- EqualsToken(Token::QUERY_LEFT_PARENTHESES, ""),
- EqualsToken(Token::REGULAR, "term3"),
- EqualsToken(Token::REGULAR, "term4"),
- EqualsToken(Token::QUERY_RIGHT_PARENTHESES, ""),
- EqualsToken(Token::QUERY_RIGHT_PARENTHESES, ""))));
+ EqualsToken(Token::Type::QUERY_LEFT_PARENTHESES, ""),
+ EqualsToken(Token::Type::QUERY_LEFT_PARENTHESES, ""),
+ EqualsToken(Token::Type::REGULAR, "term1"),
+ EqualsToken(Token::Type::REGULAR, "term2"),
+ EqualsToken(Token::Type::QUERY_RIGHT_PARENTHESES, ""),
+ EqualsToken(Token::Type::QUERY_LEFT_PARENTHESES, ""),
+ EqualsToken(Token::Type::REGULAR, "term3"),
+ EqualsToken(Token::Type::REGULAR, "term4"),
+ EqualsToken(Token::Type::QUERY_RIGHT_PARENTHESES, ""),
+ EqualsToken(Token::Type::QUERY_RIGHT_PARENTHESES, ""))));
EXPECT_THAT(raw_query_tokenizer->TokenizeAll("term1(term2)"),
IsOkAndHolds(ElementsAre(
- EqualsToken(Token::REGULAR, "term1"),
- EqualsToken(Token::QUERY_LEFT_PARENTHESES, ""),
- EqualsToken(Token::REGULAR, "term2"),
- EqualsToken(Token::QUERY_RIGHT_PARENTHESES, ""))));
+ EqualsToken(Token::Type::REGULAR, "term1"),
+ EqualsToken(Token::Type::QUERY_LEFT_PARENTHESES, ""),
+ EqualsToken(Token::Type::REGULAR, "term2"),
+ EqualsToken(Token::Type::QUERY_RIGHT_PARENTHESES, ""))));
- EXPECT_THAT(
- raw_query_tokenizer->TokenizeAll("(term1)term2"),
- IsOkAndHolds(ElementsAre(EqualsToken(Token::QUERY_LEFT_PARENTHESES, ""),
- EqualsToken(Token::REGULAR, "term1"),
- EqualsToken(Token::QUERY_RIGHT_PARENTHESES, ""),
- EqualsToken(Token::REGULAR, "term2"))));
+ EXPECT_THAT(raw_query_tokenizer->TokenizeAll("(term1)term2"),
+ IsOkAndHolds(ElementsAre(
+ EqualsToken(Token::Type::QUERY_LEFT_PARENTHESES, ""),
+ EqualsToken(Token::Type::REGULAR, "term1"),
+ EqualsToken(Token::Type::QUERY_RIGHT_PARENTHESES, ""),
+ EqualsToken(Token::Type::REGULAR, "term2"))));
EXPECT_THAT(raw_query_tokenizer->TokenizeAll("(term1)(term2)"),
IsOkAndHolds(ElementsAre(
- EqualsToken(Token::QUERY_LEFT_PARENTHESES, ""),
- EqualsToken(Token::REGULAR, "term1"),
- EqualsToken(Token::QUERY_RIGHT_PARENTHESES, ""),
- EqualsToken(Token::QUERY_LEFT_PARENTHESES, ""),
- EqualsToken(Token::REGULAR, "term2"),
- EqualsToken(Token::QUERY_RIGHT_PARENTHESES, ""))));
-
- EXPECT_THAT(
- raw_query_tokenizer->TokenizeAll("(term1)-term2"),
- IsOkAndHolds(ElementsAre(EqualsToken(Token::QUERY_LEFT_PARENTHESES, ""),
- EqualsToken(Token::REGULAR, "term1"),
- EqualsToken(Token::QUERY_RIGHT_PARENTHESES, ""),
- EqualsToken(Token::QUERY_EXCLUSION, ""),
- EqualsToken(Token::REGULAR, "term2"))));
+ EqualsToken(Token::Type::QUERY_LEFT_PARENTHESES, ""),
+ EqualsToken(Token::Type::REGULAR, "term1"),
+ EqualsToken(Token::Type::QUERY_RIGHT_PARENTHESES, ""),
+ EqualsToken(Token::Type::QUERY_LEFT_PARENTHESES, ""),
+ EqualsToken(Token::Type::REGULAR, "term2"),
+ EqualsToken(Token::Type::QUERY_RIGHT_PARENTHESES, ""))));
+
+ EXPECT_THAT(raw_query_tokenizer->TokenizeAll("(term1)-term2"),
+ IsOkAndHolds(ElementsAre(
+ EqualsToken(Token::Type::QUERY_LEFT_PARENTHESES, ""),
+ EqualsToken(Token::Type::REGULAR, "term1"),
+ EqualsToken(Token::Type::QUERY_RIGHT_PARENTHESES, ""),
+ EqualsToken(Token::Type::QUERY_EXCLUSION, ""),
+ EqualsToken(Token::Type::REGULAR, "term2"))));
- EXPECT_THAT(
- raw_query_tokenizer->TokenizeAll("(term1)OR term2"),
- IsOkAndHolds(ElementsAre(EqualsToken(Token::QUERY_LEFT_PARENTHESES, ""),
- EqualsToken(Token::REGULAR, "term1"),
- EqualsToken(Token::QUERY_RIGHT_PARENTHESES, ""),
- EqualsToken(Token::QUERY_OR, ""),
- EqualsToken(Token::REGULAR, "term2"))));
+ EXPECT_THAT(raw_query_tokenizer->TokenizeAll("(term1)OR term2"),
+ IsOkAndHolds(ElementsAre(
+ EqualsToken(Token::Type::QUERY_LEFT_PARENTHESES, ""),
+ EqualsToken(Token::Type::REGULAR, "term1"),
+ EqualsToken(Token::Type::QUERY_RIGHT_PARENTHESES, ""),
+ EqualsToken(Token::Type::QUERY_OR, ""),
+ EqualsToken(Token::Type::REGULAR, "term2"))));
EXPECT_THAT(raw_query_tokenizer->TokenizeAll("(term1)OR(term2)"),
IsOkAndHolds(ElementsAre(
- EqualsToken(Token::QUERY_LEFT_PARENTHESES, ""),
- EqualsToken(Token::REGULAR, "term1"),
- EqualsToken(Token::QUERY_RIGHT_PARENTHESES, ""),
- EqualsToken(Token::QUERY_OR, ""),
- EqualsToken(Token::QUERY_LEFT_PARENTHESES, ""),
- EqualsToken(Token::REGULAR, "term2"),
- EqualsToken(Token::QUERY_RIGHT_PARENTHESES, ""))));
+ EqualsToken(Token::Type::QUERY_LEFT_PARENTHESES, ""),
+ EqualsToken(Token::Type::REGULAR, "term1"),
+ EqualsToken(Token::Type::QUERY_RIGHT_PARENTHESES, ""),
+ EqualsToken(Token::Type::QUERY_OR, ""),
+ EqualsToken(Token::Type::QUERY_LEFT_PARENTHESES, ""),
+ EqualsToken(Token::Type::REGULAR, "term2"),
+ EqualsToken(Token::Type::QUERY_RIGHT_PARENTHESES, ""))));
EXPECT_THAT(raw_query_tokenizer->TokenizeAll("(term1):term2"),
StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT,
@@ -180,44 +182,49 @@ TEST_F(RawQueryTokenizerTest, Exclustion) {
tokenizer_factory::CreateQueryTokenizer(tokenizer_factory::RAW_QUERY,
language_segmenter.get()));
- EXPECT_THAT(raw_query_tokenizer->TokenizeAll("-term1"),
- IsOkAndHolds(ElementsAre(EqualsToken(Token::QUERY_EXCLUSION, ""),
- EqualsToken(Token::REGULAR, "term1"))));
+ EXPECT_THAT(
+ raw_query_tokenizer->TokenizeAll("-term1"),
+ IsOkAndHolds(ElementsAre(EqualsToken(Token::Type::QUERY_EXCLUSION, ""),
+ EqualsToken(Token::Type::REGULAR, "term1"))));
EXPECT_THAT(raw_query_tokenizer->TokenizeAll("(-term1)"),
IsOkAndHolds(ElementsAre(
- EqualsToken(Token::QUERY_LEFT_PARENTHESES, ""),
- EqualsToken(Token::QUERY_EXCLUSION, ""),
- EqualsToken(Token::REGULAR, "term1"),
- EqualsToken(Token::QUERY_RIGHT_PARENTHESES, ""))));
+ EqualsToken(Token::Type::QUERY_LEFT_PARENTHESES, ""),
+ EqualsToken(Token::Type::QUERY_EXCLUSION, ""),
+ EqualsToken(Token::Type::REGULAR, "term1"),
+ EqualsToken(Token::Type::QUERY_RIGHT_PARENTHESES, ""))));
// Exclusion operator is ignored
- EXPECT_THAT(raw_query_tokenizer->TokenizeAll("- term1"),
- IsOkAndHolds(ElementsAre(EqualsToken(Token::REGULAR, "term1"))));
+ EXPECT_THAT(
+ raw_query_tokenizer->TokenizeAll("- term1"),
+ IsOkAndHolds(ElementsAre(EqualsToken(Token::Type::REGULAR, "term1"))));
// Exclusion operator is ignored
- EXPECT_THAT(raw_query_tokenizer->TokenizeAll("term1- term2"),
- IsOkAndHolds(ElementsAre(EqualsToken(Token::REGULAR, "term1"),
- EqualsToken(Token::REGULAR, "term2"))));
+ EXPECT_THAT(
+ raw_query_tokenizer->TokenizeAll("term1- term2"),
+ IsOkAndHolds(ElementsAre(EqualsToken(Token::Type::REGULAR, "term1"),
+ EqualsToken(Token::Type::REGULAR, "term2"))));
// Exclusion operator is ignored
EXPECT_THAT(raw_query_tokenizer->TokenizeAll("(term1 -)"),
IsOkAndHolds(ElementsAre(
- EqualsToken(Token::QUERY_LEFT_PARENTHESES, ""),
- EqualsToken(Token::REGULAR, "term1"),
- EqualsToken(Token::QUERY_RIGHT_PARENTHESES, ""))));
+ EqualsToken(Token::Type::QUERY_LEFT_PARENTHESES, ""),
+ EqualsToken(Token::Type::REGULAR, "term1"),
+ EqualsToken(Token::Type::QUERY_RIGHT_PARENTHESES, ""))));
// First exclusion operator is ignored
- EXPECT_THAT(raw_query_tokenizer->TokenizeAll("--term1"),
- IsOkAndHolds(ElementsAre(EqualsToken(Token::QUERY_EXCLUSION, ""),
- EqualsToken(Token::REGULAR, "term1"))));
+ EXPECT_THAT(
+ raw_query_tokenizer->TokenizeAll("--term1"),
+ IsOkAndHolds(ElementsAre(EqualsToken(Token::Type::QUERY_EXCLUSION, ""),
+ EqualsToken(Token::Type::REGULAR, "term1"))));
// First "-" is exclusion operator, second is not and will be discarded.
// In other words, exclusion only applies to the term right after it.
- EXPECT_THAT(raw_query_tokenizer->TokenizeAll("-term1-term2"),
- IsOkAndHolds(ElementsAre(EqualsToken(Token::QUERY_EXCLUSION, ""),
- EqualsToken(Token::REGULAR, "term1"),
- EqualsToken(Token::REGULAR, "term2"))));
+ EXPECT_THAT(
+ raw_query_tokenizer->TokenizeAll("-term1-term2"),
+ IsOkAndHolds(ElementsAre(EqualsToken(Token::Type::QUERY_EXCLUSION, ""),
+ EqualsToken(Token::Type::REGULAR, "term1"),
+ EqualsToken(Token::Type::REGULAR, "term2"))));
EXPECT_THAT(raw_query_tokenizer->TokenizeAll("-(term1)"),
StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT,
@@ -249,73 +256,75 @@ TEST_F(RawQueryTokenizerTest, PropertyRestriction) {
tokenizer_factory::CreateQueryTokenizer(tokenizer_factory::RAW_QUERY,
language_segmenter.get()));
- EXPECT_THAT(
- raw_query_tokenizer->TokenizeAll("property1:term1"),
- IsOkAndHolds(ElementsAre(EqualsToken(Token::QUERY_PROPERTY, "property1"),
- EqualsToken(Token::REGULAR, "term1"))));
+ EXPECT_THAT(raw_query_tokenizer->TokenizeAll("property1:term1"),
+ IsOkAndHolds(ElementsAre(
+ EqualsToken(Token::Type::QUERY_PROPERTY, "property1"),
+ EqualsToken(Token::Type::REGULAR, "term1"))));
EXPECT_THAT(raw_query_tokenizer->TokenizeAll("(property1:term1)"),
IsOkAndHolds(ElementsAre(
- EqualsToken(Token::QUERY_LEFT_PARENTHESES, ""),
- EqualsToken(Token::QUERY_PROPERTY, "property1"),
- EqualsToken(Token::REGULAR, "term1"),
- EqualsToken(Token::QUERY_RIGHT_PARENTHESES, ""))));
+ EqualsToken(Token::Type::QUERY_LEFT_PARENTHESES, ""),
+ EqualsToken(Token::Type::QUERY_PROPERTY, "property1"),
+ EqualsToken(Token::Type::REGULAR, "term1"),
+ EqualsToken(Token::Type::QUERY_RIGHT_PARENTHESES, ""))));
// Colon is ignored
- EXPECT_THAT(raw_query_tokenizer->TokenizeAll(":term1"),
- IsOkAndHolds(ElementsAre(EqualsToken(Token::REGULAR, "term1"))));
+ EXPECT_THAT(
+ raw_query_tokenizer->TokenizeAll(":term1"),
+ IsOkAndHolds(ElementsAre(EqualsToken(Token::Type::REGULAR, "term1"))));
// Colon is ignored
EXPECT_THAT(raw_query_tokenizer->TokenizeAll("(:term1)"),
IsOkAndHolds(ElementsAre(
- EqualsToken(Token::QUERY_LEFT_PARENTHESES, ""),
- EqualsToken(Token::REGULAR, "term1"),
- EqualsToken(Token::QUERY_RIGHT_PARENTHESES, ""))));
+ EqualsToken(Token::Type::QUERY_LEFT_PARENTHESES, ""),
+ EqualsToken(Token::Type::REGULAR, "term1"),
+ EqualsToken(Token::Type::QUERY_RIGHT_PARENTHESES, ""))));
// Colon is ignored
- EXPECT_THAT(raw_query_tokenizer->TokenizeAll("term1:"),
- IsOkAndHolds(ElementsAre(EqualsToken(Token::REGULAR, "term1"))));
+ EXPECT_THAT(
+ raw_query_tokenizer->TokenizeAll("term1:"),
+ IsOkAndHolds(ElementsAre(EqualsToken(Token::Type::REGULAR, "term1"))));
// property name can be a path
EXPECT_THAT(raw_query_tokenizer->TokenizeAll("email.title:hello"),
- IsOkAndHolds(
- ElementsAre(EqualsToken(Token::QUERY_PROPERTY, "email.title"),
- EqualsToken(Token::REGULAR, "hello"))));
+ IsOkAndHolds(ElementsAre(
+ EqualsToken(Token::Type::QUERY_PROPERTY, "email.title"),
+ EqualsToken(Token::Type::REGULAR, "hello"))));
// The first colon ":" triggers property restriction, the second colon is used
// as a word connector per ICU's rule
// (https://unicode.org/reports/tr29/#Word_Boundaries).
- EXPECT_THAT(
- raw_query_tokenizer->TokenizeAll("property:foo:bar"),
- IsOkAndHolds(ElementsAre(EqualsToken(Token::QUERY_PROPERTY, "property"),
- EqualsToken(Token::REGULAR, "foo:bar"))));
+ EXPECT_THAT(raw_query_tokenizer->TokenizeAll("property:foo:bar"),
+ IsOkAndHolds(ElementsAre(
+ EqualsToken(Token::Type::QUERY_PROPERTY, "property"),
+ EqualsToken(Token::Type::REGULAR, "foo:bar"))));
// Property restriction only applies to the term right after it.
// Note: "term1:term2" is not a term but 2 terms because word connectors
// don't apply to numbers and alphabets.
- EXPECT_THAT(
- raw_query_tokenizer->TokenizeAll("property1:term1:term2"),
- IsOkAndHolds(ElementsAre(EqualsToken(Token::QUERY_PROPERTY, "property1"),
- EqualsToken(Token::REGULAR, "term1"),
- EqualsToken(Token::REGULAR, "term2"))));
+ EXPECT_THAT(raw_query_tokenizer->TokenizeAll("property1:term1:term2"),
+ IsOkAndHolds(ElementsAre(
+ EqualsToken(Token::Type::QUERY_PROPERTY, "property1"),
+ EqualsToken(Token::Type::REGULAR, "term1"),
+ EqualsToken(Token::Type::REGULAR, "term2"))));
- EXPECT_THAT(
- raw_query_tokenizer->TokenizeAll("property1:今天:天气"),
- IsOkAndHolds(ElementsAre(EqualsToken(Token::QUERY_PROPERTY, "property1"),
- EqualsToken(Token::REGULAR, "今天"),
- EqualsToken(Token::REGULAR, "天气"))));
+ EXPECT_THAT(raw_query_tokenizer->TokenizeAll("property1:今天:天气"),
+ IsOkAndHolds(ElementsAre(
+ EqualsToken(Token::Type::QUERY_PROPERTY, "property1"),
+ EqualsToken(Token::Type::REGULAR, "今天"),
+ EqualsToken(Token::Type::REGULAR, "天气"))));
- EXPECT_THAT(
- raw_query_tokenizer->TokenizeAll("property1:term1-"),
- IsOkAndHolds(ElementsAre(EqualsToken(Token::QUERY_PROPERTY, "property1"),
- EqualsToken(Token::REGULAR, "term1"))));
+ EXPECT_THAT(raw_query_tokenizer->TokenizeAll("property1:term1-"),
+ IsOkAndHolds(ElementsAre(
+ EqualsToken(Token::Type::QUERY_PROPERTY, "property1"),
+ EqualsToken(Token::Type::REGULAR, "term1"))));
// Multiple continuous colons will still be recognized as a property
// restriction operator
- EXPECT_THAT(
- raw_query_tokenizer->TokenizeAll("property1::term1"),
- IsOkAndHolds(ElementsAre(EqualsToken(Token::QUERY_PROPERTY, "property1"),
- EqualsToken(Token::REGULAR, "term1"))));
+ EXPECT_THAT(raw_query_tokenizer->TokenizeAll("property1::term1"),
+ IsOkAndHolds(ElementsAre(
+ EqualsToken(Token::Type::QUERY_PROPERTY, "property1"),
+ EqualsToken(Token::Type::REGULAR, "term1"))));
EXPECT_THAT(
raw_query_tokenizer->TokenizeAll("property1:(term1)"),
@@ -345,105 +354,109 @@ TEST_F(RawQueryTokenizerTest, OR) {
tokenizer_factory::CreateQueryTokenizer(tokenizer_factory::RAW_QUERY,
language_segmenter.get()));
- EXPECT_THAT(raw_query_tokenizer->TokenizeAll("term1 OR term2"),
- IsOkAndHolds(ElementsAre(EqualsToken(Token::REGULAR, "term1"),
- EqualsToken(Token::QUERY_OR, ""),
- EqualsToken(Token::REGULAR, "term2"))));
+ EXPECT_THAT(
+ raw_query_tokenizer->TokenizeAll("term1 OR term2"),
+ IsOkAndHolds(ElementsAre(EqualsToken(Token::Type::REGULAR, "term1"),
+ EqualsToken(Token::Type::QUERY_OR, ""),
+ EqualsToken(Token::Type::REGULAR, "term2"))));
// Two continuous "OR"s are treated as one
- EXPECT_THAT(raw_query_tokenizer->TokenizeAll("term1 OR OR term2"),
- IsOkAndHolds(ElementsAre(EqualsToken(Token::REGULAR, "term1"),
- EqualsToken(Token::QUERY_OR, ""),
- EqualsToken(Token::REGULAR, "term2"))));
-
EXPECT_THAT(
- raw_query_tokenizer->TokenizeAll("(term1) OR term2"),
- IsOkAndHolds(ElementsAre(EqualsToken(Token::QUERY_LEFT_PARENTHESES, ""),
- EqualsToken(Token::REGULAR, "term1"),
- EqualsToken(Token::QUERY_RIGHT_PARENTHESES, ""),
- EqualsToken(Token::QUERY_OR, ""),
- EqualsToken(Token::REGULAR, "term2"))));
+ raw_query_tokenizer->TokenizeAll("term1 OR OR term2"),
+ IsOkAndHolds(ElementsAre(EqualsToken(Token::Type::REGULAR, "term1"),
+ EqualsToken(Token::Type::QUERY_OR, ""),
+ EqualsToken(Token::Type::REGULAR, "term2"))));
+
+ EXPECT_THAT(raw_query_tokenizer->TokenizeAll("(term1) OR term2"),
+ IsOkAndHolds(ElementsAre(
+ EqualsToken(Token::Type::QUERY_LEFT_PARENTHESES, ""),
+ EqualsToken(Token::Type::REGULAR, "term1"),
+ EqualsToken(Token::Type::QUERY_RIGHT_PARENTHESES, ""),
+ EqualsToken(Token::Type::QUERY_OR, ""),
+ EqualsToken(Token::Type::REGULAR, "term2"))));
EXPECT_THAT(raw_query_tokenizer->TokenizeAll("term1 OR (term2)"),
IsOkAndHolds(ElementsAre(
- EqualsToken(Token::REGULAR, "term1"),
- EqualsToken(Token::QUERY_OR, ""),
- EqualsToken(Token::QUERY_LEFT_PARENTHESES, ""),
- EqualsToken(Token::REGULAR, "term2"),
- EqualsToken(Token::QUERY_RIGHT_PARENTHESES, ""))));
+ EqualsToken(Token::Type::REGULAR, "term1"),
+ EqualsToken(Token::Type::QUERY_OR, ""),
+ EqualsToken(Token::Type::QUERY_LEFT_PARENTHESES, ""),
+ EqualsToken(Token::Type::REGULAR, "term2"),
+ EqualsToken(Token::Type::QUERY_RIGHT_PARENTHESES, ""))));
EXPECT_THAT(raw_query_tokenizer->TokenizeAll("((term1) OR (term2))"),
IsOkAndHolds(ElementsAre(
- EqualsToken(Token::QUERY_LEFT_PARENTHESES, ""),
- EqualsToken(Token::QUERY_LEFT_PARENTHESES, ""),
- EqualsToken(Token::REGULAR, "term1"),
- EqualsToken(Token::QUERY_RIGHT_PARENTHESES, ""),
- EqualsToken(Token::QUERY_OR, ""),
- EqualsToken(Token::QUERY_LEFT_PARENTHESES, ""),
- EqualsToken(Token::REGULAR, "term2"),
- EqualsToken(Token::QUERY_RIGHT_PARENTHESES, ""),
- EqualsToken(Token::QUERY_RIGHT_PARENTHESES, ""))));
+ EqualsToken(Token::Type::QUERY_LEFT_PARENTHESES, ""),
+ EqualsToken(Token::Type::QUERY_LEFT_PARENTHESES, ""),
+ EqualsToken(Token::Type::REGULAR, "term1"),
+ EqualsToken(Token::Type::QUERY_RIGHT_PARENTHESES, ""),
+ EqualsToken(Token::Type::QUERY_OR, ""),
+ EqualsToken(Token::Type::QUERY_LEFT_PARENTHESES, ""),
+ EqualsToken(Token::Type::REGULAR, "term2"),
+ EqualsToken(Token::Type::QUERY_RIGHT_PARENTHESES, ""),
+ EqualsToken(Token::Type::QUERY_RIGHT_PARENTHESES, ""))));
// Only "OR" (all in uppercase) is the operator
EXPECT_THAT(
raw_query_tokenizer->TokenizeAll("term1 or term2 Or term3 oR term4"),
- IsOkAndHolds(ElementsAre(EqualsToken(Token::REGULAR, "term1"),
- EqualsToken(Token::REGULAR, "or"),
- EqualsToken(Token::REGULAR, "term2"),
- EqualsToken(Token::REGULAR, "Or"),
- EqualsToken(Token::REGULAR, "term3"),
- EqualsToken(Token::REGULAR, "oR"),
- EqualsToken(Token::REGULAR, "term4"))));
+ IsOkAndHolds(ElementsAre(EqualsToken(Token::Type::REGULAR, "term1"),
+ EqualsToken(Token::Type::REGULAR, "or"),
+ EqualsToken(Token::Type::REGULAR, "term2"),
+ EqualsToken(Token::Type::REGULAR, "Or"),
+ EqualsToken(Token::Type::REGULAR, "term3"),
+ EqualsToken(Token::Type::REGULAR, "oR"),
+ EqualsToken(Token::Type::REGULAR, "term4"))));
// "OR" is ignored
- EXPECT_THAT(raw_query_tokenizer->TokenizeAll("OR term1"),
- IsOkAndHolds(ElementsAre(EqualsToken(Token::REGULAR, "term1"))));
+ EXPECT_THAT(
+ raw_query_tokenizer->TokenizeAll("OR term1"),
+ IsOkAndHolds(ElementsAre(EqualsToken(Token::Type::REGULAR, "term1"))));
// "OR" is ignored
- EXPECT_THAT(raw_query_tokenizer->TokenizeAll("term1 OR"),
- IsOkAndHolds(ElementsAre(EqualsToken(Token::REGULAR, "term1"))));
+ EXPECT_THAT(
+ raw_query_tokenizer->TokenizeAll("term1 OR"),
+ IsOkAndHolds(ElementsAre(EqualsToken(Token::Type::REGULAR, "term1"))));
// "OR" is ignored
EXPECT_THAT(raw_query_tokenizer->TokenizeAll("(OR term1)"),
IsOkAndHolds(ElementsAre(
- EqualsToken(Token::QUERY_LEFT_PARENTHESES, ""),
- EqualsToken(Token::REGULAR, "term1"),
- EqualsToken(Token::QUERY_RIGHT_PARENTHESES, ""))));
+ EqualsToken(Token::Type::QUERY_LEFT_PARENTHESES, ""),
+ EqualsToken(Token::Type::REGULAR, "term1"),
+ EqualsToken(Token::Type::QUERY_RIGHT_PARENTHESES, ""))));
// "OR" is ignored
EXPECT_THAT(raw_query_tokenizer->TokenizeAll("( OR term1)"),
IsOkAndHolds(ElementsAre(
- EqualsToken(Token::QUERY_LEFT_PARENTHESES, ""),
- EqualsToken(Token::REGULAR, "term1"),
- EqualsToken(Token::QUERY_RIGHT_PARENTHESES, ""))));
+ EqualsToken(Token::Type::QUERY_LEFT_PARENTHESES, ""),
+ EqualsToken(Token::Type::REGULAR, "term1"),
+ EqualsToken(Token::Type::QUERY_RIGHT_PARENTHESES, ""))));
// "OR" is ignored
EXPECT_THAT(raw_query_tokenizer->TokenizeAll("(term1 OR)"),
IsOkAndHolds(ElementsAre(
- EqualsToken(Token::QUERY_LEFT_PARENTHESES, ""),
- EqualsToken(Token::REGULAR, "term1"),
- EqualsToken(Token::QUERY_RIGHT_PARENTHESES, ""))));
+ EqualsToken(Token::Type::QUERY_LEFT_PARENTHESES, ""),
+ EqualsToken(Token::Type::REGULAR, "term1"),
+ EqualsToken(Token::Type::QUERY_RIGHT_PARENTHESES, ""))));
// "OR" is ignored
EXPECT_THAT(raw_query_tokenizer->TokenizeAll("(term1 OR )"),
IsOkAndHolds(ElementsAre(
- EqualsToken(Token::QUERY_LEFT_PARENTHESES, ""),
- EqualsToken(Token::REGULAR, "term1"),
- EqualsToken(Token::QUERY_RIGHT_PARENTHESES, ""))));
+ EqualsToken(Token::Type::QUERY_LEFT_PARENTHESES, ""),
+ EqualsToken(Token::Type::REGULAR, "term1"),
+ EqualsToken(Token::Type::QUERY_RIGHT_PARENTHESES, ""))));
// "OR" is ignored
EXPECT_THAT(raw_query_tokenizer->TokenizeAll("( OR )"),
IsOkAndHolds(ElementsAre(
- EqualsToken(Token::QUERY_LEFT_PARENTHESES, ""),
- EqualsToken(Token::QUERY_RIGHT_PARENTHESES, ""))));
+ EqualsToken(Token::Type::QUERY_LEFT_PARENTHESES, ""),
+ EqualsToken(Token::Type::QUERY_RIGHT_PARENTHESES, ""))));
EXPECT_THAT(raw_query_tokenizer->TokenizeAll("term1 OR(term2)"),
IsOkAndHolds(ElementsAre(
- EqualsToken(Token::REGULAR, "term1"),
- EqualsToken(Token::QUERY_OR, ""),
- EqualsToken(Token::QUERY_LEFT_PARENTHESES, ""),
- EqualsToken(Token::REGULAR, "term2"),
- EqualsToken(Token::QUERY_RIGHT_PARENTHESES, ""))));
+ EqualsToken(Token::Type::REGULAR, "term1"),
+ EqualsToken(Token::Type::QUERY_OR, ""),
+ EqualsToken(Token::Type::QUERY_LEFT_PARENTHESES, ""),
+ EqualsToken(Token::Type::REGULAR, "term2"),
+ EqualsToken(Token::Type::QUERY_RIGHT_PARENTHESES, ""))));
EXPECT_THAT(
raw_query_tokenizer->TokenizeAll("term1 OR-term2"),
@@ -472,31 +485,31 @@ TEST_F(RawQueryTokenizerTest, CJKT) {
if (IsCfStringTokenization()) {
EXPECT_THAT(
raw_query_tokenizer->TokenizeAll("-今天天气很好"),
- IsOkAndHolds(ElementsAre(EqualsToken(Token::QUERY_EXCLUSION, ""),
- EqualsToken(Token::REGULAR, "今天"),
- EqualsToken(Token::REGULAR, "天气"),
- EqualsToken(Token::REGULAR, "很"),
- EqualsToken(Token::REGULAR, "好"))));
+ IsOkAndHolds(ElementsAre(EqualsToken(Token::Type::QUERY_EXCLUSION, ""),
+ EqualsToken(Token::Type::REGULAR, "今天"),
+ EqualsToken(Token::Type::REGULAR, "天气"),
+ EqualsToken(Token::Type::REGULAR, "很"),
+ EqualsToken(Token::Type::REGULAR, "好"))));
} else {
EXPECT_THAT(
raw_query_tokenizer->TokenizeAll("-今天天气很好"),
- IsOkAndHolds(ElementsAre(EqualsToken(Token::QUERY_EXCLUSION, ""),
- EqualsToken(Token::REGULAR, "今天"),
- EqualsToken(Token::REGULAR, "天气"),
- EqualsToken(Token::REGULAR, "很好"))));
+ IsOkAndHolds(ElementsAre(EqualsToken(Token::Type::QUERY_EXCLUSION, ""),
+ EqualsToken(Token::Type::REGULAR, "今天"),
+ EqualsToken(Token::Type::REGULAR, "天气"),
+ EqualsToken(Token::Type::REGULAR, "很好"))));
}
if (IsCfStringTokenization()) {
EXPECT_THAT(raw_query_tokenizer->TokenizeAll("property1:你好"),
- IsOkAndHolds(
- ElementsAre(EqualsToken(Token::QUERY_PROPERTY, "property1"),
- EqualsToken(Token::REGULAR, "你"),
- EqualsToken(Token::REGULAR, "好"))));
+ IsOkAndHolds(ElementsAre(
+ EqualsToken(Token::Type::QUERY_PROPERTY, "property1"),
+ EqualsToken(Token::Type::REGULAR, "你"),
+ EqualsToken(Token::Type::REGULAR, "好"))));
} else {
EXPECT_THAT(raw_query_tokenizer->TokenizeAll("property1:你好"),
- IsOkAndHolds(
- ElementsAre(EqualsToken(Token::QUERY_PROPERTY, "property1"),
- EqualsToken(Token::REGULAR, "你好"))));
+ IsOkAndHolds(ElementsAre(
+ EqualsToken(Token::Type::QUERY_PROPERTY, "property1"),
+ EqualsToken(Token::Type::REGULAR, "你好"))));
}
EXPECT_THAT(
@@ -504,10 +517,11 @@ TEST_F(RawQueryTokenizerTest, CJKT) {
StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT,
HasSubstr("Characters in property name must all be ASCII")));
- EXPECT_THAT(raw_query_tokenizer->TokenizeAll("cat OR ねこ"),
- IsOkAndHolds(ElementsAre(EqualsToken(Token::REGULAR, "cat"),
- EqualsToken(Token::QUERY_OR, ""),
- EqualsToken(Token::REGULAR, "ねこ"))));
+ EXPECT_THAT(
+ raw_query_tokenizer->TokenizeAll("cat OR ねこ"),
+ IsOkAndHolds(ElementsAre(EqualsToken(Token::Type::REGULAR, "cat"),
+ EqualsToken(Token::Type::QUERY_OR, ""),
+ EqualsToken(Token::Type::REGULAR, "ねこ"))));
EXPECT_THAT(
raw_query_tokenizer->TokenizeAll("cat ORねこ"),
@@ -543,40 +557,45 @@ TEST_F(RawQueryTokenizerTest, OtherChars) {
language_segmenter.get()));
// Comma is ignored
- EXPECT_THAT(raw_query_tokenizer->TokenizeAll(",term1, ,"),
- IsOkAndHolds(ElementsAre(EqualsToken(Token::REGULAR, "term1"))));
+ EXPECT_THAT(
+ raw_query_tokenizer->TokenizeAll(",term1, ,"),
+ IsOkAndHolds(ElementsAre(EqualsToken(Token::Type::REGULAR, "term1"))));
EXPECT_THAT(raw_query_tokenizer->TokenizeAll("(,term1),"),
IsOkAndHolds(ElementsAre(
- EqualsToken(Token::QUERY_LEFT_PARENTHESES, ""),
- EqualsToken(Token::REGULAR, "term1"),
- EqualsToken(Token::QUERY_RIGHT_PARENTHESES, ""))));
+ EqualsToken(Token::Type::QUERY_LEFT_PARENTHESES, ""),
+ EqualsToken(Token::Type::REGULAR, "term1"),
+ EqualsToken(Token::Type::QUERY_RIGHT_PARENTHESES, ""))));
// Exclusion operator and comma are ignored
- EXPECT_THAT(raw_query_tokenizer->TokenizeAll("-,term1"),
- IsOkAndHolds(ElementsAre(EqualsToken(Token::REGULAR, "term1"))));
+ EXPECT_THAT(
+ raw_query_tokenizer->TokenizeAll("-,term1"),
+ IsOkAndHolds(ElementsAre(EqualsToken(Token::Type::REGULAR, "term1"))));
- EXPECT_THAT(raw_query_tokenizer->TokenizeAll("-term1,"),
- IsOkAndHolds(ElementsAre(EqualsToken(Token::QUERY_EXCLUSION, ""),
- EqualsToken(Token::REGULAR, "term1"))));
+ EXPECT_THAT(
+ raw_query_tokenizer->TokenizeAll("-term1,"),
+ IsOkAndHolds(ElementsAre(EqualsToken(Token::Type::QUERY_EXCLUSION, ""),
+ EqualsToken(Token::Type::REGULAR, "term1"))));
// Colon and comma are ignored
- EXPECT_THAT(raw_query_tokenizer->TokenizeAll("property1:,term1"),
- IsOkAndHolds(ElementsAre(EqualsToken(Token::REGULAR, "property1"),
- EqualsToken(Token::REGULAR, "term1"))));
-
EXPECT_THAT(
- raw_query_tokenizer->TokenizeAll("property1:term1,term2"),
- IsOkAndHolds(ElementsAre(EqualsToken(Token::QUERY_PROPERTY, "property1"),
- EqualsToken(Token::REGULAR, "term1"),
- EqualsToken(Token::REGULAR, "term2"))));
+ raw_query_tokenizer->TokenizeAll("property1:,term1"),
+ IsOkAndHolds(ElementsAre(EqualsToken(Token::Type::REGULAR, "property1"),
+ EqualsToken(Token::Type::REGULAR, "term1"))));
+
+ EXPECT_THAT(raw_query_tokenizer->TokenizeAll("property1:term1,term2"),
+ IsOkAndHolds(ElementsAre(
+ EqualsToken(Token::Type::QUERY_PROPERTY, "property1"),
+ EqualsToken(Token::Type::REGULAR, "term1"),
+ EqualsToken(Token::Type::REGULAR, "term2"))));
// This is a special case for OR, unknown chars are treated the same as
// whitespaces before and after OR.
- EXPECT_THAT(raw_query_tokenizer->TokenizeAll("term1,OR,term2"),
- IsOkAndHolds(ElementsAre(EqualsToken(Token::REGULAR, "term1"),
- EqualsToken(Token::QUERY_OR, ""),
- EqualsToken(Token::REGULAR, "term2"))));
+ EXPECT_THAT(
+ raw_query_tokenizer->TokenizeAll("term1,OR,term2"),
+ IsOkAndHolds(ElementsAre(EqualsToken(Token::Type::REGULAR, "term1"),
+ EqualsToken(Token::Type::QUERY_OR, ""),
+ EqualsToken(Token::Type::REGULAR, "term2"))));
}
TEST_F(RawQueryTokenizerTest, Mix) {
@@ -593,37 +612,38 @@ TEST_F(RawQueryTokenizerTest, Mix) {
EXPECT_THAT(raw_query_tokenizer->TokenizeAll(
"こんにちはgood afternoon, title:今天 OR (ในวันนี้ -B12)"),
IsOkAndHolds(ElementsAre(
- EqualsToken(Token::REGULAR, "こんにちは"),
- EqualsToken(Token::REGULAR, "good"),
- EqualsToken(Token::REGULAR, "afternoon"),
- EqualsToken(Token::QUERY_PROPERTY, "title"),
- EqualsToken(Token::REGULAR, "今天"),
- EqualsToken(Token::QUERY_OR, ""),
- EqualsToken(Token::QUERY_LEFT_PARENTHESES, ""),
- EqualsToken(Token::REGULAR, "ใน"),
- EqualsToken(Token::REGULAR, "วันนี้"),
- EqualsToken(Token::QUERY_EXCLUSION, ""),
- EqualsToken(Token::REGULAR, "B12"),
- EqualsToken(Token::QUERY_RIGHT_PARENTHESES, ""))));
+ EqualsToken(Token::Type::REGULAR, "こんにちは"),
+ EqualsToken(Token::Type::REGULAR, "good"),
+ EqualsToken(Token::Type::REGULAR, "afternoon"),
+ EqualsToken(Token::Type::QUERY_PROPERTY, "title"),
+ EqualsToken(Token::Type::REGULAR, "今天"),
+ EqualsToken(Token::Type::QUERY_OR, ""),
+ EqualsToken(Token::Type::QUERY_LEFT_PARENTHESES, ""),
+ EqualsToken(Token::Type::REGULAR, "ใน"),
+ EqualsToken(Token::Type::REGULAR, "วันนี้"),
+ EqualsToken(Token::Type::QUERY_EXCLUSION, ""),
+ EqualsToken(Token::Type::REGULAR, "B12"),
+ EqualsToken(Token::Type::QUERY_RIGHT_PARENTHESES, ""))));
} else {
ICING_ASSERT_OK_AND_ASSIGN(
std::vector<Token> tokens,
raw_query_tokenizer->TokenizeAll(
"こんにちはgood afternoon, title:今天 OR (ในวันนี้ -B12)"));
- EXPECT_THAT(tokens,
- ElementsAre(EqualsToken(Token::REGULAR, "こんにちは"),
- EqualsToken(Token::REGULAR, "good"),
- EqualsToken(Token::REGULAR, "afternoon"),
- EqualsToken(Token::QUERY_PROPERTY, "title"),
- EqualsToken(Token::REGULAR, "今天"),
- EqualsToken(Token::QUERY_OR, ""),
- EqualsToken(Token::QUERY_LEFT_PARENTHESES, ""),
- EqualsToken(Token::REGULAR, "ใน"),
- EqualsToken(Token::REGULAR, "วัน"),
- EqualsToken(Token::REGULAR, "นี้"),
- EqualsToken(Token::QUERY_EXCLUSION, ""),
- EqualsToken(Token::REGULAR, "B12"),
- EqualsToken(Token::QUERY_RIGHT_PARENTHESES, "")));
+ EXPECT_THAT(
+ tokens,
+ ElementsAre(EqualsToken(Token::Type::REGULAR, "こんにちは"),
+ EqualsToken(Token::Type::REGULAR, "good"),
+ EqualsToken(Token::Type::REGULAR, "afternoon"),
+ EqualsToken(Token::Type::QUERY_PROPERTY, "title"),
+ EqualsToken(Token::Type::REGULAR, "今天"),
+ EqualsToken(Token::Type::QUERY_OR, ""),
+ EqualsToken(Token::Type::QUERY_LEFT_PARENTHESES, ""),
+ EqualsToken(Token::Type::REGULAR, "ใน"),
+ EqualsToken(Token::Type::REGULAR, "วัน"),
+ EqualsToken(Token::Type::REGULAR, "นี้"),
+ EqualsToken(Token::Type::QUERY_EXCLUSION, ""),
+ EqualsToken(Token::Type::REGULAR, "B12"),
+ EqualsToken(Token::Type::QUERY_RIGHT_PARENTHESES, "")));
}
}
diff --git a/icing/tokenization/reverse_jni/reverse-jni-language-segmenter.cc b/icing/tokenization/reverse_jni/reverse-jni-language-segmenter.cc
index b936f2b..cb474c6 100644
--- a/icing/tokenization/reverse_jni/reverse-jni-language-segmenter.cc
+++ b/icing/tokenization/reverse_jni/reverse-jni-language-segmenter.cc
@@ -43,45 +43,46 @@ class ReverseJniLanguageSegmenterIterator : public LanguageSegmenter::Iterator {
// Advances to the next term. Returns false if it has reached the end.
bool Advance() override {
- // Prerequisite check
- if (IsDone()) {
- return false;
- }
+ while (true) {
+ // Prerequisite check
+ if (IsDone()) {
+ return false;
+ }
- if (term_end_exclusive_.utf16_index() == 0) {
- int first = break_iterator_->First();
- if (!term_start_.MoveToUtf16(first)) {
- // First is guaranteed to succeed and return a position within bonds. So
- // the only possible failure could be an invalid sequence. Mark as DONE
- // and return.
+ if (term_end_exclusive_.utf16_index() == 0) {
+ int first = break_iterator_->First();
+ if (!term_start_.MoveToUtf16(first)) {
+          // First is guaranteed to succeed and return a position within bounds.
+ // So the only possible failure could be an invalid sequence. Mark as
+ // DONE and return.
+ MarkAsDone();
+ return false;
+ }
+ } else {
+ term_start_ = term_end_exclusive_;
+ }
+
+ int next_utf16_index_exclusive = break_iterator_->Next();
+ // Reached the end
+ if (next_utf16_index_exclusive == ReverseJniBreakIterator::kDone) {
+ MarkAsDone();
+ return false;
+ }
+ if (!term_end_exclusive_.MoveToUtf16(next_utf16_index_exclusive)) {
+        // next_utf16_index_exclusive is guaranteed to be within bounds thanks to
+ // the check for kDone above. So the only possible failure could be an
+ // invalid sequence. Mark as DONE and return.
MarkAsDone();
return false;
}
- } else {
- term_start_ = term_end_exclusive_;
- }
-
- int next_utf16_index_exclusive = break_iterator_->Next();
- // Reached the end
- if (next_utf16_index_exclusive == ReverseJniBreakIterator::kDone) {
- MarkAsDone();
- return false;
- }
- if (!term_end_exclusive_.MoveToUtf16(next_utf16_index_exclusive)) {
- // next_utf16_index_exclusive is guaranteed to be within bonds thanks to
- // the check for kDone above. So the only possible failure could be an
- // invalid sequence. Mark as DONE and return.
- MarkAsDone();
- return false;
- }
- // Check if the current term is valid. We consider any term valid if its
- // first character is valid. If it's not valid, then we need to advance to
- // the next term.
- if (IsValidTerm()) {
- return true;
+ // Check if the current term is valid. We consider any term valid if its
+ // first character is valid. If it's not valid, then we need to advance to
+ // the next term.
+ if (IsValidTerm()) {
+ return true;
+ }
}
- return Advance();
}
// Returns the current term. It can be called only when Advance() returns
diff --git a/icing/tokenization/token.h b/icing/tokenization/token.h
index dda9efc..0c268be 100644
--- a/icing/tokenization/token.h
+++ b/icing/tokenization/token.h
@@ -21,11 +21,14 @@ namespace icing {
namespace lib {
struct Token {
- enum Type {
+ enum class Type {
// Common types
REGULAR, // A token without special meanings, the value of it will be
// indexed or searched directly
+ VERBATIM, // A token that should be indexed and searched without any
+ // modifications to the raw text
+
// Types only used in raw query
QUERY_OR, // Indicates OR logic between its left and right tokens
QUERY_EXCLUSION, // Indicates exclusion operation on next token
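Since Token::Type is now a scoped enum, call sites must qualify enumerators with Token::Type:: and cast explicitly wherever an integer is needed, as the updated tests above show. A minimal sketch of the migration at a call site (DescribeToken is a hypothetical helper, not part of this change; Token's type and text members are as used elsewhere in this diff):

    #include <string>

    #include "icing/tokenization/token.h"

    namespace icing {
    namespace lib {

    // Hypothetical helper illustrating scoped-enum usage after this change.
    std::string DescribeToken(const Token& token) {
      // Before: Token::REGULAR. After: Token::Type::REGULAR.
      if (token.type == Token::Type::REGULAR) {
        return std::string(token.text);
      }
      // Scoped enums no longer convert to int implicitly; cast explicitly.
      return "type #" + std::to_string(static_cast<int>(token.type));
    }

    }  // namespace lib
    }  // namespace icing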
diff --git a/icing/tokenization/tokenizer-factory.cc b/icing/tokenization/tokenizer-factory.cc
index 9b59acf..b2508f7 100644
--- a/icing/tokenization/tokenizer-factory.cc
+++ b/icing/tokenization/tokenizer-factory.cc
@@ -23,6 +23,7 @@
#include "icing/tokenization/plain-tokenizer.h"
#include "icing/tokenization/raw-query-tokenizer.h"
#include "icing/tokenization/tokenizer.h"
+#include "icing/tokenization/verbatim-tokenizer.h"
#include "icing/util/status-macros.h"
namespace icing {
@@ -38,6 +39,8 @@ CreateIndexingTokenizer(StringIndexingConfig::TokenizerType::Code type,
switch (type) {
case StringIndexingConfig::TokenizerType::PLAIN:
return std::make_unique<PlainTokenizer>(lang_segmenter);
+ case StringIndexingConfig::TokenizerType::VERBATIM:
+ return std::make_unique<VerbatimTokenizer>();
case StringIndexingConfig::TokenizerType::NONE:
[[fallthrough]];
default:
diff --git a/icing/tokenization/tokenizer.h b/icing/tokenization/tokenizer.h
index b4f0c6e..2bc18cc 100644
--- a/icing/tokenization/tokenizer.h
+++ b/icing/tokenization/tokenizer.h
@@ -43,6 +43,7 @@ class Tokenizer {
enum Type {
// Index tokenizers
PLAIN, // Used to tokenize plain text input
+    VERBATIM,  // Used to tokenize the input text verbatim
// Query tokenizers
RAW_QUERY, // Used to tokenize raw queries
@@ -83,22 +84,26 @@ class Tokenizer {
// offset.
// Ex.
// auto iterator = tokenizer.Tokenize("foo bar baz").ValueOrDie();
- // iterator.ResetToTokenAfter(4);
+ // iterator.ResetToTokenStartingAfter(4);
// // The first full token starting after position 4 (the 'b' in "bar") is
// // "baz".
// PrintToken(iterator.GetToken()); // prints "baz"
- virtual bool ResetToTokenAfter(int32_t offset) { return false; }
+ virtual bool ResetToTokenStartingAfter(int32_t utf32_offset) {
+ return false;
+ }
// Sets the tokenizer to point at the first token that *ends* *before*
// offset. Returns false if there are no valid tokens ending
// before offset.
// Ex.
// auto iterator = tokenizer.Tokenize("foo bar baz").ValueOrDie();
- // iterator.ResetToTokenBefore(4);
+ // iterator.ResetToTokenEndingBefore(4);
// // The first full token ending before position 4 (the 'b' in "bar") is
// // "foo".
// PrintToken(iterator.GetToken()); // prints "foo"
- virtual bool ResetToTokenBefore(int32_t offset) { return false; }
+ virtual bool ResetToTokenEndingBefore(int32_t utf32_offset) {
+ return false;
+ }
virtual bool ResetToStart() { return false; }
};
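As the updated comments show, the renamed functions also take UTF-32 code point offsets rather than byte offsets. A small sketch tying the two examples together (SketchResetUsage is a hypothetical helper; error handling elided):

    #include <string_view>

    #include "icing/tokenization/token.h"
    #include "icing/tokenization/tokenizer.h"

    namespace icing {
    namespace lib {

    // Sketch of the renamed Reset functions; offsets are UTF-32 code point
    // indices into the original text.
    void SketchResetUsage(const Tokenizer& tokenizer) {
      auto iterator = tokenizer.Tokenize("foo bar baz").ValueOrDie();
      // First token starting after code point 4 (the 'b' in "bar") is "baz".
      if (iterator->ResetToTokenStartingAfter(/*utf32_offset=*/4)) {
        Token token = iterator->GetToken();  // token.text == "baz"
        (void)token;
      }
      // First token ending before code point 4 is "foo".
      if (iterator->ResetToTokenEndingBefore(/*utf32_offset=*/4)) {
        Token token = iterator->GetToken();  // token.text == "foo"
        (void)token;
      }
    }

    }  // namespace lib
    }  // namespace icing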
diff --git a/icing/tokenization/verbatim-tokenizer.cc b/icing/tokenization/verbatim-tokenizer.cc
new file mode 100644
index 0000000..0d3a320
--- /dev/null
+++ b/icing/tokenization/verbatim-tokenizer.cc
@@ -0,0 +1,139 @@
+// Copyright (C) 2021 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "icing/tokenization/verbatim-tokenizer.h"
+
+#include "icing/text_classifier/lib3/utils/base/statusor.h"
+#include "icing/util/character-iterator.h"
+#include "icing/util/status-macros.h"
+
+namespace icing {
+namespace lib {
+
+class VerbatimTokenIterator : public Tokenizer::Iterator {
+ public:
+ explicit VerbatimTokenIterator(std::string_view text)
+ : term_(std::move(text)) {}
+
+ bool Advance() override {
+ if (term_.empty() || has_advanced_to_end_) {
+ return false;
+ }
+
+ has_advanced_to_end_ = true;
+ return true;
+ }
+
+ Token GetToken() const override {
+ if (term_.empty() || !has_advanced_to_end_) {
+ return Token(Token::Type::INVALID);
+ }
+
+ return Token(Token::Type::VERBATIM, term_);
+ }
+
+ libtextclassifier3::StatusOr<CharacterIterator> CalculateTokenStart()
+ override {
+ if (term_.empty()) {
+ return absl_ports::AbortedError(
+ "Could not calculate start of empty token.");
+ }
+
+ return CharacterIterator(term_, 0, 0, 0);
+ }
+
+ libtextclassifier3::StatusOr<CharacterIterator> CalculateTokenEndExclusive()
+ override {
+ if (term_.empty()) {
+ return absl_ports::AbortedError(
+ "Could not calculate end of empty token.");
+ }
+
+ if (token_end_iterator_.utf8_index() >= 0) {
+ return token_end_iterator_;
+ }
+
+ bool moved_to_token_end = token_end_iterator_.MoveToUtf8(term_.length());
+ if (moved_to_token_end) {
+ return token_end_iterator_;
+ } else {
+ return absl_ports::AbortedError("Could not move to end of token.");
+ }
+ }
+
+ bool ResetToTokenStartingAfter(int32_t utf32_offset) override {
+ // We can only reset to the sole verbatim token, so we must have a negative
+ // offset for it to be considered the token after.
+ if (utf32_offset < 0) {
+ // Because we are now at the sole verbatim token, we should ensure we can
+ // no longer advance past it.
+ has_advanced_to_end_ = true;
+ return true;
+ }
+ return false;
+ }
+
+ bool ResetToTokenEndingBefore(int32_t utf32_offset) override {
+ // We can only reset to the sole verbatim token, so we must have an offset
+ // after the end of the token for the reset to be valid. This means the
+ // provided utf-32 offset must be equal to or greater than the utf-32 length
+ // of the token.
+ if (token_end_iterator_.utf8_index() < 0) {
+ // Moves one index past the end of the term.
+ bool moved_to_token_end = token_end_iterator_.MoveToUtf8(term_.length());
+ if (!moved_to_token_end) {
+ // We're unable to reset as we failed to move to the end of the term.
+ return false;
+ }
+ }
+
+ if (utf32_offset >= token_end_iterator_.utf32_index()) {
+ // Because we are now at the sole verbatim token, we should ensure we can
+ // no longer advance past it.
+ has_advanced_to_end_ = true;
+ return true;
+ }
+ return false;
+ }
+
+ bool ResetToStart() override {
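+    // There is only the sole verbatim token, so "start" means being positioned
+    // on that token; this makes GetToken() valid without a call to Advance().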
+ has_advanced_to_end_ = true;
+ return true;
+ }
+
+ private:
+ std::string_view term_;
+ CharacterIterator token_end_iterator_ = CharacterIterator(term_, -1, -1, -1);
+  // Used to determine whether we have advanced to the sole verbatim token
+ bool has_advanced_to_end_ = false;
+};
+
+libtextclassifier3::StatusOr<std::unique_ptr<Tokenizer::Iterator>>
+VerbatimTokenizer::Tokenize(std::string_view text) const {
+ return std::make_unique<VerbatimTokenIterator>(text);
+}
+
+libtextclassifier3::StatusOr<std::vector<Token>> VerbatimTokenizer::TokenizeAll(
+ std::string_view text) const {
+ ICING_ASSIGN_OR_RETURN(std::unique_ptr<Tokenizer::Iterator> iterator,
+ Tokenize(text));
+ std::vector<Token> tokens;
+ while (iterator->Advance()) {
+ tokens.push_back(iterator->GetToken());
+ }
+ return tokens;
+}
+
+} // namespace lib
+} // namespace icing
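Taken together, the iterator above yields exactly one VERBATIM token spanning the whole input, and none for empty input, which the tests added below exercise. A minimal usage sketch (error handling elided):

    #include <string_view>
    #include <vector>

    #include "icing/tokenization/token.h"
    #include "icing/tokenization/verbatim-tokenizer.h"

    namespace icing {
    namespace lib {

    // "Hello, world!" -> a single Token(Token::Type::VERBATIM, "Hello, world!");
    // "" -> an empty vector.
    std::vector<Token> SketchVerbatimTokenizeAll(std::string_view text) {
      VerbatimTokenizer tokenizer;
      return tokenizer.TokenizeAll(text).ValueOrDie();
    }

    }  // namespace lib
    }  // namespace icing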
diff --git a/icing/tokenization/verbatim-tokenizer.h b/icing/tokenization/verbatim-tokenizer.h
new file mode 100644
index 0000000..8404cf1
--- /dev/null
+++ b/icing/tokenization/verbatim-tokenizer.h
@@ -0,0 +1,41 @@
+// Copyright (C) 2021 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef ICING_TOKENIZATION_VERBATIM_TOKENIZER_H_
+#define ICING_TOKENIZATION_VERBATIM_TOKENIZER_H_
+
+#include <memory>
+#include <string_view>
+#include <vector>
+
+#include "icing/text_classifier/lib3/utils/base/statusor.h"
+#include "icing/tokenization/tokenizer.h"
+
+namespace icing {
+namespace lib {
+
+// Provides verbatim tokenization on input text
+class VerbatimTokenizer : public Tokenizer {
+ public:
+ libtextclassifier3::StatusOr<std::unique_ptr<Tokenizer::Iterator>> Tokenize(
+ std::string_view text) const override;
+
+ libtextclassifier3::StatusOr<std::vector<Token>> TokenizeAll(
+ std::string_view text) const override;
+};
+
+} // namespace lib
+} // namespace icing
+
+#endif  // ICING_TOKENIZATION_VERBATIM_TOKENIZER_H_
diff --git a/icing/tokenization/verbatim-tokenizer_test.cc b/icing/tokenization/verbatim-tokenizer_test.cc
new file mode 100644
index 0000000..e38c7aa
--- /dev/null
+++ b/icing/tokenization/verbatim-tokenizer_test.cc
@@ -0,0 +1,209 @@
+// Copyright (C) 2021 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <memory>
+#include <string_view>
+#include <utility>
+
+#include "gmock/gmock.h"
+#include "icing/portable/platform.h"
+#include "icing/testing/common-matchers.h"
+#include "icing/testing/icu-data-file-helper.h"
+#include "icing/testing/jni-test-helpers.h"
+#include "icing/testing/test-data.h"
+#include "icing/tokenization/language-segmenter-factory.h"
+#include "icing/tokenization/tokenizer-factory.h"
+#include "icing/util/character-iterator.h"
+#include "unicode/uloc.h"
+
+namespace icing {
+namespace lib {
+namespace {
+using ::testing::ElementsAre;
+using ::testing::Eq;
+using ::testing::IsEmpty;
+
+class VerbatimTokenizerTest : public ::testing::Test {
+ protected:
+ void SetUp() override {
+ if (!IsCfStringTokenization() && !IsReverseJniTokenization()) {
+ ICING_ASSERT_OK(
+ // File generated via icu_data_file rule in //icing/BUILD.
+ icu_data_file_helper::SetUpICUDataFile(
+ GetTestFilePath("icing/icu.dat")));
+ }
+
+ jni_cache_ = GetTestJniCache();
+ language_segmenter_factory::SegmenterOptions options(ULOC_US,
+ jni_cache_.get());
+ ICING_ASSERT_OK_AND_ASSIGN(
+ language_segmenter_,
+ language_segmenter_factory::Create(std::move(options)));
+ }
+
+ std::unique_ptr<const JniCache> jni_cache_;
+ std::unique_ptr<LanguageSegmenter> language_segmenter_;
+};
+
+TEST_F(VerbatimTokenizerTest, Empty) {
+ ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<Tokenizer> verbatim_tokenizer,
+ tokenizer_factory::CreateIndexingTokenizer(
+ StringIndexingConfig::TokenizerType::VERBATIM,
+ language_segmenter_.get()));
+
+ EXPECT_THAT(verbatim_tokenizer->TokenizeAll(""), IsOkAndHolds(IsEmpty()));
+}
+
+TEST_F(VerbatimTokenizerTest, Simple) {
+ ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<Tokenizer> verbatim_tokenizer,
+ tokenizer_factory::CreateIndexingTokenizer(
+ StringIndexingConfig::TokenizerType::VERBATIM,
+ language_segmenter_.get()));
+
+ EXPECT_THAT(
+ verbatim_tokenizer->TokenizeAll("foo bar"),
+ IsOkAndHolds(ElementsAre(EqualsToken(Token::Type::VERBATIM, "foo bar"))));
+}
+
+TEST_F(VerbatimTokenizerTest, Punctuation) {
+ ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<Tokenizer> verbatim_tokenizer,
+ tokenizer_factory::CreateIndexingTokenizer(
+ StringIndexingConfig::TokenizerType::VERBATIM,
+ language_segmenter_.get()));
+
+ EXPECT_THAT(verbatim_tokenizer->TokenizeAll("Hello, world!"),
+ IsOkAndHolds(ElementsAre(
+ EqualsToken(Token::Type::VERBATIM, "Hello, world!"))));
+}
+
+TEST_F(VerbatimTokenizerTest, InvalidTokenBeforeAdvancing) {
+ ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<Tokenizer> verbatim_tokenizer,
+ tokenizer_factory::CreateIndexingTokenizer(
+ StringIndexingConfig::TokenizerType::VERBATIM,
+ language_segmenter_.get()));
+
+ constexpr std::string_view kText = "Hello, world!";
+ auto token_iterator = verbatim_tokenizer->Tokenize(kText).ValueOrDie();
+
+ // We should get an invalid token if we get the token before advancing.
+ EXPECT_THAT(token_iterator->GetToken(),
+ EqualsToken(Token::Type::INVALID, ""));
+}
+
+TEST_F(VerbatimTokenizerTest, ResetToTokenEndingBefore) {
+ ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<Tokenizer> verbatim_tokenizer,
+ tokenizer_factory::CreateIndexingTokenizer(
+ StringIndexingConfig::TokenizerType::VERBATIM,
+ language_segmenter_.get()));
+
+ constexpr std::string_view kText = "Hello, world!";
+ auto token_iterator = verbatim_tokenizer->Tokenize(kText).ValueOrDie();
+
+  // Reset to the sole verbatim token. We provide an offset of 13 as it
+ // is larger than the final index (12) of the verbatim token.
+ EXPECT_TRUE(token_iterator->ResetToTokenEndingBefore(13));
+ EXPECT_THAT(token_iterator->GetToken(),
+ EqualsToken(Token::Type::VERBATIM, "Hello, world!"));
+
+  // Ensure our cached character iterator properly maintains the end of the
+ // verbatim token.
+ EXPECT_TRUE(token_iterator->ResetToTokenEndingBefore(13));
+ EXPECT_THAT(token_iterator->GetToken(),
+ EqualsToken(Token::Type::VERBATIM, "Hello, world!"));
+
+ // We should not be able to reset with an offset before or within
+ // the verbatim token's utf-32 length.
+ EXPECT_FALSE(token_iterator->ResetToTokenEndingBefore(0));
+ EXPECT_FALSE(token_iterator->ResetToTokenEndingBefore(12));
+}
+
+TEST_F(VerbatimTokenizerTest, ResetToTokenStartingAfter) {
+ ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<Tokenizer> verbatim_tokenizer,
+ tokenizer_factory::CreateIndexingTokenizer(
+ StringIndexingConfig::TokenizerType::VERBATIM,
+ language_segmenter_.get()));
+
+ constexpr std::string_view kText = "Hello, world!";
+ auto token_iterator = verbatim_tokenizer->Tokenize(kText).ValueOrDie();
+
+ // Get token without resetting
+ EXPECT_TRUE(token_iterator->Advance());
+ EXPECT_THAT(token_iterator->GetToken(),
+ EqualsToken(Token::Type::VERBATIM, "Hello, world!"));
+
+ // We expect a sole verbatim token, so it's not possible to reset after the
+ // start of the token.
+ EXPECT_FALSE(token_iterator->ResetToTokenStartingAfter(1));
+
+ // We expect to be reset to the sole verbatim token when the offset is
+ // negative.
+ EXPECT_TRUE(token_iterator->ResetToTokenStartingAfter(-1));
+ EXPECT_THAT(token_iterator->GetToken(),
+ EqualsToken(Token::Type::VERBATIM, "Hello, world!"));
+}
+
+TEST_F(VerbatimTokenizerTest, ResetToStart) {
+ ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<Tokenizer> verbatim_tokenizer,
+ tokenizer_factory::CreateIndexingTokenizer(
+ StringIndexingConfig::TokenizerType::VERBATIM,
+ language_segmenter_.get()));
+
+ constexpr std::string_view kText = "Hello, world!";
+ auto token_iterator = verbatim_tokenizer->Tokenize(kText).ValueOrDie();
+
+ // Get token without resetting
+ EXPECT_TRUE(token_iterator->Advance());
+ EXPECT_THAT(token_iterator->GetToken(),
+ EqualsToken(Token::Type::VERBATIM, "Hello, world!"));
+
+ // Retrieve token again after resetting to start
+ EXPECT_TRUE(token_iterator->ResetToStart());
+ EXPECT_THAT(token_iterator->GetToken(),
+ EqualsToken(Token::Type::VERBATIM, "Hello, world!"));
+}
+
+TEST_F(VerbatimTokenizerTest, CalculateTokenStart) {
+ ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<Tokenizer> verbatim_tokenizer,
+ tokenizer_factory::CreateIndexingTokenizer(
+ StringIndexingConfig::TokenizerType::VERBATIM,
+ language_segmenter_.get()));
+
+ constexpr std::string_view kText = "Hello, world!";
+ auto token_iterator = verbatim_tokenizer->Tokenize(kText).ValueOrDie();
+
+ ICING_ASSERT_OK_AND_ASSIGN(CharacterIterator start_character_iterator,
+ token_iterator->CalculateTokenStart());
+
+ // We should retrieve the character 'H', the first character of the token.
+ EXPECT_THAT(start_character_iterator.GetCurrentChar(), Eq('H'));
+}
+
+TEST_F(VerbatimTokenizerTest, CalculateTokenEnd) {
+ ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<Tokenizer> verbatim_tokenizer,
+ tokenizer_factory::CreateIndexingTokenizer(
+ StringIndexingConfig::TokenizerType::VERBATIM,
+ language_segmenter_.get()));
+
+ constexpr std::string_view kText = "Hello, world!";
+ auto token_iterator = verbatim_tokenizer->Tokenize(kText).ValueOrDie();
+
+ ICING_ASSERT_OK_AND_ASSIGN(CharacterIterator end_character_iterator,
+ token_iterator->CalculateTokenEndExclusive());
+
+  // We should retrieve the null character, as the returned character
+ // iterator will be set one past the end of the token.
+ EXPECT_THAT(end_character_iterator.GetCurrentChar(), Eq('\0'));
+}
+
+} // namespace
+} // namespace lib
+} // namespace icing
diff --git a/icing/transform/icu/icu-normalizer_benchmark.cc b/icing/transform/icu/icu-normalizer_benchmark.cc
index 8d09be2..fdd4c70 100644
--- a/icing/transform/icu/icu-normalizer_benchmark.cc
+++ b/icing/transform/icu/icu-normalizer_benchmark.cc
@@ -14,8 +14,8 @@
#include "testing/base/public/benchmark.h"
#include "gmock/gmock.h"
-#include "icing/helpers/icu/icu-data-file-helper.h"
#include "icing/testing/common-matchers.h"
+#include "icing/testing/icu-data-file-helper.h"
#include "icing/testing/test-data.h"
#include "icing/transform/normalizer-factory.h"
#include "icing/transform/normalizer.h"
diff --git a/icing/transform/icu/icu-normalizer_test.cc b/icing/transform/icu/icu-normalizer_test.cc
index a46fcc7..143da17 100644
--- a/icing/transform/icu/icu-normalizer_test.cc
+++ b/icing/transform/icu/icu-normalizer_test.cc
@@ -16,8 +16,8 @@
#include "gmock/gmock.h"
#include "gtest/gtest.h"
-#include "icing/helpers/icu/icu-data-file-helper.h"
#include "icing/testing/common-matchers.h"
+#include "icing/testing/icu-data-file-helper.h"
#include "icing/testing/icu-i18n-test-utils.h"
#include "icing/testing/test-data.h"
#include "icing/transform/normalizer-factory.h"
diff --git a/icing/util/character-iterator.cc b/icing/util/character-iterator.cc
index d483031..0ab1e50 100644
--- a/icing/util/character-iterator.cc
+++ b/icing/util/character-iterator.cc
@@ -49,6 +49,8 @@ bool CharacterIterator::MoveToUtf8(int desired_utf8_index) {
}
bool CharacterIterator::AdvanceToUtf8(int desired_utf8_index) {
+ ResetToStartIfNecessary();
+
if (desired_utf8_index > text_.length()) {
// Enforce the requirement.
return false;
@@ -120,6 +122,8 @@ bool CharacterIterator::MoveToUtf16(int desired_utf16_index) {
}
bool CharacterIterator::AdvanceToUtf16(int desired_utf16_index) {
+ ResetToStartIfNecessary();
+
UChar32 uchar32 = cached_current_char_;
while (utf16_index_ < desired_utf16_index) {
uchar32 =
@@ -190,6 +194,8 @@ bool CharacterIterator::MoveToUtf32(int desired_utf32_index) {
}
bool CharacterIterator::AdvanceToUtf32(int desired_utf32_index) {
+ ResetToStartIfNecessary();
+
UChar32 uchar32 = cached_current_char_;
while (utf32_index_ < desired_utf32_index) {
uchar32 =
@@ -249,5 +255,15 @@ bool CharacterIterator::RewindToUtf32(int desired_utf32_index) {
return true;
}
+void CharacterIterator::ResetToStartIfNecessary() {
+ if (utf8_index_ < 0 || utf16_index_ < 0 || utf32_index_ < 0) {
+ utf8_index_ = 0;
+ utf16_index_ = 0;
+ utf32_index_ = 0;
+ cached_current_char_ =
+ i18n_utils::GetUChar32At(text_.data(), text_.length(), 0);
+ }
+}
+
} // namespace lib
} // namespace icing
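The new ResetToStartIfNecessary hook is what makes negative indices usable as an "uninitialized" sentinel, which is exactly how VerbatimTokenIterator seeds its token_end_iterator_ with (-1, -1, -1). A small sketch of the now-supported pattern (illustrative only):

    #include <string_view>

    #include "icing/util/character-iterator.h"

    namespace icing {
    namespace lib {

    // An iterator seeded with negative sentinel indices is snapped back to the
    // start of the text before its first advance, instead of misbehaving.
    bool SketchAdvanceFromSentinel(std::string_view text) {
      CharacterIterator iterator(text, /*utf8_index=*/-1, /*utf16_index=*/-1,
                                 /*utf32_index=*/-1);
      return iterator.MoveToUtf8(static_cast<int>(text.length()));
    }

    }  // namespace lib
    }  // namespace icing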
diff --git a/icing/util/character-iterator.h b/icing/util/character-iterator.h
index c7569a7..893718a 100644
--- a/icing/util/character-iterator.h
+++ b/icing/util/character-iterator.h
@@ -99,6 +99,10 @@ class CharacterIterator {
}
private:
+ // Resets the character iterator to the start of the text if any of the
+ // indices are negative.
+ void ResetToStartIfNecessary();
+
std::string_view text_;
UChar32 cached_current_char_;
int utf8_index_;
diff --git a/icing/util/character-iterator_test.cc b/icing/util/character-iterator_test.cc
index 445f837..195a47b 100644
--- a/icing/util/character-iterator_test.cc
+++ b/icing/util/character-iterator_test.cc
@@ -231,5 +231,36 @@ TEST(CharacterIteratorTest, InvalidUtf) {
EXPECT_THAT(iterator, Eq(exp_iterator));
}
+TEST(CharacterIteratorTest, MoveToUtfNegativeIndex) {
+ constexpr std::string_view kText = "¿Dónde está la biblioteca?";
+
+ CharacterIterator iterator_utf8(kText, /*utf8_index=*/-1, /*utf16_index=*/0,
+ /*utf32_index=*/0);
+ // We should be able to successfully move when the index is negative.
+ EXPECT_THAT(iterator_utf8.MoveToUtf8(0), IsTrue());
+ // The character cache should be reset and contain the first character when
+ // resetting to index 0.
+ EXPECT_THAT(UCharToString(iterator_utf8.GetCurrentChar()), Eq("¿"));
+ EXPECT_THAT(iterator_utf8.utf8_index(), Eq(0));
+ EXPECT_THAT(iterator_utf8.utf16_index(), Eq(0));
+ EXPECT_THAT(iterator_utf8.utf32_index(), Eq(0));
+
+ CharacterIterator iterator_utf16(kText, /*utf8_index=*/0, /*utf16_index=*/-1,
+ /*utf32_index=*/0);
+ EXPECT_THAT(iterator_utf16.MoveToUtf16(1), IsTrue());
+ EXPECT_THAT(iterator_utf16.GetCurrentChar(), Eq('D'));
+ EXPECT_THAT(iterator_utf16.utf8_index(), Eq(2));
+ EXPECT_THAT(iterator_utf16.utf16_index(), Eq(1));
+ EXPECT_THAT(iterator_utf16.utf32_index(), Eq(1));
+
+ CharacterIterator iterator_utf32(kText, /*utf8_index=*/0, /*utf16_index=*/0,
+ /*utf32_index=*/-1);
+ EXPECT_THAT(iterator_utf32.MoveToUtf32(2), IsTrue());
+ EXPECT_THAT(UCharToString(iterator_utf32.GetCurrentChar()), Eq("ó"));
+ EXPECT_THAT(iterator_utf32.utf8_index(), Eq(3));
+ EXPECT_THAT(iterator_utf32.utf16_index(), Eq(2));
+ EXPECT_THAT(iterator_utf32.utf32_index(), Eq(2));
+}
+
} // namespace lib
} // namespace icing
diff --git a/icing/util/document-validator_test.cc b/icing/util/document-validator_test.cc
index cb013d7..2261c37 100644
--- a/icing/util/document-validator_test.cc
+++ b/icing/util/document-validator_test.cc
@@ -46,15 +46,15 @@ constexpr char kPropertyEmails[] = "emails";
constexpr char kDefaultNamespace[] = "icing";
constexpr char kDefaultString[] = "This is a string.";
-constexpr PropertyConfigProto_Cardinality_Code CARDINALITY_OPTIONAL =
- PropertyConfigProto_Cardinality_Code_OPTIONAL;
-constexpr PropertyConfigProto_Cardinality_Code CARDINALITY_REQUIRED =
- PropertyConfigProto_Cardinality_Code_REQUIRED;
-constexpr PropertyConfigProto_Cardinality_Code CARDINALITY_REPEATED =
- PropertyConfigProto_Cardinality_Code_REPEATED;
-
-constexpr PropertyConfigProto_DataType_Code TYPE_STRING =
- PropertyConfigProto_DataType_Code_STRING;
+constexpr PropertyConfigProto::Cardinality::Code CARDINALITY_OPTIONAL =
+ PropertyConfigProto::Cardinality::OPTIONAL;
+constexpr PropertyConfigProto::Cardinality::Code CARDINALITY_REQUIRED =
+ PropertyConfigProto::Cardinality::REQUIRED;
+constexpr PropertyConfigProto::Cardinality::Code CARDINALITY_REPEATED =
+ PropertyConfigProto::Cardinality::REPEATED;
+
+constexpr PropertyConfigProto::DataType::Code TYPE_STRING =
+ PropertyConfigProto::DataType::STRING;
class DocumentValidatorTest : public ::testing::Test {
protected:
diff --git a/java/tests/instrumentation/src/com/google/android/icing/IcingSearchEngineTest.java b/java/tests/instrumentation/src/com/google/android/icing/IcingSearchEngineTest.java
index cb28331..a46814c 100644
--- a/java/tests/instrumentation/src/com/google/android/icing/IcingSearchEngineTest.java
+++ b/java/tests/instrumentation/src/com/google/android/icing/IcingSearchEngineTest.java
@@ -53,7 +53,9 @@ import com.google.android.icing.proto.StringIndexingConfig;
import com.google.android.icing.proto.StringIndexingConfig.TokenizerType;
import com.google.android.icing.proto.SuggestionResponse;
import com.google.android.icing.proto.SuggestionSpecProto;
+import com.google.android.icing.proto.SuggestionSpecProto.SuggestionScoringSpecProto;
import com.google.android.icing.proto.TermMatchType;
+import com.google.android.icing.proto.TermMatchType.Code;
import com.google.android.icing.proto.UsageReport;
import com.google.android.icing.IcingSearchEngine;
import java.io.File;
@@ -650,7 +652,14 @@ public final class IcingSearchEngineTest {
assertStatusOk(icingSearchEngine.put(emailDocument2).getStatus());
SuggestionSpecProto suggestionSpec =
- SuggestionSpecProto.newBuilder().setPrefix("f").setNumToReturn(10).build();
+ SuggestionSpecProto.newBuilder()
+ .setPrefix("f")
+ .setNumToReturn(10)
+ .setScoringSpec(
+ SuggestionScoringSpecProto.newBuilder()
+ .setScoringMatchType(Code.EXACT_ONLY)
+ .build())
+ .build();
SuggestionResponse response = icingSearchEngine.searchSuggestions(suggestionSpec);
assertStatusOk(response.getStatus());
diff --git a/proto/icing/proto/debug.proto b/proto/icing/proto/debug.proto
new file mode 100644
index 0000000..504ae43
--- /dev/null
+++ b/proto/icing/proto/debug.proto
@@ -0,0 +1,127 @@
+// Copyright 2022 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+syntax = "proto2";
+
+package icing.lib;
+
+import "icing/proto/schema.proto";
+import "icing/proto/status.proto";
+import "icing/proto/storage.proto";
+
+option java_package = "com.google.android.icing.proto";
+option java_multiple_files = true;
+option objc_class_prefix = "ICNG";
+
+// Next tag: 4
+message IndexDebugInfoProto {
+ // Storage information of the index.
+ optional IndexStorageInfoProto index_storage_info = 1;
+
+ message MainIndexDebugInfoProto {
+ // Information about the main lexicon.
+ // TODO(b/222349894) Convert the string output to a protocol buffer instead.
+ optional string lexicon_info = 1;
+
+ // Last added document id.
+ optional uint32 last_added_document_id = 2;
+
+ // If verbosity > 0, return information about the posting list storage.
+ // TODO(b/222349894) Convert the string output to a protocol buffer instead.
+ optional string flash_index_storage_info = 3;
+ }
+ optional MainIndexDebugInfoProto main_index_info = 2;
+
+ message LiteIndexDebugInfoProto {
+ // Current number of hits.
+ optional uint32 curr_size = 1;
+
+ // The maximum possible number of hits.
+ optional uint32 hit_buffer_size = 2;
+
+ // Last added document id.
+ optional uint32 last_added_document_id = 3;
+
+ // The first position in the hit buffer that is not sorted yet,
+ // or curr_size if all hits are sorted.
+ optional uint32 searchable_end = 4;
+
+ // The most recent checksum of the lite index, by calling
+ // LiteIndex::ComputeChecksum().
+ optional uint32 index_crc = 5;
+
+ // Information about the lite lexicon.
+ // TODO(b/222349894) Convert the string output to a protocol buffer instead.
+ optional string lexicon_info = 6;
+ }
+ optional LiteIndexDebugInfoProto lite_index_info = 3;
+}
+
+// Next tag: 4
+message DocumentDebugInfoProto {
+ // Storage information of the document store.
+ optional DocumentStorageInfoProto document_storage_info = 1;
+
+ // The most recent checksum of the document store, by calling
+ // DocumentStore::ComputeChecksum().
+ optional uint32 crc = 2;
+
+ message CorpusInfo {
+ optional string namespace = 1;
+ optional string schema = 2;
+ optional uint32 total_documents = 3;
+ optional uint32 total_token = 4;
+ }
+
+ // If verbosity > 0, return the total number of documents and tokens in each
+ // (namespace, schema type) pair.
+ // Note that deleted and expired documents are skipped in the output.
+ repeated CorpusInfo corpus_info = 3;
+}
+
+// Next tag: 3
+message SchemaDebugInfoProto {
+ // Copy of the SchemaProto if it has been set in the schema store.
+ // Modifying this does not affect the Schema that IcingSearchEngine holds.
+ optional SchemaProto schema = 1;
+
+ // The most recent checksum of the schema store, by calling
+ // SchemaStore::ComputeChecksum().
+ optional uint32 crc = 2;
+}
+
+// Next tag: 4
+message DebugInfoProto {
+ // Debug information of the index.
+ optional IndexDebugInfoProto index_info = 1;
+
+ // Debug information of the document store.
+ optional DocumentDebugInfoProto document_info = 2;
+
+ // Debug information of the schema store.
+ optional SchemaDebugInfoProto schema_info = 3;
+}
+
+// Next tag: 3
+message DebugInfoResultProto {
+ // Status code can be one of:
+ // OK
+ // FAILED_PRECONDITION
+ //
+ // See status.proto for more details.
+ optional StatusProto status = 1;
+
+ // Debug information for Icing.
+ optional DebugInfoProto debug_info = 2;
+}
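A short sketch of reading a few of these fields from C++, assuming a DebugInfoResultProto obtained from whatever API surfaces it (that API is outside this diff; accessor names follow standard protobuf codegen, and the header paths are assumptions based on the imports above):

    #include "icing/proto/debug.pb.h"
    #include "icing/proto/status.pb.h"

    namespace icing {
    namespace lib {

    // Returns the lite index fill ratio, or -1 if the call failed.
    double SketchLiteIndexFillRatio(const DebugInfoResultProto& result) {
      if (result.status().code() != StatusProto::OK) {
        return -1;
      }
      const IndexDebugInfoProto::LiteIndexDebugInfoProto& lite_info =
          result.debug_info().index_info().lite_index_info();
      if (lite_info.hit_buffer_size() == 0) {
        return -1;
      }
      // Fill level: curr_size hits out of hit_buffer_size capacity.
      return static_cast<double>(lite_info.curr_size()) /
             lite_info.hit_buffer_size();
    }

    }  // namespace lib
    }  // namespace icing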
diff --git a/proto/icing/proto/document.proto b/proto/icing/proto/document.proto
index 2e8321b..1a501e7 100644
--- a/proto/icing/proto/document.proto
+++ b/proto/icing/proto/document.proto
@@ -209,7 +209,7 @@ message DeleteBySchemaTypeResultProto {
}
// Result of a call to IcingSearchEngine.DeleteByQuery
-// Next tag: 4
+// Next tag: 5
message DeleteByQueryResultProto {
// Status code can be one of:
// OK
@@ -226,5 +226,18 @@ message DeleteByQueryResultProto {
// Stats for delete execution performance.
optional DeleteByQueryStatsProto delete_by_query_stats = 3;
+ // Used by DeleteByQueryResultProto to return information about deleted
+ // documents.
+ message DocumentGroupInfo {
+ optional string namespace = 1;
+ optional string schema = 2;
+ repeated string uris = 3;
+ }
+
+ // Additional return message that shows the uris of the deleted documents, if
+ // users set return_deleted_document_info to true.
+ // The result is grouped by the corresponding namespace and type.
+ repeated DocumentGroupInfo deleted_documents = 4;
+
reserved 2;
}
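A minimal sketch of consuming the new repeated field from C++ (accessor names follow standard protobuf codegen; note the field named "namespace" is exposed as namespace_() because it collides with the C++ keyword):

    #include "icing/proto/document.pb.h"

    namespace icing {
    namespace lib {

    // Counts every uri reported for deleted documents, across all
    // (namespace, schema type) groups.
    int SketchCountDeletedUris(const DeleteByQueryResultProto& result) {
      int total = 0;
      for (const DeleteByQueryResultProto::DocumentGroupInfo& group :
           result.deleted_documents()) {
        // group.namespace_() and group.schema() identify the corpus.
        total += group.uris_size();
      }
      return total;
    }

    }  // namespace lib
    }  // namespace icing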
diff --git a/proto/icing/proto/initialize.proto b/proto/icing/proto/initialize.proto
index ab2556d..7fe1e6f 100644
--- a/proto/icing/proto/initialize.proto
+++ b/proto/icing/proto/initialize.proto
@@ -30,19 +30,6 @@ message IcingSearchEngineOptions {
// the index saved by the last instance.
optional string base_dir = 1;
- // The maximum number of tokens to be allowed per document. If a document
- // exceeds this number of tokens, then only the first max_tokens_per_doc
- // will be indexed.
- //
- // Clients may use this value to prevent the possibility of a select few
- // documents from exhausting limits in the index that are shared between all
- // documents (ie max allowed index size).
- //
- // Valid values: [1, INT_MAX], Current default is 1/5 of the default of
- // max_document_size.
- // Optional.
- optional int32 max_tokens_per_doc = 2 [default = 13107];
-
// The maximum allowable token length. All tokens in excess of this size
// will be truncated to max_token_length before being indexed.
//
@@ -70,6 +57,8 @@ message IcingSearchEngineOptions {
// Valid values: [1, INT_MAX]
// Optional.
optional int32 index_merge_size = 4 [default = 1048576]; // 1 MiB
+
+ reserved 2;
}
// Result of a call to IcingSearchEngine.Initialize
diff --git a/proto/icing/proto/logging.proto b/proto/icing/proto/logging.proto
index 2f1f271..0a7c4a6 100644
--- a/proto/icing/proto/logging.proto
+++ b/proto/icing/proto/logging.proto
@@ -46,6 +46,9 @@ message InitializeStatsProto {
// Random I/O errors.
IO_ERROR = 4;
+
+ // The document log is using legacy format.
+ LEGACY_DOCUMENT_LOG_FORMAT = 5;
}
// Possible recovery causes for document store:
diff --git a/proto/icing/proto/schema.proto b/proto/icing/proto/schema.proto
index c611cbf..ffb6f2c 100644
--- a/proto/icing/proto/schema.proto
+++ b/proto/icing/proto/schema.proto
@@ -91,6 +91,14 @@ message StringIndexingConfig {
// Tokenization for plain text.
PLAIN = 1;
+
+      // Tokenizes text verbatim. This means no normalization or segmentation
+      // is applied to string values that are tokenized using this type.
+      // Therefore, the output token is equivalent to the raw string text. For
+      // example, "Hello, world!" would be tokenized as "Hello, world!",
+      // preserving punctuation and capitalization, and not creating separate
+      // tokens at the space.
+ VERBATIM = 2;
}
}
optional TokenizerType.Code tokenizer_type = 2;
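A minimal sketch of opting a string property into the new tokenizer via the raw proto setters (the property name "verbatimProperty" is just an example; the cardinality and data type shown are one plausible configuration):

    #include "icing/proto/schema.pb.h"

    namespace icing {
    namespace lib {

    // Builds a string property whose values will be indexed verbatim.
    PropertyConfigProto SketchVerbatimProperty() {
      PropertyConfigProto property;
      property.set_property_name("verbatimProperty");
      property.set_data_type(PropertyConfigProto::DataType::STRING);
      property.set_cardinality(PropertyConfigProto::Cardinality::OPTIONAL);
      property.mutable_string_indexing_config()->set_tokenizer_type(
          StringIndexingConfig::TokenizerType::VERBATIM);
      return property;
    }

    }  // namespace lib
    }  // namespace icing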
diff --git a/proto/icing/proto/scoring.proto b/proto/icing/proto/scoring.proto
index a3a64df..71c943e 100644
--- a/proto/icing/proto/scoring.proto
+++ b/proto/icing/proto/scoring.proto
@@ -116,8 +116,9 @@ message PropertyWeight {
// specified, the property weight is discarded.
optional string path = 1;
- // Property weight, valid values are positive. Zero and negative weights are
- // invalid and will result in an error. By default, a property is given a raw,
- // pre-normalized weight of 1.0.
+ // Property weight, valid values are positive and zero. Setting a zero
+ // property weight will remove scoring contribution for a query term match in
+ // the property. Negative weights are invalid and will result in an error.
+ // By default, a property is given a raw, pre-normalized weight of 1.0.
optional double weight = 2;
}
diff --git a/proto/icing/proto/search.proto b/proto/icing/proto/search.proto
index c712ab2..f005c76 100644
--- a/proto/icing/proto/search.proto
+++ b/proto/icing/proto/search.proto
@@ -85,16 +85,16 @@ message ResultSpecProto {
// have snippet information provided. If set to 0, snippeting is disabled.
optional int32 num_matches_per_property = 2;
- // How large of a window to provide. Windows start at max_window_bytes / 2
- // bytes before the middle of the matching token and end at max_window_bytes
- // / 2 bytes after the middle of the matching token. Windowing respects
- // token boundaries.
- // Therefore, the returned window may be smaller than requested. Setting
- // max_window_bytes to 0 will disable windowing information. If matches
- // enabled is also set to false, then snippeting is disabled.
- // Ex. max_window_bytes = 16. "foo bar baz bat rat" with a query of "baz"
+  // How large of a window to provide. Windows start at
+  // max_window_utf32_length / 2 code points before the middle of the matching
+  // token and end at max_window_utf32_length / 2 code points after the middle
+  // of the matching token. Windowing respects token boundaries. Therefore,
+  // the returned window may be smaller than requested. Setting
+  // max_window_utf32_length to 0 will disable windowing information. If
+  // matches enabled is also set to false, then snippeting is disabled. Ex.
+  // max_window_utf32_length = 16. "foo bar baz bat rat" with a query of "baz"
// will return a window of "bar baz bat" which is only 11 bytes long.
- optional int32 max_window_bytes = 3;
+ optional int32 max_window_utf32_length = 3;
}
optional SnippetSpecProto snippet_spec = 3;
@@ -309,7 +309,7 @@ message GetResultSpecProto {
repeated TypePropertyMask type_property_masks = 1;
}
-// Next tag: 4
+// Next tag: 5
message SuggestionSpecProto {
// REQUIRED: The "raw" prefix string that users may type. For example, "f"
// will search for suggested query that start with "f" like "foo", "fool".
@@ -323,6 +323,23 @@ message SuggestionSpecProto {
// REQUIRED: The number of suggestions to be returned.
optional int32 num_to_return = 3;
+
+ // Indicates how the suggestion terms should be scored and ranked.
+ message SuggestionScoringSpecProto {
+ // TermMatchType.Code=UNKNOWN
+ // Should never purposely be set and may lead to undefined behavior. This is
+ // used for backwards compatibility reasons.
+ //
+ // TermMatchType.Code=EXACT_ONLY
+ // Only exact hits will be counted to score a suggestion term.
+ //
+ // TermMatchType.Code=PREFIX
+ // Both exact hits and prefix hits will be counted to score a suggestion
+ // term.
+ optional TermMatchType.Code scoring_match_type = 1;
+ }
+
+ optional SuggestionScoringSpecProto scoring_spec = 4;
}
// Next tag: 3
diff --git a/synced_AOSP_CL_number.txt b/synced_AOSP_CL_number.txt
index 7e0431b..73d349b 100644
--- a/synced_AOSP_CL_number.txt
+++ b/synced_AOSP_CL_number.txt
@@ -1 +1 @@
-set(synced_AOSP_CL_number=404879391)
+set(synced_AOSP_CL_number=436284873)