author    Tim Barron <tjbarron@google.com>    2022-04-12 14:30:14 -0700
committer Tim Barron <tjbarron@google.com>    2022-04-12 14:36:38 -0700
commit    d5c9ae94052a0f2f1b9ddec9dbbe502bc4f11d54 (patch)
tree      90b929dc92d5874b5c15caca064401196ab4fc65
parent    beff93fe1f5165aeeb871d9711963aa1846299ae (diff)
Sync from upstream.
======================================================================
Refactor DocumentStore::Initialize to improve readability of document store recovery.
======================================================================
Remove non-NDK API usages of ICU4C in libicing.
======================================================================
Move IcuDataFileHelper to the testing directory since it is a test-only util.
======================================================================
Support dump function for DocumentStore.
======================================================================
Switch to PRead rather than MMap in the proto log.
======================================================================
Support dump function for main/lite index and lexicon.
======================================================================
Fix LiteIndex::AppendHits.
======================================================================
Enable and fix DocumentStoreTest.LoadScoreCacheAndInitializeSuccessfully.
======================================================================
Fix MainIndex::GetStorageInfo.
======================================================================
Fix icing-search-engine_fuzz_test by making IcuLanguageSegmenterIterator::Advance non-recursive.
======================================================================
Allow returning additional information for deleted documents in DeleteByQuery.
======================================================================
Use enum class in Token::Type for better type safety.
======================================================================

Bug: 158089703
Bug: 185845269
Bug: 209071710
Bug: 211785521
Bug: 218413237
Bug: 223549255

Change-Id: Id2786047ab279734bdd2aee883e82607b6a0e403
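The most visible API change in this sync is the DeleteByQuery extension: the method gains an optional return_deleted_document_info parameter (defaulting to false, so existing callers are unaffected), and when the flag is set the result proto lists the deleted documents grouped by (namespace, schema type). A minimal caller sketch, assuming an initialized IcingSearchEngine named icing; the group accessors are inferred from the setters used in icing-search-engine.cc below:

    SearchSpecProto search_spec;
    search_spec.set_query("message");
    search_spec.set_term_match_type(TermMatchType::EXACT_ONLY);

    DeleteByQueryResultProto result =
        icing.DeleteByQuery(search_spec, /*return_deleted_document_info=*/true);
    if (result.status().code() == StatusProto::OK) {
      // Each group carries one (namespace, schema) pair plus the uris of the
      // documents deleted from it.
      for (const DeleteByQueryResultProto::DocumentGroupInfo& group :
           result.deleted_documents()) {
        ICING_VLOG(1) << group.namespace_() << "/" << group.schema() << ": "
                      << group.uris_size() << " documents deleted";
      }
    }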
-rw-r--r--icing/file/file-backed-bitmap.cc10
-rw-r--r--icing/file/filesystem.h5
-rw-r--r--icing/file/portable-file-backed-proto-log.h133
-rw-r--r--icing/file/portable-file-backed-proto-log_benchmark.cc12
-rw-r--r--icing/file/portable-file-backed-proto-log_test.cc5
-rw-r--r--icing/icing-search-engine.cc88
-rw-r--r--icing/icing-search-engine.h5
-rw-r--r--icing/icing-search-engine_fuzz_test.cc2
-rw-r--r--icing/icing-search-engine_test.cc103
-rw-r--r--icing/index/index-processor_benchmark.cc2
-rw-r--r--icing/index/index-processor_test.cc2
-rw-r--r--icing/index/index.cc8
-rw-r--r--icing/index/index.h12
-rw-r--r--icing/index/index_test.cc52
-rw-r--r--icing/index/lite/lite-index.cc38
-rw-r--r--icing/index/lite/lite-index.h3
-rw-r--r--icing/index/lite/lite-index_test.cc110
-rw-r--r--icing/index/main/flash-index-storage.h1
-rw-r--r--icing/index/main/main-index.cc32
-rw-r--r--icing/index/main/main-index.h4
-rw-r--r--icing/legacy/index/icing-dynamic-trie.cc19
-rw-r--r--icing/legacy/index/icing-dynamic-trie.h9
-rw-r--r--icing/legacy/index/icing-filesystem.h5
-rw-r--r--icing/legacy/index/icing-flash-bitmap.h1
-rw-r--r--icing/query/query-processor_benchmark.cc2
-rw-r--r--icing/query/query-processor_test.cc2
-rw-r--r--icing/query/suggestion-processor_test.cc2
-rw-r--r--icing/result/result-retriever_test.cc6
-rw-r--r--icing/result/snippet-retriever.cc39
-rw-r--r--icing/result/snippet-retriever_test.cc2
-rw-r--r--icing/schema/schema-store.cc9
-rw-r--r--icing/store/document-log-creator.cc9
-rw-r--r--icing/store/document-log-creator.h12
-rw-r--r--icing/store/document-store.cc140
-rw-r--r--icing/store/document-store.h19
-rw-r--r--icing/store/document-store_test.cc218
-rw-r--r--icing/testing/icu-data-file-helper.cc (renamed from icing/helpers/icu/icu-data-file-helper.cc)2
-rw-r--r--icing/testing/icu-data-file-helper.h (renamed from icing/helpers/icu/icu-data-file-helper.h)6
-rw-r--r--icing/testing/random-string.h1
-rw-r--r--icing/tokenization/icu/icu-language-segmenter.cc54
-rw-r--r--icing/tokenization/icu/icu-language-segmenter_test.cc2
-rw-r--r--icing/tokenization/language-segmenter-iterator_test.cc2
-rw-r--r--icing/tokenization/language-segmenter_benchmark.cc2
-rw-r--r--icing/tokenization/plain-tokenizer.cc4
-rw-r--r--icing/tokenization/plain-tokenizer_test.cc248
-rw-r--r--icing/tokenization/raw-query-tokenizer.cc22
-rw-r--r--icing/tokenization/raw-query-tokenizer_test.cc524
-rw-r--r--icing/tokenization/reverse_jni/reverse-jni-language-segmenter.cc67
-rw-r--r--icing/tokenization/token.h2
-rw-r--r--icing/tokenization/verbatim-tokenizer.cc4
-rw-r--r--icing/tokenization/verbatim-tokenizer_test.cc25
-rw-r--r--icing/transform/icu/icu-normalizer_benchmark.cc2
-rw-r--r--icing/transform/icu/icu-normalizer_test.cc2
-rw-r--r--proto/icing/proto/debug.proto112
-rw-r--r--proto/icing/proto/document.proto15
-rw-r--r--proto/icing/proto/logging.proto3
-rw-r--r--synced_AOSP_CL_number.txt2
57 files changed, 1469 insertions, 753 deletions
diff --git a/icing/file/file-backed-bitmap.cc b/icing/file/file-backed-bitmap.cc
index f1e568c..eec7668 100644
--- a/icing/file/file-backed-bitmap.cc
+++ b/icing/file/file-backed-bitmap.cc
@@ -50,7 +50,7 @@ FileBackedBitmap::Create(const Filesystem* filesystem,
auto bitmap = std::unique_ptr<FileBackedBitmap>(
new FileBackedBitmap(filesystem, file_path, mmap_strategy));
- // TODO(b/144458732): Implement a more robust version of TC_RETURN_IF_ERROR
+ // TODO(b/216487496): Implement a more robust version of TC_RETURN_IF_ERROR
// that can support error logging.
libtextclassifier3::Status status = bitmap->Initialize();
if (!status.ok()) {
@@ -122,7 +122,7 @@ libtextclassifier3::Status FileBackedBitmap::FileBackedBitmap::Initialize() {
<< " of size: " << file_size;
}
- // TODO(b/144458732): Implement a more robust version of TC_RETURN_IF_ERROR
+ // TODO(b/216487496): Implement a more robust version of TC_RETURN_IF_ERROR
// that can support error logging.
libtextclassifier3::Status status = mmapper_->Remap(0, file_size);
if (!status.ok()) {
@@ -198,7 +198,7 @@ int FileBackedBitmap::NumBits() const {
libtextclassifier3::Status FileBackedBitmap::Set(int bit_index,
bool bit_value) {
if (bit_index >= NumBits()) {
- // TODO(b/144458732): Implement a more robust version of TC_RETURN_IF_ERROR
+ // TODO(b/216487496): Implement a more robust version of TC_RETURN_IF_ERROR
// that can support error logging.
libtextclassifier3::Status status = GrowTo(bit_index);
if (!status.ok()) {
@@ -261,7 +261,7 @@ libtextclassifier3::Status FileBackedBitmap::GrowTo(int new_num_bits) {
file_path_.c_str(), new_file_size));
}
- // TODO(b/144458732): Implement a more robust version of TC_RETURN_IF_ERROR
+ // TODO(b/216487496): Implement a more robust version of TC_RETURN_IF_ERROR
// that can support error logging.
libtextclassifier3::Status status = mmapper_->Remap(0, new_file_size);
if (!status.ok()) {
@@ -281,7 +281,7 @@ libtextclassifier3::Status FileBackedBitmap::TruncateTo(int new_num_bits) {
}
const size_t new_file_size = FileSizeForBits(new_num_bits);
- // TODO(b/144458732): Implement a more robust version of TC_RETURN_IF_ERROR
+ // TODO(b/216487496): Implement a more robust version of TC_RETURN_IF_ERROR
// that can support error logging.
libtextclassifier3::Status status = mmapper_->Remap(0, new_file_size);
if (!status.ok()) {
diff --git a/icing/file/filesystem.h b/icing/file/filesystem.h
index ca8c4a8..dd2c5d1 100644
--- a/icing/file/filesystem.h
+++ b/icing/file/filesystem.h
@@ -233,6 +233,11 @@ class Filesystem {
// Increments to_increment by size if size is valid, or sets to_increment
// to kBadFileSize if either size or to_increment is kBadFileSize.
static void IncrementByOrSetInvalid(int64_t size, int64_t* to_increment);
+
+ // Return -1 if file_size is invalid. Otherwise, return file_size.
+ static int64_t SanitizeFileSize(int64_t file_size) {
+ return (file_size != kBadFileSize) ? file_size : -1;
+ }
};
// LINT.ThenChange(//depot/google3/icing/file/mock-filesystem.h)
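SanitizeFileSize is a small helper that collapses a sentinel-translation pattern which several later hunks in this change delete from their call sites: GetFileSize and GetDiskUsage report failure as kBadFileSize, while the storage-info protos use -1 as their "unknown" sentinel. A before/after sketch (the stats field name here is illustrative, not taken from the diff):

    int64_t size = filesystem->GetDiskUsage(base_dir.c_str());
    // Before: every call site hand-translated the error sentinel.
    if (size != Filesystem::kBadFileSize) {
      stats->set_storage_size(size);
    } else {
      stats->set_storage_size(-1);  // Sentinel value on failure.
    }
    // After: a single expression per call site.
    stats->set_storage_size(Filesystem::SanitizeFileSize(size));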
diff --git a/icing/file/portable-file-backed-proto-log.h b/icing/file/portable-file-backed-proto-log.h
index f676dc5..409ab96 100644
--- a/icing/file/portable-file-backed-proto-log.h
+++ b/icing/file/portable-file-backed-proto-log.h
@@ -124,6 +124,8 @@ class PortableFileBackedProtoLog {
public:
static constexpr int32_t kMagic = 0xf4c6f67a;
+ // We should go directly from 0 to 2 the next time we have to change the
+ // format.
static constexpr int32_t kFileFormatVersion = 0;
uint32_t CalculateHeaderChecksum() const {
@@ -282,7 +284,7 @@ class PortableFileBackedProtoLog {
// before updating our checksum.
bool recalculated_checksum = false;
- bool has_data_loss() {
+ bool has_data_loss() const {
return data_loss == DataLoss::PARTIAL || data_loss == DataLoss::COMPLETE;
}
};
@@ -376,8 +378,7 @@ class PortableFileBackedProtoLog {
// }
class Iterator {
public:
- Iterator(const Filesystem& filesystem, const std::string& file_path,
- int64_t initial_offset);
+ Iterator(const Filesystem& filesystem, int fd, int64_t initial_offset);
// Advances to the position of the next proto, whether it has been erased or not.
//
@@ -393,11 +394,12 @@ class PortableFileBackedProtoLog {
private:
static constexpr int64_t kInvalidOffset = -1;
// Used to read proto metadata
- MemoryMappedFile mmapped_file_;
// Offset of first proto
+ const Filesystem* const filesystem_;
int64_t initial_offset_;
int64_t current_offset_;
int64_t file_size_;
+ int fd_;
};
// Returns an iterator of current proto log. The caller needs to keep the
@@ -513,7 +515,7 @@ class PortableFileBackedProtoLog {
const Filesystem* filesystem, const std::string& file_path,
Crc32 initial_crc, int64_t start, int64_t end);
- // Reads out the metadata of a proto located at file_offset from the file.
+ // Reads out the metadata of a proto located at file_offset from the fd.
// Metadata will be returned in host byte order endianness.
//
// Returns:
@@ -521,7 +523,8 @@ class PortableFileBackedProtoLog {
// OUT_OF_RANGE_ERROR if file_offset exceeds file_size
// INTERNAL_ERROR if the metadata is invalid or any IO errors happen
static libtextclassifier3::StatusOr<int32_t> ReadProtoMetadata(
- MemoryMappedFile* mmapped_file, int64_t file_offset, int64_t file_size);
+ const Filesystem* const filesystem, int fd, int64_t file_offset,
+ int64_t file_size);
// Writes metadata of a proto to the fd. Takes in a host byte order endianness
// metadata and converts it into a portable metadata before writing.
@@ -937,35 +940,37 @@ template <typename ProtoT>
libtextclassifier3::StatusOr<ProtoT>
PortableFileBackedProtoLog<ProtoT>::ReadProto(int64_t file_offset) const {
int64_t file_size = filesystem_->GetFileSize(fd_.get());
- MemoryMappedFile mmapped_file(*filesystem_, file_path_,
- MemoryMappedFile::Strategy::READ_ONLY);
- if (file_offset >= file_size) {
- // file_size points to the next byte to write at, so subtract one to get
- // the inclusive, actual size of file.
- return absl_ports::OutOfRangeError(
- IcingStringUtil::StringPrintf("Trying to read from a location, %lld, "
- "out of range of the file size, %lld",
- static_cast<long long>(file_offset),
- static_cast<long long>(file_size - 1)));
- }
-
// Read out the metadata
+ if (file_size == Filesystem::kBadFileSize) {
+ return absl_ports::OutOfRangeError("Unable to correctly read size.");
+ }
ICING_ASSIGN_OR_RETURN(
int32_t metadata,
- ReadProtoMetadata(&mmapped_file, file_offset, file_size));
+ ReadProtoMetadata(filesystem_, fd_.get(), file_offset, file_size));
// Copy out however many bytes it says the proto is
int stored_size = GetProtoSize(metadata);
+ file_offset += sizeof(metadata);
- ICING_RETURN_IF_ERROR(
- mmapped_file.Remap(file_offset + sizeof(metadata), stored_size));
+ // Read the compressed proto out.
+ if (file_offset + stored_size > file_size) {
+ return absl_ports::OutOfRangeError(
+ IcingStringUtil::StringPrintf("Trying to read from a location, %lld, "
+ "out of range of the file size, %lld",
+ static_cast<long long>(file_offset),
+ static_cast<long long>(file_size - 1)));
+ }
+ auto buf = std::make_unique<char[]>(stored_size);
+ if (!filesystem_->PRead(fd_.get(), buf.get(), stored_size, file_offset)) {
+ return absl_ports::InternalError("");
+ }
- if (IsEmptyBuffer(mmapped_file.region(), mmapped_file.region_size())) {
+ if (IsEmptyBuffer(buf.get(), stored_size)) {
return absl_ports::NotFoundError("The proto data has been erased.");
}
- google::protobuf::io::ArrayInputStream proto_stream(
- mmapped_file.mutable_region(), stored_size);
+ google::protobuf::io::ArrayInputStream proto_stream(buf.get(),
+ stored_size);
// Deserialize proto
ProtoT proto;
@@ -983,33 +988,29 @@ template <typename ProtoT>
libtextclassifier3::Status PortableFileBackedProtoLog<ProtoT>::EraseProto(
int64_t file_offset) {
int64_t file_size = filesystem_->GetFileSize(fd_.get());
- if (file_offset >= file_size) {
- // file_size points to the next byte to write at, so subtract one to get
- // the inclusive, actual size of file.
- return absl_ports::OutOfRangeError(IcingStringUtil::StringPrintf(
- "Trying to erase data at a location, %lld, "
- "out of range of the file size, %lld",
- static_cast<long long>(file_offset),
- static_cast<long long>(file_size - 1)));
+ if (file_size == Filesystem::kBadFileSize) {
+ return absl_ports::OutOfRangeError("Unable to correctly read size.");
}
- MemoryMappedFile mmapped_file(
- *filesystem_, file_path_,
- MemoryMappedFile::Strategy::READ_WRITE_AUTO_SYNC);
-
- // Read out the metadata
ICING_ASSIGN_OR_RETURN(
int32_t metadata,
- ReadProtoMetadata(&mmapped_file, file_offset, file_size));
-
- ICING_RETURN_IF_ERROR(mmapped_file.Remap(file_offset + sizeof(metadata),
- GetProtoSize(metadata)));
+ ReadProtoMetadata(filesystem_, fd_.get(), file_offset, file_size));
+ // Copy out however many bytes it says the proto is
+ int stored_size = GetProtoSize(metadata);
+ file_offset += sizeof(metadata);
+ if (file_offset + stored_size > file_size) {
+ return absl_ports::OutOfRangeError(
+ IcingStringUtil::StringPrintf("Trying to read from a location, %lld, "
+ "out of range of the file size, %lld",
+ static_cast<long long>(file_offset),
+ static_cast<long long>(file_size - 1)));
+ }
+ auto buf = std::make_unique<char[]>(stored_size);
// We need to update the crc checksum if the erased area is before the
// rewind position.
int32_t new_crc;
- int64_t erased_proto_offset = file_offset + sizeof(metadata);
- if (erased_proto_offset < header_->GetRewindOffset()) {
+ if (file_offset < header_->GetRewindOffset()) {
// Set to "dirty" before we start writing anything.
header_->SetDirtyFlag(true);
header_->SetHeaderChecksum(header_->CalculateHeaderChecksum());
@@ -1022,24 +1023,30 @@ libtextclassifier3::Status PortableFileBackedProtoLog<ProtoT>::EraseProto(
// We need to calculate [original string xor 0s].
// The xored string is the same as the original string because 0 xor 0 =
// 0, 1 xor 0 = 1.
- const std::string_view xored_str(mmapped_file.region(),
- mmapped_file.region_size());
+ // Read the compressed proto out.
+ if (!filesystem_->PRead(fd_.get(), buf.get(), stored_size, file_offset)) {
+ return absl_ports::InternalError("");
+ }
+ const std::string_view xored_str(buf.get(), stored_size);
Crc32 crc(header_->GetLogChecksum());
ICING_ASSIGN_OR_RETURN(
- new_crc, crc.UpdateWithXor(
- xored_str,
- /*full_data_size=*/header_->GetRewindOffset() -
- kHeaderReservedBytes,
- /*position=*/erased_proto_offset - kHeaderReservedBytes));
+ new_crc,
+ crc.UpdateWithXor(xored_str,
+ /*full_data_size=*/header_->GetRewindOffset() -
+ kHeaderReservedBytes,
+ /*position=*/file_offset - kHeaderReservedBytes));
}
// Clear the region.
- memset(mmapped_file.mutable_region(), '\0', mmapped_file.region_size());
+ memset(buf.get(), '\0', stored_size);
+ if (!filesystem_->PWrite(fd_.get(), file_offset, buf.get(), stored_size)) {
+ return absl_ports::InternalError("");
+ }
// If we cleared something in our checksummed area, we should update our
// checksum and reset our dirty bit.
- if (erased_proto_offset < header_->GetRewindOffset()) {
+ if (file_offset < header_->GetRewindOffset()) {
header_->SetDirtyFlag(false);
header_->SetLogChecksum(new_crc);
header_->SetHeaderChecksum(header_->CalculateHeaderChecksum());
@@ -1077,13 +1084,12 @@ PortableFileBackedProtoLog<ProtoT>::GetElementsFileSize() const {
template <typename ProtoT>
PortableFileBackedProtoLog<ProtoT>::Iterator::Iterator(
- const Filesystem& filesystem, const std::string& file_path,
- int64_t initial_offset)
- : mmapped_file_(filesystem, file_path,
- MemoryMappedFile::Strategy::READ_ONLY),
+ const Filesystem& filesystem, int fd, int64_t initial_offset)
+ : filesystem_(&filesystem),
initial_offset_(initial_offset),
current_offset_(kInvalidOffset),
- file_size_(filesystem.GetFileSize(file_path.c_str())) {
+ fd_(fd) {
+ file_size_ = filesystem_->GetFileSize(fd_);
if (file_size_ == Filesystem::kBadFileSize) {
// Fails all Advance() calls
file_size_ = 0;
@@ -1100,7 +1106,7 @@ PortableFileBackedProtoLog<ProtoT>::Iterator::Advance() {
// Jumps to the next proto position
ICING_ASSIGN_OR_RETURN(
int32_t metadata,
- ReadProtoMetadata(&mmapped_file_, current_offset_, file_size_));
+ ReadProtoMetadata(filesystem_, fd_, current_offset_, file_size_));
current_offset_ += sizeof(metadata) + GetProtoSize(metadata);
}
@@ -1122,14 +1128,15 @@ int64_t PortableFileBackedProtoLog<ProtoT>::Iterator::GetOffset() {
template <typename ProtoT>
typename PortableFileBackedProtoLog<ProtoT>::Iterator
PortableFileBackedProtoLog<ProtoT>::GetIterator() {
- return Iterator(*filesystem_, file_path_,
+ return Iterator(*filesystem_, fd_.get(),
/*initial_offset=*/kHeaderReservedBytes);
}
template <typename ProtoT>
libtextclassifier3::StatusOr<int32_t>
PortableFileBackedProtoLog<ProtoT>::ReadProtoMetadata(
- MemoryMappedFile* mmapped_file, int64_t file_offset, int64_t file_size) {
+ const Filesystem* const filesystem, int fd, int64_t file_offset,
+ int64_t file_size) {
// Checks file_offset
if (file_offset >= file_size) {
return absl_ports::OutOfRangeError(IcingStringUtil::StringPrintf(
@@ -1147,9 +1154,9 @@ PortableFileBackedProtoLog<ProtoT>::ReadProtoMetadata(
static_cast<long long>(file_size)));
}
- // Reads metadata
- ICING_RETURN_IF_ERROR(mmapped_file->Remap(file_offset, metadata_size));
- memcpy(&portable_metadata, mmapped_file->region(), metadata_size);
+ if (!filesystem->PRead(fd, &portable_metadata, metadata_size, file_offset)) {
+ return absl_ports::InternalError("");
+ }
// Need to switch it back to host order endianness after reading from disk.
int32_t host_order_metadata = GNetworkToHostL(portable_metadata);
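Taken together, the ReadProto, EraseProto, Iterator, and ReadProtoMetadata hunks replace per-operation MemoryMappedFile construction with positioned reads against the log's already-open fd; PRead needs no address-space mapping, so readers at independent offsets can share the descriptor. A condensed sketch of the new read path, simplified from the hunks above:

    // Read the 4-byte metadata word at file_offset.
    int32_t portable_metadata;
    if (!filesystem->PRead(fd, &portable_metadata, sizeof(portable_metadata),
                           file_offset)) {
      return absl_ports::InternalError("");
    }
    // Metadata is stored in network byte order; convert it for the host.
    int32_t metadata = GNetworkToHostL(portable_metadata);
    int stored_size = GetProtoSize(metadata);
    // The proto body immediately follows the metadata.
    file_offset += sizeof(metadata);
    auto buf = std::make_unique<char[]>(stored_size);
    if (!filesystem->PRead(fd, buf.get(), stored_size, file_offset)) {
      return absl_ports::InternalError("");
    }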
diff --git a/icing/file/portable-file-backed-proto-log_benchmark.cc b/icing/file/portable-file-backed-proto-log_benchmark.cc
index f83ccd6..80a8011 100644
--- a/icing/file/portable-file-backed-proto-log_benchmark.cc
+++ b/icing/file/portable-file-backed-proto-log_benchmark.cc
@@ -55,7 +55,7 @@ namespace lib {
namespace {
-static void BM_Write(benchmark::State& state) {
+void BM_Write(benchmark::State& state) {
const Filesystem filesystem;
int string_length = state.range(0);
const std::string file_path = IcingStringUtil::StringPrintf(
@@ -108,7 +108,7 @@ BENCHMARK(BM_Write)
// 16MiB, and we need some extra space for the
// rest of the document properties
-static void BM_Read(benchmark::State& state) {
+void BM_Read(benchmark::State& state) {
const Filesystem filesystem;
int string_length = state.range(0);
const std::string file_path = IcingStringUtil::StringPrintf(
@@ -164,7 +164,7 @@ BENCHMARK(BM_Read)
// 16MiB, and we need some extra space for the
// rest of the document properties
//
-static void BM_Erase(benchmark::State& state) {
+void BM_Erase(benchmark::State& state) {
const Filesystem filesystem;
const std::string file_path = IcingStringUtil::StringPrintf(
"%s%s", GetTestTempDir().c_str(), "/proto.log");
@@ -204,7 +204,7 @@ static void BM_Erase(benchmark::State& state) {
}
BENCHMARK(BM_Erase);
-static void BM_ComputeChecksum(benchmark::State& state) {
+void BM_ComputeChecksum(benchmark::State& state) {
const Filesystem filesystem;
const std::string file_path = GetTestTempDir() + "/proto.log";
int max_proto_size = (1 << 24) - 1; // 16 MiB
@@ -246,7 +246,7 @@ static void BM_ComputeChecksum(benchmark::State& state) {
}
BENCHMARK(BM_ComputeChecksum)->Range(1024, 1 << 20);
-static void BM_ComputeChecksumWithCachedChecksum(benchmark::State& state) {
+void BM_ComputeChecksumWithCachedChecksum(benchmark::State& state) {
const Filesystem filesystem;
const std::string file_path = GetTestTempDir() + "/proto.log";
int max_proto_size = (1 << 24) - 1; // 16 MiB
@@ -290,7 +290,7 @@ static void BM_ComputeChecksumWithCachedChecksum(benchmark::State& state) {
}
BENCHMARK(BM_ComputeChecksumWithCachedChecksum);
-static void BM_ComputeChecksumOnlyForTail(benchmark::State& state) {
+void BM_ComputeChecksumOnlyForTail(benchmark::State& state) {
const Filesystem filesystem;
const std::string file_path = GetTestTempDir() + "/proto.log";
int max_proto_size = (1 << 24) - 1; // 16 MiB
diff --git a/icing/file/portable-file-backed-proto-log_test.cc b/icing/file/portable-file-backed-proto-log_test.cc
index b5fee4b..795271a 100644
--- a/icing/file/portable-file-backed-proto-log_test.cc
+++ b/icing/file/portable-file-backed-proto-log_test.cc
@@ -851,11 +851,12 @@ TEST_F(PortableFileBackedProtoLogTest, Iterator) {
{
// Iterator with bad filesystem
+ ScopedFd sfd(filesystem_.OpenForRead(file_path_.c_str()));
MockFilesystem mock_filesystem;
- ON_CALL(mock_filesystem, GetFileSize(A<const char*>()))
+ ON_CALL(mock_filesystem, GetFileSize(A<int>()))
.WillByDefault(Return(Filesystem::kBadFileSize));
PortableFileBackedProtoLog<DocumentProto>::Iterator bad_iterator(
- mock_filesystem, file_path_, /*initial_offset=*/0);
+ mock_filesystem, sfd.get(), /*initial_offset=*/0);
ASSERT_THAT(bad_iterator.Advance(),
StatusIs(libtextclassifier3::StatusCode::OUT_OF_RANGE));
}
diff --git a/icing/icing-search-engine.cc b/icing/icing-search-engine.cc
index 6acdd31..952ba21 100644
--- a/icing/icing-search-engine.cc
+++ b/icing/icing-search-engine.cc
@@ -18,6 +18,7 @@
#include <memory>
#include <string>
#include <string_view>
+#include <unordered_map>
#include <utility>
#include <vector>
@@ -88,6 +89,23 @@ constexpr std::string_view kOptimizeStatusFilename = "optimize_status";
// fresh state.
constexpr int kMaxUnsuccessfulInitAttempts = 5;
+// A pair that holds namespace and type.
+struct NamespaceTypePair {
+ std::string namespace_;
+ std::string type;
+
+ bool operator==(const NamespaceTypePair& other) const {
+ return namespace_ == other.namespace_ && type == other.type;
+ }
+};
+
+struct NamespaceTypePairHasher {
+ std::size_t operator()(const NamespaceTypePair& pair) const {
+ return std::hash<std::string>()(pair.namespace_) ^
+ std::hash<std::string>()(pair.type);
+ }
+};
+
libtextclassifier3::Status ValidateResultSpec(
const ResultSpecProto& result_spec) {
if (result_spec.num_per_page() < 0) {
@@ -255,6 +273,28 @@ void TransformStatus(const libtextclassifier3::Status& internal_status,
status_proto->set_message(internal_status.error_message());
}
+libtextclassifier3::Status RetrieveAndAddDocumentInfo(
+ const DocumentStore* document_store, DeleteByQueryResultProto& result_proto,
+ std::unordered_map<NamespaceTypePair,
+ DeleteByQueryResultProto::DocumentGroupInfo*,
+ NamespaceTypePairHasher>& info_map,
+ DocumentId document_id) {
+ ICING_ASSIGN_OR_RETURN(DocumentProto document,
+ document_store->Get(document_id));
+ NamespaceTypePair key = {document.namespace_(), document.schema()};
+ auto iter = info_map.find(key);
+ if (iter == info_map.end()) {
+ auto entry = result_proto.add_deleted_documents();
+ entry->set_namespace_(std::move(document.namespace_()));
+ entry->set_schema(std::move(document.schema()));
+ entry->add_uris(std::move(document.uri()));
+ info_map[key] = entry;
+ } else {
+ iter->second->add_uris(std::move(document.uri()));
+ }
+ return libtextclassifier3::Status::OK;
+}
+
} // namespace
IcingSearchEngine::IcingSearchEngine(const IcingSearchEngineOptions& options,
@@ -443,8 +483,6 @@ libtextclassifier3::Status IcingSearchEngine::InitializeMembers(
// last tried to set the schema.
ICING_RETURN_IF_ERROR(InitializeDocumentStore(
/*force_recovery_and_revalidate_documents=*/true, initialize_stats));
- initialize_stats->set_document_store_recovery_cause(
- InitializeStatsProto::SCHEMA_CHANGES_OUT_OF_SYNC);
// We're going to need to build the index from scratch. So just delete its
// files now.
@@ -934,7 +972,7 @@ DeleteResultProto IcingSearchEngine::Delete(const std::string_view name_space,
delete_stats->set_delete_type(DeleteStatsProto::DeleteType::SINGLE);
std::unique_ptr<Timer> delete_timer = clock_->GetNewTimer();
- // TODO(b/144458732): Implement a more robust version of TC_RETURN_IF_ERROR
+ // TODO(b/216487496): Implement a more robust version of TC_RETURN_IF_ERROR
// that can support error logging.
libtextclassifier3::Status status = document_store_->Delete(name_space, uri);
if (!status.ok()) {
@@ -968,7 +1006,7 @@ DeleteByNamespaceResultProto IcingSearchEngine::DeleteByNamespace(
delete_stats->set_delete_type(DeleteStatsProto::DeleteType::NAMESPACE);
std::unique_ptr<Timer> delete_timer = clock_->GetNewTimer();
- // TODO(b/144458732): Implement a more robust version of TC_RETURN_IF_ERROR
+ // TODO(b/216487496): Implement a more robust version of TC_RETURN_IF_ERROR
// that can support error logging.
DocumentStore::DeleteByGroupResult doc_store_result =
document_store_->DeleteByNamespace(name_space);
@@ -1002,7 +1040,7 @@ DeleteBySchemaTypeResultProto IcingSearchEngine::DeleteBySchemaType(
delete_stats->set_delete_type(DeleteStatsProto::DeleteType::SCHEMA_TYPE);
std::unique_ptr<Timer> delete_timer = clock_->GetNewTimer();
- // TODO(b/144458732): Implement a more robust version of TC_RETURN_IF_ERROR
+ // TODO(b/216487496): Implement a more robust version of TC_RETURN_IF_ERROR
// that can support error logging.
DocumentStore::DeleteByGroupResult doc_store_result =
document_store_->DeleteBySchemaType(schema_type);
@@ -1020,7 +1058,7 @@ DeleteBySchemaTypeResultProto IcingSearchEngine::DeleteBySchemaType(
}
DeleteByQueryResultProto IcingSearchEngine::DeleteByQuery(
- const SearchSpecProto& search_spec) {
+ const SearchSpecProto& search_spec, bool return_deleted_document_info) {
ICING_VLOG(1) << "Deleting documents for query " << search_spec.query()
<< " from doc store";
@@ -1074,12 +1112,27 @@ DeleteByQueryResultProto IcingSearchEngine::DeleteByQuery(
ICING_VLOG(2) << "Deleting the docs that matched the query.";
int num_deleted = 0;
+ // A map used to group deleted documents.
+ // From the (namespace, type) pair to a list of uris.
+ std::unordered_map<NamespaceTypePair,
+ DeleteByQueryResultProto::DocumentGroupInfo*,
+ NamespaceTypePairHasher>
+ deleted_info_map;
component_timer = clock_->GetNewTimer();
while (query_results.root_iterator->Advance().ok()) {
ICING_VLOG(3) << "Deleting doc "
<< query_results.root_iterator->doc_hit_info().document_id();
++num_deleted;
+ if (return_deleted_document_info) {
+ status = RetrieveAndAddDocumentInfo(
+ document_store_.get(), result_proto, deleted_info_map,
+ query_results.root_iterator->doc_hit_info().document_id());
+ if (!status.ok()) {
+ TransformStatus(status, result_status);
+ return result_proto;
+ }
+ }
status = document_store_->Delete(
query_results.root_iterator->doc_hit_info().document_id());
if (!status.ok()) {
@@ -1148,12 +1201,8 @@ OptimizeResultProto IcingSearchEngine::Optimize() {
std::unique_ptr<Timer> optimize_timer = clock_->GetNewTimer();
OptimizeStatsProto* optimize_stats = result_proto.mutable_optimize_stats();
int64_t before_size = filesystem_->GetDiskUsage(options_.base_dir().c_str());
- if (before_size != Filesystem::kBadFileSize) {
- optimize_stats->set_storage_size_before(before_size);
- } else {
- // Set -1 as a sentinel value when failures occur.
- optimize_stats->set_storage_size_before(-1);
- }
+ optimize_stats->set_storage_size_before(
+ Filesystem::SanitizeFileSize(before_size));
// Flushes data to disk before doing optimization
auto status = InternalPersistToDisk(PersistType::FULL);
@@ -1230,12 +1279,8 @@ OptimizeResultProto IcingSearchEngine::Optimize() {
optimize_status_file.Write(std::move(optimize_status));
int64_t after_size = filesystem_->GetDiskUsage(options_.base_dir().c_str());
- if (after_size != Filesystem::kBadFileSize) {
- optimize_stats->set_storage_size_after(after_size);
- } else {
- // Set -1 as a sentinel value when failures occur.
- optimize_stats->set_storage_size_after(-1);
- }
+ optimize_stats->set_storage_size_after(
+ Filesystem::SanitizeFileSize(after_size));
optimize_stats->set_latency_ms(optimize_timer->GetElapsedMilliseconds());
TransformStatus(optimization_status, result_status);
@@ -1317,11 +1362,8 @@ StorageInfoResultProto IcingSearchEngine::GetStorageInfo() {
}
int64_t index_size = filesystem_->GetDiskUsage(options_.base_dir().c_str());
- if (index_size != Filesystem::kBadFileSize) {
- result.mutable_storage_info()->set_total_storage_size(index_size);
- } else {
- result.mutable_storage_info()->set_total_storage_size(-1);
- }
+ result.mutable_storage_info()->set_total_storage_size(
+ Filesystem::SanitizeFileSize(index_size));
*result.mutable_storage_info()->mutable_document_storage_info() =
document_store_->GetStorageInfo();
*result.mutable_storage_info()->mutable_schema_store_storage_info() =
diff --git a/icing/icing-search-engine.h b/icing/icing-search-engine.h
index 0a79714..ff9c7fb 100644
--- a/icing/icing-search-engine.h
+++ b/icing/icing-search-engine.h
@@ -280,8 +280,9 @@ class IcingSearchEngine {
// NOT_FOUND if the query doesn't match any documents
// FAILED_PRECONDITION IcingSearchEngine has not been initialized yet
// INTERNAL_ERROR on IO error
- DeleteByQueryResultProto DeleteByQuery(const SearchSpecProto& search_spec)
- ICING_LOCKS_EXCLUDED(mutex_);
+ DeleteByQueryResultProto DeleteByQuery(
+ const SearchSpecProto& search_spec,
+ bool return_deleted_document_info = false) ICING_LOCKS_EXCLUDED(mutex_);
// Retrieves, scores, ranks, and returns the results according to the specs.
// Results can be empty. If there're multiple pages of results,
diff --git a/icing/icing-search-engine_fuzz_test.cc b/icing/icing-search-engine_fuzz_test.cc
index 2a44f8a..bf486da 100644
--- a/icing/icing-search-engine_fuzz_test.cc
+++ b/icing/icing-search-engine_fuzz_test.cc
@@ -18,12 +18,12 @@
#include "icing/text_classifier/lib3/utils/base/status.h"
#include "icing/text_classifier/lib3/utils/base/statusor.h"
#include "icing/document-builder.h"
-#include "icing/helpers/icu/icu-data-file-helper.h"
#include "icing/icing-search-engine.h"
#include "icing/proto/document.pb.h"
#include "icing/proto/initialize.pb.h"
#include "icing/proto/scoring.pb.h"
#include "icing/schema-builder.h"
+#include "icing/testing/icu-data-file-helper.h"
#include "icing/testing/test-data.h"
#include "icing/testing/tmp-directory.h"
diff --git a/icing/icing-search-engine_test.cc b/icing/icing-search-engine_test.cc
index 4f16bb6..7ed8885 100644
--- a/icing/icing-search-engine_test.cc
+++ b/icing/icing-search-engine_test.cc
@@ -27,7 +27,6 @@
#include "icing/document-builder.h"
#include "icing/file/filesystem.h"
#include "icing/file/mock-filesystem.h"
-#include "icing/helpers/icu/icu-data-file-helper.h"
#include "icing/legacy/index/icing-mock-filesystem.h"
#include "icing/portable/endian.h"
#include "icing/portable/equals-proto.h"
@@ -46,6 +45,7 @@
#include "icing/store/document-log-creator.h"
#include "icing/testing/common-matchers.h"
#include "icing/testing/fake-clock.h"
+#include "icing/testing/icu-data-file-helper.h"
#include "icing/testing/jni-test-helpers.h"
#include "icing/testing/random-string.h"
#include "icing/testing/snippet-helpers.h"
@@ -3493,6 +3493,105 @@ TEST_F(IcingSearchEngineTest, DeleteByQuery) {
expected_search_result_proto));
}
+TEST_F(IcingSearchEngineTest, DeleteByQueryReturnInfo) {
+ DocumentProto document1 =
+ DocumentBuilder()
+ .SetKey("namespace1", "uri1")
+ .SetSchema("Message")
+ .AddStringProperty("body", "message body1")
+ .SetCreationTimestampMs(kDefaultCreationTimestampMs)
+ .Build();
+ DocumentProto document2 =
+ DocumentBuilder()
+ .SetKey("namespace2", "uri2")
+ .SetSchema("Message")
+ .AddStringProperty("body", "message body2")
+ .SetCreationTimestampMs(kDefaultCreationTimestampMs)
+ .Build();
+ DocumentProto document3 =
+ DocumentBuilder()
+ .SetKey("namespace2", "uri3")
+ .SetSchema("Message")
+ .AddStringProperty("body", "message body3")
+ .SetCreationTimestampMs(kDefaultCreationTimestampMs)
+ .Build();
+
+ auto fake_clock = std::make_unique<FakeClock>();
+ fake_clock->SetTimerElapsedMilliseconds(7);
+ TestIcingSearchEngine icing(GetDefaultIcingOptions(),
+ std::make_unique<Filesystem>(),
+ std::make_unique<IcingFilesystem>(),
+ std::move(fake_clock), GetTestJniCache());
+ ASSERT_THAT(icing.Initialize().status(), ProtoIsOk());
+ ASSERT_THAT(icing.SetSchema(CreateMessageSchema()).status(), ProtoIsOk());
+ ASSERT_THAT(icing.Put(document1).status(), ProtoIsOk());
+ ASSERT_THAT(icing.Put(document2).status(), ProtoIsOk());
+ ASSERT_THAT(icing.Put(document3).status(), ProtoIsOk());
+
+ GetResultProto expected_get_result_proto;
+ expected_get_result_proto.mutable_status()->set_code(StatusProto::OK);
+ *expected_get_result_proto.mutable_document() = document1;
+ EXPECT_THAT(
+ icing.Get("namespace1", "uri1", GetResultSpecProto::default_instance()),
+ EqualsProto(expected_get_result_proto));
+
+ *expected_get_result_proto.mutable_document() = document2;
+ EXPECT_THAT(
+ icing.Get("namespace2", "uri2", GetResultSpecProto::default_instance()),
+ EqualsProto(expected_get_result_proto));
+
+ *expected_get_result_proto.mutable_document() = document3;
+ EXPECT_THAT(
+ icing.Get("namespace2", "uri3", GetResultSpecProto::default_instance()),
+ EqualsProto(expected_get_result_proto));
+
+ // Delete all docs to test the information is correctly grouped.
+ SearchSpecProto search_spec;
+ search_spec.set_query("message");
+ search_spec.set_term_match_type(TermMatchType::EXACT_ONLY);
+ DeleteByQueryResultProto result_proto =
+ icing.DeleteByQuery(search_spec, true);
+ EXPECT_THAT(result_proto.status(), ProtoIsOk());
+ DeleteByQueryStatsProto exp_stats;
+ exp_stats.set_latency_ms(7);
+ exp_stats.set_num_documents_deleted(3);
+ exp_stats.set_query_length(search_spec.query().length());
+ exp_stats.set_num_terms(1);
+ exp_stats.set_num_namespaces_filtered(0);
+ exp_stats.set_num_schema_types_filtered(0);
+ exp_stats.set_parse_query_latency_ms(7);
+ exp_stats.set_document_removal_latency_ms(7);
+ EXPECT_THAT(result_proto.delete_by_query_stats(), EqualsProto(exp_stats));
+
+ // Check that DeleteByQuery can return information for deleted documents.
+ DeleteByQueryResultProto::DocumentGroupInfo info1, info2;
+ info1.set_namespace_("namespace1");
+ info1.set_schema("Message");
+ info1.add_uris("uri1");
+ info2.set_namespace_("namespace2");
+ info2.set_schema("Message");
+ info2.add_uris("uri3");
+ info2.add_uris("uri2");
+ EXPECT_THAT(result_proto.deleted_documents(),
+ UnorderedElementsAre(EqualsProto(info1), EqualsProto(info2)));
+
+ EXPECT_THAT(
+ icing.Get("namespace1", "uri1", GetResultSpecProto::default_instance())
+ .status()
+ .code(),
+ Eq(StatusProto::NOT_FOUND));
+ EXPECT_THAT(
+ icing.Get("namespace2", "uri2", GetResultSpecProto::default_instance())
+ .status()
+ .code(),
+ Eq(StatusProto::NOT_FOUND));
+ EXPECT_THAT(
+ icing.Get("namespace2", "uri3", GetResultSpecProto::default_instance())
+ .status()
+ .code(),
+ Eq(StatusProto::NOT_FOUND));
+}
+
TEST_F(IcingSearchEngineTest, DeleteByQueryNotFound) {
DocumentProto document1 =
DocumentBuilder()
@@ -8489,7 +8588,7 @@ TEST_F(IcingSearchEngineTest, MigrateToPortableFileBackedProtoLog) {
EXPECT_THAT(init_result.initialize_stats().document_store_data_status(),
Eq(InitializeStatsProto::NO_DATA_LOSS));
EXPECT_THAT(init_result.initialize_stats().document_store_recovery_cause(),
- Eq(InitializeStatsProto::NONE));
+ Eq(InitializeStatsProto::LEGACY_DOCUMENT_LOG_FORMAT));
EXPECT_THAT(init_result.initialize_stats().schema_store_recovery_cause(),
Eq(InitializeStatsProto::NONE));
EXPECT_THAT(init_result.initialize_stats().index_restoration_cause(),
diff --git a/icing/index/index-processor_benchmark.cc b/icing/index/index-processor_benchmark.cc
index 6e072c7..1aad7d0 100644
--- a/icing/index/index-processor_benchmark.cc
+++ b/icing/index/index-processor_benchmark.cc
@@ -16,7 +16,6 @@
#include "gmock/gmock.h"
#include "icing/document-builder.h"
#include "icing/file/filesystem.h"
-#include "icing/helpers/icu/icu-data-file-helper.h"
#include "icing/index/index-processor.h"
#include "icing/index/index.h"
#include "icing/legacy/core/icing-string-util.h"
@@ -24,6 +23,7 @@
#include "icing/schema/schema-util.h"
#include "icing/schema/section-manager.h"
#include "icing/testing/common-matchers.h"
+#include "icing/testing/icu-data-file-helper.h"
#include "icing/testing/test-data.h"
#include "icing/testing/tmp-directory.h"
#include "icing/tokenization/language-segmenter-factory.h"
diff --git a/icing/index/index-processor_test.cc b/icing/index/index-processor_test.cc
index d21a06f..bd310de 100644
--- a/icing/index/index-processor_test.cc
+++ b/icing/index/index-processor_test.cc
@@ -30,7 +30,6 @@
#include "icing/absl_ports/str_join.h"
#include "icing/document-builder.h"
#include "icing/file/filesystem.h"
-#include "icing/helpers/icu/icu-data-file-helper.h"
#include "icing/index/hit/doc-hit-info.h"
#include "icing/index/index.h"
#include "icing/index/iterator/doc-hit-info-iterator.h"
@@ -49,6 +48,7 @@
#include "icing/store/document-id.h"
#include "icing/testing/common-matchers.h"
#include "icing/testing/fake-clock.h"
+#include "icing/testing/icu-data-file-helper.h"
#include "icing/testing/random-string.h"
#include "icing/testing/test-data.h"
#include "icing/testing/tmp-directory.h"
diff --git a/icing/index/index.cc b/icing/index/index.cc
index 984dc78..02ba699 100644
--- a/icing/index/index.cc
+++ b/icing/index/index.cc
@@ -119,7 +119,7 @@ std::vector<TermMetadata> MergeAndRankTermMetadatas(
int total_est_hit_count =
lite_term_itr->hit_count + main_term_itr->hit_count;
PushToTermHeap(TermMetadata(std::move(lite_term_itr->content),
- total_est_hit_count),
+ total_est_hit_count),
num_to_return, merged_term_metadata_heap);
++lite_term_itr;
++main_term_itr;
@@ -259,11 +259,7 @@ Index::FindTermsByPrefix(const std::string& prefix, int num_to_return,
IndexStorageInfoProto Index::GetStorageInfo() const {
IndexStorageInfoProto storage_info;
int64_t directory_size = filesystem_->GetDiskUsage(options_.base_dir.c_str());
- if (directory_size != Filesystem::kBadFileSize) {
- storage_info.set_index_size(directory_size);
- } else {
- storage_info.set_index_size(-1);
- }
+ storage_info.set_index_size(Filesystem::SanitizeFileSize(directory_size));
storage_info = lite_index_->GetStorageInfo(std::move(storage_info));
return main_index_->GetStorageInfo(std::move(storage_info));
}
diff --git a/icing/index/index.h b/icing/index/index.h
index 4aa9ba3..5c53349 100644
--- a/icing/index/index.h
+++ b/icing/index/index.h
@@ -32,6 +32,7 @@
#include "icing/index/term-id-codec.h"
#include "icing/index/term-metadata.h"
#include "icing/legacy/index/icing-filesystem.h"
+#include "icing/proto/debug.pb.h"
#include "icing/proto/storage.pb.h"
#include "icing/proto/term.pb.h"
#include "icing/schema/section.h"
@@ -143,9 +144,14 @@ class Index {
// index.
// verbosity > 0, more detailed debug information including raw postings
// lists.
- void GetDebugInfo(int verbosity, std::string* out) const {
- lite_index_->GetDebugInfo(verbosity, out);
- main_index_->GetDebugInfo(verbosity, out);
+ IndexDebugInfoProto GetDebugInfo(int verbosity) const {
+ IndexDebugInfoProto debug_info;
+ *debug_info.mutable_index_storage_info() = GetStorageInfo();
+ *debug_info.mutable_lite_index_info() =
+ lite_index_->GetDebugInfo(verbosity);
+ *debug_info.mutable_main_index_info() =
+ main_index_->GetDebugInfo(verbosity);
+ return debug_info;
}
// Returns the byte size of all the elements held in the index. This
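The dump-function items in the commit message all follow the pattern visible in this hunk: each layer now returns its own debug proto rather than appending to a shared string, and Index::GetDebugInfo composes the lite-index, main-index, and storage-info pieces into a single IndexDebugInfoProto.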
diff --git a/icing/index/index_test.cc b/icing/index/index_test.cc
index cd64202..8355c01 100644
--- a/icing/index/index_test.cc
+++ b/icing/index/index_test.cc
@@ -31,6 +31,7 @@
#include "icing/index/iterator/doc-hit-info-iterator.h"
#include "icing/legacy/index/icing-filesystem.h"
#include "icing/legacy/index/icing-mock-filesystem.h"
+#include "icing/proto/debug.pb.h"
#include "icing/proto/storage.pb.h"
#include "icing/proto/term.pb.h"
#include "icing/schema/section.h"
@@ -1394,12 +1395,14 @@ TEST_F(IndexTest, GetDebugInfo) {
EXPECT_THAT(edit.IndexAllBufferedTerms(), IsOk());
edit = index_->Edit(kDocumentId1, kSectionId3, TermMatchType::PREFIX,
/*namespace_id=*/0);
+ index_->set_last_added_document_id(kDocumentId1);
ASSERT_THAT(edit.BufferTerm("foot"), IsOk());
EXPECT_THAT(edit.IndexAllBufferedTerms(), IsOk());
ICING_ASSERT_OK(index_->Merge());
edit = index_->Edit(kDocumentId2, kSectionId2, TermMatchType::EXACT_ONLY,
/*namespace_id=*/0);
+ index_->set_last_added_document_id(kDocumentId2);
ASSERT_THAT(edit.BufferTerm("footer"), IsOk());
EXPECT_THAT(edit.IndexAllBufferedTerms(), IsOk());
edit = index_->Edit(kDocumentId2, kSectionId3, TermMatchType::PREFIX,
@@ -1407,40 +1410,45 @@ TEST_F(IndexTest, GetDebugInfo) {
ASSERT_THAT(edit.BufferTerm("foo"), IsOk());
EXPECT_THAT(edit.IndexAllBufferedTerms(), IsOk());
- std::string out0;
- index_->GetDebugInfo(/*verbosity=*/0, &out0);
- EXPECT_THAT(out0, Not(IsEmpty()));
+ IndexDebugInfoProto out0 = index_->GetDebugInfo(/*verbosity=*/0);
+ EXPECT_FALSE(out0.main_index_info().has_flash_index_storage_info());
+ EXPECT_THAT(out0.main_index_info().last_added_document_id(),
+ Eq(kDocumentId1));
+ EXPECT_THAT(out0.lite_index_info().curr_size(), Eq(2));
+ EXPECT_THAT(out0.lite_index_info().last_added_document_id(),
+ Eq(kDocumentId2));
- std::string out1;
- index_->GetDebugInfo(/*verbosity=*/1, &out1);
- EXPECT_THAT(out1, SizeIs(Gt(out0.size())));
+ IndexDebugInfoProto out1 = index_->GetDebugInfo(/*verbosity=*/1);
+ EXPECT_THAT(out1.main_index_info().flash_index_storage_info(),
+ Not(IsEmpty()));
// Add one more doc to the lite index. Debug strings should change.
edit = index_->Edit(kDocumentId3, kSectionId2, TermMatchType::EXACT_ONLY,
/*namespace_id=*/0);
+ index_->set_last_added_document_id(kDocumentId3);
ASSERT_THAT(edit.BufferTerm("far"), IsOk());
EXPECT_THAT(edit.IndexAllBufferedTerms(), IsOk());
- std::string out2;
- index_->GetDebugInfo(/*verbosity=*/0, &out2);
- EXPECT_THAT(out2, Ne(out0));
-
- std::string out3;
- index_->GetDebugInfo(/*verbosity=*/1, &out3);
- EXPECT_THAT(out3, Ne(out1));
+ IndexDebugInfoProto out2 = index_->GetDebugInfo(/*verbosity=*/0);
+ EXPECT_THAT(out2.lite_index_info().curr_size(), Eq(3));
+ EXPECT_THAT(out2.lite_index_info().last_added_document_id(),
+ Eq(kDocumentId3));
// Merge into the main index. Debug strings should change again.
ICING_ASSERT_OK(index_->Merge());
- std::string out4;
- index_->GetDebugInfo(/*verbosity=*/0, &out4);
- EXPECT_THAT(out4, Ne(out0));
- EXPECT_THAT(out4, Ne(out2));
-
- std::string out5;
- index_->GetDebugInfo(/*verbosity=*/1, &out5);
- EXPECT_THAT(out5, Ne(out1));
- EXPECT_THAT(out5, Ne(out3));
+ IndexDebugInfoProto out3 = index_->GetDebugInfo(/*verbosity=*/0);
+ EXPECT_TRUE(out3.has_index_storage_info());
+ EXPECT_THAT(out3.main_index_info().lexicon_info(), Not(IsEmpty()));
+ EXPECT_THAT(out3.main_index_info().last_added_document_id(),
+ Eq(kDocumentId3));
+ EXPECT_THAT(out3.lite_index_info().curr_size(), Eq(0));
+ EXPECT_THAT(out3.lite_index_info().hit_buffer_size(), Gt(0));
+ EXPECT_THAT(out3.lite_index_info().last_added_document_id(),
+ Eq(kInvalidDocumentId));
+ EXPECT_THAT(out3.lite_index_info().searchable_end(), Eq(0));
+ EXPECT_THAT(out3.lite_index_info().index_crc(), Gt(0));
+ EXPECT_THAT(out3.lite_index_info().lexicon_info(), Not(IsEmpty()));
}
TEST_F(IndexTest, BackfillingMultipleTermsSucceeds) {
diff --git a/icing/index/lite/lite-index.cc b/icing/index/lite/lite-index.cc
index e7a8cb3..a5c6baf 100644
--- a/icing/index/lite/lite-index.cc
+++ b/icing/index/lite/lite-index.cc
@@ -340,6 +340,8 @@ int LiteIndex::AppendHits(uint32_t term_id, SectionIdMask section_id_mask,
std::vector<DocHitInfo>* hits_out) {
int count = 0;
DocumentId last_document_id = kInvalidDocumentId;
+ // Record whether the last document belongs to the given namespaces.
+ bool last_document_in_namespace = false;
for (uint32_t idx = Seek(term_id); idx < header_->cur_size(); idx++) {
TermIdHitPair term_id_hit_pair(
hit_buffer_.array_cast<TermIdHitPair>()[idx]);
@@ -357,9 +359,10 @@ int LiteIndex::AppendHits(uint32_t term_id, SectionIdMask section_id_mask,
DocumentId document_id = hit.document_id();
if (document_id != last_document_id) {
last_document_id = document_id;
- // Check does current document belongs to the given namespaces.
- if (namespace_checker != nullptr &&
- !namespace_checker->BelongsToTargetNamespaces(document_id)) {
+ last_document_in_namespace =
+ namespace_checker == nullptr ||
+ namespace_checker->BelongsToTargetNamespaces(document_id);
+ if (!last_document_in_namespace) {
// The document is removed or expired or does not belong to the target
// namespaces.
continue;
@@ -369,7 +372,7 @@ int LiteIndex::AppendHits(uint32_t term_id, SectionIdMask section_id_mask,
hits_out->push_back(DocHitInfo(document_id));
}
}
- if (hits_out != nullptr) {
+ if (hits_out != nullptr && last_document_in_namespace) {
hits_out->back().UpdateSection(hit.section_id(), hit.term_frequency());
}
}
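This is the LiteIndex::AppendHits fix named in the commit message. Previously, when the namespace checker rejected a document, only that document's first hit was skipped: a second hit with the same document_id bypassed the `document_id != last_document_id` block entirely and fell through to hits_out->back().UpdateSection(...), attaching a filtered-out document's section to the previous document's DocHitInfo, or invoking back() on an empty vector. Caching the verdict in last_document_in_namespace makes every later hit of a rejected document skip the update as well; the new lite-index_test.cc below exercises exactly this case with an always-false checker and two hits in different sections of one document.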
@@ -388,15 +391,16 @@ bool LiteIndex::is_full() const {
lexicon_.min_free_fraction() < (1.0 - kTrieFullFraction));
}
-void LiteIndex::GetDebugInfo(int verbosity, std::string* out) const {
- absl_ports::StrAppend(
- out, IcingStringUtil::StringPrintf("Lite Index\nHit buffer %u/%u\n",
- header_->cur_size(),
- options_.hit_buffer_size));
-
- // Lexicon.
- out->append("Lexicon stats:\n");
- lexicon_.GetDebugInfo(verbosity, out);
+IndexDebugInfoProto::LiteIndexDebugInfoProto LiteIndex::GetDebugInfo(
+ int verbosity) {
+ IndexDebugInfoProto::LiteIndexDebugInfoProto res;
+ res.set_curr_size(header_->cur_size());
+ res.set_hit_buffer_size(options_.hit_buffer_size);
+ res.set_last_added_document_id(header_->last_added_docid());
+ res.set_searchable_end(header_->searchable_end());
+ res.set_index_crc(ComputeChecksum().Get());
+ lexicon_.GetDebugInfo(verbosity, res.mutable_lexicon_info());
+ return res;
}
libtextclassifier3::StatusOr<int64_t> LiteIndex::GetElementsSize() const {
@@ -417,12 +421,8 @@ IndexStorageInfoProto LiteIndex::GetStorageInfo(
IndexStorageInfoProto storage_info) const {
int64_t header_and_hit_buffer_file_size =
filesystem_->GetFileSize(hit_buffer_fd_.get());
- if (header_and_hit_buffer_file_size != Filesystem::kBadFileSize) {
- storage_info.set_lite_index_hit_buffer_size(
- header_and_hit_buffer_file_size);
- } else {
- storage_info.set_lite_index_hit_buffer_size(-1);
- }
+ storage_info.set_lite_index_hit_buffer_size(
+ IcingFilesystem::SanitizeFileSize(header_and_hit_buffer_file_size));
int64_t lexicon_disk_usage = lexicon_.GetElementsSize();
if (lexicon_disk_usage != Filesystem::kBadFileSize) {
storage_info.set_lite_index_lexicon_size(lexicon_disk_usage);
diff --git a/icing/index/lite/lite-index.h b/icing/index/lite/lite-index.h
index 890980c..378fc94 100644
--- a/icing/index/lite/lite-index.h
+++ b/icing/index/lite/lite-index.h
@@ -37,6 +37,7 @@
#include "icing/legacy/index/icing-lite-index-header.h"
#include "icing/legacy/index/icing-lite-index-options.h"
#include "icing/legacy/index/icing-mmapper.h"
+#include "icing/proto/debug.pb.h"
#include "icing/proto/storage.pb.h"
#include "icing/proto/term.pb.h"
#include "icing/schema/section.h"
@@ -241,7 +242,7 @@ class LiteIndex {
// Returns debug information for the index in out.
// verbosity <= 0, simplest debug information - size of lexicon, hit buffer
// verbosity > 0, more detailed debug information from the lexicon.
- void GetDebugInfo(int verbosity, std::string* out) const;
+ IndexDebugInfoProto::LiteIndexDebugInfoProto GetDebugInfo(int verbosity);
// Returns the byte size of all the elements held in the index. This excludes
// the size of any internal metadata of the index, e.g. the index's header.
diff --git a/icing/index/lite/lite-index_test.cc b/icing/index/lite/lite-index_test.cc
new file mode 100644
index 0000000..825f830
--- /dev/null
+++ b/icing/index/lite/lite-index_test.cc
@@ -0,0 +1,110 @@
+// Copyright (C) 2019 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "icing/index/lite/lite-index.h"
+
+#include <vector>
+
+#include "gmock/gmock.h"
+#include "gtest/gtest.h"
+#include "icing/index/term-id-codec.h"
+#include "icing/legacy/index/icing-mock-filesystem.h"
+#include "icing/schema/section.h"
+#include "icing/store/namespace-checker.h"
+#include "icing/testing/common-matchers.h"
+#include "icing/testing/tmp-directory.h"
+
+namespace icing {
+namespace lib {
+
+namespace {
+
+using ::testing::Eq;
+using ::testing::IsEmpty;
+using ::testing::SizeIs;
+
+class AlwaysFalseNamespaceCheckerImpl : public NamespaceChecker {
+ public:
+ bool BelongsToTargetNamespaces(DocumentId document_id) const override {
+ return false;
+ }
+};
+
+class LiteIndexTest : public testing::Test {
+ protected:
+ void SetUp() override {
+ index_dir_ = GetTestTempDir() + "/test_dir";
+ ASSERT_TRUE(filesystem_.CreateDirectoryRecursively(index_dir_.c_str()));
+
+ std::string lite_index_file_name = index_dir_ + "/test_file.lite-idx.index";
+ LiteIndex::Options options(lite_index_file_name,
+ /*hit_buffer_want_merge_bytes=*/1024 * 1024);
+ ICING_ASSERT_OK_AND_ASSIGN(lite_index_,
+ LiteIndex::Create(options, &icing_filesystem_));
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ term_id_codec_,
+ TermIdCodec::Create(
+ IcingDynamicTrie::max_value_index(IcingDynamicTrie::Options()),
+ IcingDynamicTrie::max_value_index(options.lexicon_options)));
+ }
+
+ void TearDown() override {
+ ASSERT_TRUE(filesystem_.DeleteDirectoryRecursively(index_dir_.c_str()));
+ }
+
+ std::string index_dir_;
+ Filesystem filesystem_;
+ IcingFilesystem icing_filesystem_;
+ std::unique_ptr<LiteIndex> lite_index_;
+ std::unique_ptr<TermIdCodec> term_id_codec_;
+};
+
+constexpr NamespaceId kNamespace0 = 0;
+
+TEST_F(LiteIndexTest, LiteIndexAppendHits) {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ uint32_t tvi,
+ lite_index_->InsertTerm("foo", TermMatchType::PREFIX, kNamespace0));
+ ICING_ASSERT_OK_AND_ASSIGN(uint32_t foo_term_id,
+ term_id_codec_->EncodeTvi(tvi, TviType::LITE));
+ Hit doc_hit0(/*section_id=*/0, /*document_id=*/0, Hit::kDefaultTermFrequency,
+ /*is_in_prefix_section=*/false);
+ Hit doc_hit1(/*section_id=*/1, /*document_id=*/0, Hit::kDefaultTermFrequency,
+ /*is_in_prefix_section=*/false);
+ ICING_ASSERT_OK(lite_index_->AddHit(foo_term_id, doc_hit0));
+ ICING_ASSERT_OK(lite_index_->AddHit(foo_term_id, doc_hit1));
+
+ std::vector<DocHitInfo> hits1;
+ lite_index_->AppendHits(foo_term_id, kSectionIdMaskAll,
+ /*only_from_prefix_sections=*/false,
+ /*namespace_checker=*/nullptr, &hits1);
+ EXPECT_THAT(hits1, SizeIs(1));
+ EXPECT_THAT(hits1.back().document_id(), Eq(0));
+ // Check that the hits are coming from section 0 and section 1.
+ EXPECT_THAT(hits1.back().hit_section_ids_mask(), Eq(0b11));
+
+ std::vector<DocHitInfo> hits2;
+ AlwaysFalseNamespaceCheckerImpl always_false_namespace_checker;
+ lite_index_->AppendHits(foo_term_id, kSectionIdMaskAll,
+ /*only_from_prefix_sections=*/false,
+ &always_false_namespace_checker, &hits2);
+ // Check that no hits are returned because they get skipped by the namespace
+ // checker.
+ EXPECT_THAT(hits2, IsEmpty());
+}
+
+} // namespace
+} // namespace lib
+} // namespace icing
diff --git a/icing/index/main/flash-index-storage.h b/icing/index/main/flash-index-storage.h
index 8d5b50b..6c6fbb8 100644
--- a/icing/index/main/flash-index-storage.h
+++ b/icing/index/main/flash-index-storage.h
@@ -159,6 +159,7 @@ class FlashIndexStorage {
libtextclassifier3::Status Reset();
+ // TODO(b/222349894) Convert the string output to a protocol buffer instead.
void GetDebugInfo(int verbosity, std::string* out) const;
private:
diff --git a/icing/index/main/main-index.cc b/icing/index/main/main-index.cc
index c104dd3..2d6007b 100644
--- a/icing/index/main/main-index.cc
+++ b/icing/index/main/main-index.cc
@@ -133,18 +133,10 @@ libtextclassifier3::StatusOr<int64_t> MainIndex::GetElementsSize() const {
IndexStorageInfoProto MainIndex::GetStorageInfo(
IndexStorageInfoProto storage_info) const {
- int64_t lexicon_elt_size = main_lexicon_->GetElementsSize();
- if (lexicon_elt_size != IcingFilesystem::kBadFileSize) {
- storage_info.set_main_index_lexicon_size(lexicon_elt_size);
- } else {
- storage_info.set_main_index_lexicon_size(-1);
- }
- int64_t index_elt_size = flash_index_storage_->GetElementsSize();
- if (lexicon_elt_size != IcingFilesystem::kBadFileSize) {
- storage_info.set_main_index_storage_size(index_elt_size);
- } else {
- storage_info.set_main_index_storage_size(-1);
- }
+ storage_info.set_main_index_lexicon_size(
+ IcingFilesystem::SanitizeFileSize(main_lexicon_->GetElementsSize()));
+ storage_info.set_main_index_storage_size(
+ Filesystem::SanitizeFileSize(flash_index_storage_->GetElementsSize()));
storage_info.set_main_index_block_size(flash_index_storage_->block_size());
storage_info.set_num_blocks(flash_index_storage_->num_blocks());
storage_info.set_min_free_fraction(flash_index_storage_->min_free_fraction());
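Besides adopting SanitizeFileSize, this hunk removes the actual bug behind the "Fix MainIndex::GetStorageInfo" item in the commit message: the deleted branch tested lexicon_elt_size rather than index_elt_size, so a failed flash-index size read was stored verbatim instead of being mapped to the -1 sentinel.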
@@ -615,16 +607,22 @@ libtextclassifier3::Status MainIndex::AddPrefixBackfillHits(
return libtextclassifier3::Status::OK;
}
-void MainIndex::GetDebugInfo(int verbosity, std::string* out) const {
+IndexDebugInfoProto::MainIndexDebugInfoProto MainIndex::GetDebugInfo(
+ int verbosity) const {
+ IndexDebugInfoProto::MainIndexDebugInfoProto res;
+
// Lexicon.
- out->append("Main Lexicon stats:\n");
- main_lexicon_->GetDebugInfo(verbosity, out);
+ main_lexicon_->GetDebugInfo(verbosity, res.mutable_lexicon_info());
+
+ res.set_last_added_document_id(last_added_document_id());
if (verbosity <= 0) {
- return;
+ return res;
}
- flash_index_storage_->GetDebugInfo(verbosity, out);
+ flash_index_storage_->GetDebugInfo(verbosity,
+ res.mutable_flash_index_storage_info());
+ return res;
}
} // namespace lib
diff --git a/icing/index/main/main-index.h b/icing/index/main/main-index.h
index e4073cb..abb0418 100644
--- a/icing/index/main/main-index.h
+++ b/icing/index/main/main-index.h
@@ -27,6 +27,7 @@
#include "icing/index/term-metadata.h"
#include "icing/legacy/index/icing-dynamic-trie.h"
#include "icing/legacy/index/icing-filesystem.h"
+#include "icing/proto/debug.pb.h"
#include "icing/proto/storage.pb.h"
#include "icing/store/namespace-checker.h"
#include "icing/store/namespace-id.h"
@@ -185,7 +186,8 @@ class MainIndex {
// verbosity <= 0, simplest debug information - just the lexicon
// verbosity > 0, more detailed debug information including raw postings
// lists.
- void GetDebugInfo(int verbosity, std::string* out) const;
+ IndexDebugInfoProto::MainIndexDebugInfoProto GetDebugInfo(
+ int verbosity) const;
private:
libtextclassifier3::Status Init(const std::string& index_directory,
diff --git a/icing/legacy/index/icing-dynamic-trie.cc b/icing/legacy/index/icing-dynamic-trie.cc
index baa043a..77876c4 100644
--- a/icing/legacy/index/icing-dynamic-trie.cc
+++ b/icing/legacy/index/icing-dynamic-trie.cc
@@ -70,6 +70,7 @@
#include <algorithm>
#include <cerrno>
#include <cinttypes>
+#include <cstdint>
#include <cstring>
#include <memory>
#include <utility>
@@ -397,6 +398,8 @@ class IcingDynamicTrie::IcingDynamicTrieStorage {
// storage.
IcingScopedFd array_fds_[NUM_ARRAY_TYPES];
std::vector<IcingArrayStorage> array_storage_;
+
+ // Legacy filesystem. Switch to using the new Filesystem class instead.
const IcingFilesystem *filesystem_;
};
@@ -1364,10 +1367,12 @@ uint32_t IcingDynamicTrie::size() const {
return storage_->hdr().num_keys();
}
-void IcingDynamicTrie::CollectStatsRecursive(const Node &node,
- Stats *stats) const {
+void IcingDynamicTrie::CollectStatsRecursive(const Node &node, Stats *stats,
+ uint32_t depth) const {
if (node.is_leaf()) {
stats->num_leaves++;
+ stats->sum_depth += depth;
+ stats->max_depth = max(stats->max_depth, depth);
const char *suffix = storage_->GetSuffix(node.next_index());
stats->suffixes_used += strlen(suffix) + 1 + value_size();
if (!suffix[0]) {
@@ -1379,13 +1384,16 @@ void IcingDynamicTrie::CollectStatsRecursive(const Node &node,
for (; i < (1U << node.log2_num_children()); i++) {
const Next &next = *storage_->GetNext(node.next_index(), i);
if (next.node_index() == kInvalidNodeIndex) break;
- CollectStatsRecursive(*storage_->GetNode(next.node_index()), stats);
+ CollectStatsRecursive(*storage_->GetNode(next.node_index()), stats,
+ depth + 1);
}
// At least one valid node in each next array
if (i == 0) {
ICING_LOG(FATAL) << "No valid node in 'next' array";
}
+ stats->sum_children += i;
+ stats->max_children = max(stats->max_children, i);
stats->child_counts[i - 1]++;
stats->wasted[node.log2_num_children()] +=
@@ -1467,9 +1475,12 @@ std::string IcingDynamicTrie::Stats::DumpStats(int verbosity) const {
"Wasted total: %u\n"
"Num intermediates %u num leaves %u "
"suffixes used %u null %u\n"
+ "avg and max children for intermediates: %.3f, %u\n"
+ "avg and max depth for leaves: %.3f, %u\n"
"Total next frag: %.3f%%\n",
total_wasted, num_intermediates, num_leaves, suffixes_used,
- null_suffixes,
+ null_suffixes, 1. * sum_children / num_intermediates, max_children,
+ 1. * sum_depth / num_leaves, max_depth,
100. * math_util::SafeDivide((total_free + total_wasted), num_nexts));
}
IcingStringUtil::SStringAppendF(
diff --git a/icing/legacy/index/icing-dynamic-trie.h b/icing/legacy/index/icing-dynamic-trie.h
index 8821799..013b926 100644
--- a/icing/legacy/index/icing-dynamic-trie.h
+++ b/icing/legacy/index/icing-dynamic-trie.h
@@ -152,8 +152,13 @@ class IcingDynamicTrie : public IIcingStorage {
uint32_t max_nodes;
// Count of intermediate nodes.
uint32_t num_intermediates;
+ // Total and maximum number of children of intermediate nodes.
+ uint32_t sum_children, max_children;
+
// Count of leaf nodes.
uint32_t num_leaves;
+ // Total and maximum depth of leaf nodes.
+ uint32_t sum_depth, max_depth;
// Next stats
@@ -186,6 +191,7 @@ class IcingDynamicTrie : public IIcingStorage {
uint32_t dirty_pages_nexts;
uint32_t dirty_pages_suffixes;
+ // TODO(b/222349894) Convert the string output to a protocol buffer instead.
std::string DumpStats(int verbosity) const;
};
@@ -601,7 +607,8 @@ class IcingDynamicTrie : public IIcingStorage {
static const uint32_t kInvalidSuffixIndex;
// Stats helpers.
- void CollectStatsRecursive(const Node &node, Stats *stats) const;
+ void CollectStatsRecursive(const Node &node, Stats *stats,
+ uint32_t depth = 0) const;
// Helpers for Find and Insert.
const Next *GetNextByChar(const Node *node, uint8_t key_char) const;
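The new depth parameter lets a single traversal accumulate both per-leaf depth stats and per-intermediate fan-out stats, which DumpStats then averages. A self-contained sketch over a toy n-ary tree (an assumption: the real code walks the trie's Node/Next arrays, not an owning vector):

#include <algorithm>
#include <cstdint>
#include <cstdio>
#include <vector>

struct Node {
  std::vector<Node> children;  // empty => leaf
};

struct Stats {
  uint32_t num_leaves = 0, sum_depth = 0, max_depth = 0;
  uint32_t num_intermediates = 0, sum_children = 0, max_children = 0;
};

// One traversal collects both leaf-depth and intermediate fan-out stats,
// mirroring the depth parameter threaded through CollectStatsRecursive.
void CollectStats(const Node& node, Stats* stats, uint32_t depth = 0) {
  if (node.children.empty()) {
    stats->num_leaves++;
    stats->sum_depth += depth;
    stats->max_depth = std::max(stats->max_depth, depth);
    return;
  }
  stats->num_intermediates++;
  uint32_t fan_out = static_cast<uint32_t>(node.children.size());
  stats->sum_children += fan_out;
  stats->max_children = std::max(stats->max_children, fan_out);
  for (const Node& child : node.children) {
    CollectStats(child, stats, depth + 1);
  }
}

int main() {
  Node root{{Node{}, Node{{Node{}, Node{}}}}};  // 3 leaves, 2 intermediates
  Stats stats;
  CollectStats(root, &stats);
  std::printf("avg/max depth: %.3f, %u\n",
              1. * stats.sum_depth / stats.num_leaves, stats.max_depth);
  std::printf("avg/max children: %.3f, %u\n",
              1. * stats.sum_children / stats.num_intermediates,
              stats.max_children);
}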
diff --git a/icing/legacy/index/icing-filesystem.h b/icing/legacy/index/icing-filesystem.h
index f645632..ce75a82 100644
--- a/icing/legacy/index/icing-filesystem.h
+++ b/icing/legacy/index/icing-filesystem.h
@@ -224,6 +224,11 @@ class IcingFilesystem {
// Increments to_increment by size if size is valid, or sets to_increment
// to kBadFileSize if either size or to_increment is kBadFileSize.
static void IncrementByOrSetInvalid(uint64_t size, uint64_t *to_increment);
+
+ // Returns -1 if file_size is kBadFileSize (invalid); otherwise returns
+ // file_size unchanged.
+ static int64_t SanitizeFileSize(int64_t file_size) {
+ return (file_size != kBadFileSize) ? file_size : -1;
+ }
};
} // namespace lib
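This helper is what collapses the repeated if/else guards in GetStorageInfo above into single set_..._size(SanitizeFileSize(...)) calls. A compilable sketch, with kBadFileSize as an assumed sentinel value (the exact value in icing differs):

#include <cstdint>
#include <iostream>

// Assumption: a sentinel marking failed size queries, standing in for
// IcingFilesystem::kBadFileSize.
constexpr int64_t kBadFileSize = INT64_MAX;

// Maps the sentinel to -1 and passes valid sizes through.
int64_t SanitizeFileSize(int64_t file_size) {
  return (file_size != kBadFileSize) ? file_size : -1;
}

int main() {
  std::cout << SanitizeFileSize(4096) << "\n";          // 4096
  std::cout << SanitizeFileSize(kBadFileSize) << "\n";  // -1
}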
diff --git a/icing/legacy/index/icing-flash-bitmap.h b/icing/legacy/index/icing-flash-bitmap.h
index e3ba0e2..6bb9591 100644
--- a/icing/legacy/index/icing-flash-bitmap.h
+++ b/icing/legacy/index/icing-flash-bitmap.h
@@ -138,6 +138,7 @@ class IcingFlashBitmap {
// Upgrade for version 18.
bool UpgradeTo18();
+ // Legacy filesystem. Switch to the new Filesystem class instead.
const IcingFilesystem *const filesystem_;
std::string filename_;
OpenType open_type_;
diff --git a/icing/query/query-processor_benchmark.cc b/icing/query/query-processor_benchmark.cc
index bdd40aa..e48fe78 100644
--- a/icing/query/query-processor_benchmark.cc
+++ b/icing/query/query-processor_benchmark.cc
@@ -16,7 +16,6 @@
#include "gmock/gmock.h"
#include "third_party/absl/flags/flag.h"
#include "icing/document-builder.h"
-#include "icing/helpers/icu/icu-data-file-helper.h"
#include "icing/index/index.h"
#include "icing/proto/term.pb.h"
#include "icing/query/query-processor.h"
@@ -24,6 +23,7 @@
#include "icing/schema/section.h"
#include "icing/store/document-id.h"
#include "icing/testing/common-matchers.h"
+#include "icing/testing/icu-data-file-helper.h"
#include "icing/testing/test-data.h"
#include "icing/testing/tmp-directory.h"
#include "icing/tokenization/language-segmenter-factory.h"
diff --git a/icing/query/query-processor_test.cc b/icing/query/query-processor_test.cc
index ef12b10..950f739 100644
--- a/icing/query/query-processor_test.cc
+++ b/icing/query/query-processor_test.cc
@@ -23,7 +23,6 @@
#include "gtest/gtest.h"
#include "icing/document-builder.h"
#include "icing/file/filesystem.h"
-#include "icing/helpers/icu/icu-data-file-helper.h"
#include "icing/index/hit/doc-hit-info.h"
#include "icing/index/index.h"
#include "icing/index/iterator/doc-hit-info-iterator-test-util.h"
@@ -40,6 +39,7 @@
#include "icing/store/document-store.h"
#include "icing/testing/common-matchers.h"
#include "icing/testing/fake-clock.h"
+#include "icing/testing/icu-data-file-helper.h"
#include "icing/testing/jni-test-helpers.h"
#include "icing/testing/test-data.h"
#include "icing/testing/tmp-directory.h"
diff --git a/icing/query/suggestion-processor_test.cc b/icing/query/suggestion-processor_test.cc
index 2f22a47..ba4c90a 100644
--- a/icing/query/suggestion-processor_test.cc
+++ b/icing/query/suggestion-processor_test.cc
@@ -15,11 +15,11 @@
#include "icing/query/suggestion-processor.h"
#include "gmock/gmock.h"
-#include "icing/helpers/icu/icu-data-file-helper.h"
#include "icing/store/document-store.h"
#include "icing/testing/always-true-namespace-checker-impl.h"
#include "icing/testing/common-matchers.h"
#include "icing/testing/fake-clock.h"
+#include "icing/testing/icu-data-file-helper.h"
#include "icing/testing/jni-test-helpers.h"
#include "icing/testing/test-data.h"
#include "icing/testing/tmp-directory.h"
diff --git a/icing/result/result-retriever_test.cc b/icing/result/result-retriever_test.cc
index 2695433..0d812e4 100644
--- a/icing/result/result-retriever_test.cc
+++ b/icing/result/result-retriever_test.cc
@@ -22,7 +22,6 @@
#include "gtest/gtest.h"
#include "icing/document-builder.h"
#include "icing/file/mock-filesystem.h"
-#include "icing/helpers/icu/icu-data-file-helper.h"
#include "icing/portable/equals-proto.h"
#include "icing/portable/platform.h"
#include "icing/proto/document.pb.h"
@@ -36,6 +35,7 @@
#include "icing/store/document-id.h"
#include "icing/testing/common-matchers.h"
#include "icing/testing/fake-clock.h"
+#include "icing/testing/icu-data-file-helper.h"
#include "icing/testing/snippet-helpers.h"
#include "icing/testing/test-data.h"
#include "icing/testing/tmp-directory.h"
@@ -362,8 +362,8 @@ TEST_F(ResultRetrieverTest, NotIgnoreErrors) {
TEST_F(ResultRetrieverTest, IOErrorShouldReturnInternalError) {
MockFilesystem mock_filesystem;
- ON_CALL(mock_filesystem, OpenForRead(_)).WillByDefault(Return(false));
-
+ ON_CALL(mock_filesystem, PRead(A<int>(), A<void*>(), A<size_t>(), A<off_t>()))
+ .WillByDefault(Return(false));
ICING_ASSERT_OK_AND_ASSIGN(
DocumentStore::CreateResult create_result,
DocumentStore::Create(&mock_filesystem, test_dir_, &fake_clock_,
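Because the proto log now reads via PRead instead of a memory map, the test's failure injection moves to PRead; the A<T>() argument matchers pin the intended overload when a mocked method name is overloaded. A toy version of the pattern (an assumption: this two-overload interface is illustrative, not the real icing Filesystem):

#include <cstddef>
#include <sys/types.h>

#include "gmock/gmock.h"
#include "gtest/gtest.h"

using ::testing::A;
using ::testing::Return;

// Illustrative interface with two PRead overloads.
class Readable {
 public:
  virtual ~Readable() = default;
  virtual bool PRead(int fd, void* buf, size_t n, off_t offset) const = 0;
  virtual bool PRead(const char* path, void* buf, size_t n,
                     off_t offset) const = 0;
};

class MockReadable : public Readable {
 public:
  MOCK_METHOD(bool, PRead, (int, void*, size_t, off_t), (const, override));
  MOCK_METHOD(bool, PRead, (const char*, void*, size_t, off_t),
              (const, override));
};

TEST(MockReadableTest, PinsTheFdOverload) {
  MockReadable readable;
  // A<T>() selects the (int, void*, size_t, off_t) overload unambiguously.
  ON_CALL(readable, PRead(A<int>(), A<void*>(), A<size_t>(), A<off_t>()))
      .WillByDefault(Return(false));
  char buf[8];
  EXPECT_FALSE(readable.PRead(/*fd=*/3, buf, sizeof(buf), /*offset=*/0));
}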
diff --git a/icing/result/snippet-retriever.cc b/icing/result/snippet-retriever.cc
index 6af7017..bd1524e 100644
--- a/icing/result/snippet-retriever.cc
+++ b/icing/result/snippet-retriever.cc
@@ -78,26 +78,25 @@ inline std::string AddIndexToPath(int values_size, int index,
// Returns a string of the normalized text of the input Token. Normalization
// is applied based on the Token's type.
-std::string NormalizeToken(const Normalizer& normalizer,
- const Token& token) {
+std::string NormalizeToken(const Normalizer& normalizer, const Token& token) {
switch (token.type) {
- case Token::REGULAR:
+ case Token::Type::REGULAR:
return normalizer.NormalizeTerm(token.text);
- case Token::VERBATIM:
+ case Token::Type::VERBATIM:
return std::string(token.text);
- case Token::QUERY_EXCLUSION:
+ case Token::Type::QUERY_EXCLUSION:
[[fallthrough]];
- case Token::QUERY_LEFT_PARENTHESES:
+ case Token::Type::QUERY_LEFT_PARENTHESES:
[[fallthrough]];
- case Token::QUERY_RIGHT_PARENTHESES:
+ case Token::Type::QUERY_RIGHT_PARENTHESES:
[[fallthrough]];
- case Token::QUERY_OR:
+ case Token::Type::QUERY_OR:
[[fallthrough]];
- case Token::QUERY_PROPERTY:
+ case Token::Type::QUERY_PROPERTY:
[[fallthrough]];
- case Token::INVALID:
+ case Token::Type::INVALID:
ICING_LOG(WARNING) << "Unable to normalize token of type: "
- << token.type;
+ << static_cast<int>(token.type);
return std::string(token.text);
}
}
@@ -107,7 +106,7 @@ std::string NormalizeToken(const Normalizer& normalizer,
CharacterIterator FindMatchEnd(const Normalizer& normalizer, const Token& token,
const std::string& match_query_term) {
switch (token.type) {
- case Token::VERBATIM: {
+ case Token::Type::VERBATIM: {
// VERBATIM tokens are not normalized. This means the non-normalized
// matched query term must be either equal to or a prefix of the token's
// text. Therefore, the match must end at the end of the matched query
@@ -117,22 +116,22 @@ CharacterIterator FindMatchEnd(const Normalizer& normalizer, const Token& token,
verbatim_match_end.AdvanceToUtf8(match_query_term.length());
return verbatim_match_end;
}
- case Token::QUERY_EXCLUSION:
+ case Token::Type::QUERY_EXCLUSION:
[[fallthrough]];
- case Token::QUERY_LEFT_PARENTHESES:
+ case Token::Type::QUERY_LEFT_PARENTHESES:
[[fallthrough]];
- case Token::QUERY_RIGHT_PARENTHESES:
+ case Token::Type::QUERY_RIGHT_PARENTHESES:
[[fallthrough]];
- case Token::QUERY_OR:
+ case Token::Type::QUERY_OR:
[[fallthrough]];
- case Token::QUERY_PROPERTY:
+ case Token::Type::QUERY_PROPERTY:
[[fallthrough]];
- case Token::INVALID:
+ case Token::Type::INVALID:
ICING_LOG(WARNING)
- << "Unexpected Token type " << token.type
+ << "Unexpected Token type " << static_cast<int>(token.type)
<< " found when finding match end of query term and token.";
[[fallthrough]];
- case Token::REGULAR:
+ case Token::Type::REGULAR:
return normalizer.FindNormalizedMatchEndPosition(token.text,
match_query_term);
}
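Token::Type is now a scoped enum, which is why the logging sites above gain static_cast<int>: unscoped enumerators converted to int implicitly, enum class values do not. A two-line illustration:

#include <iostream>

// Scoped, like the new Token::Type; values no longer convert to int.
enum class Type { REGULAR, VERBATIM, INVALID };

int main() {
  Type t = Type::INVALID;
  // std::cout << t;  // would not compile: no implicit int conversion
  std::cout << static_cast<int>(t) << "\n";  // prints 2
}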
diff --git a/icing/result/snippet-retriever_test.cc b/icing/result/snippet-retriever_test.cc
index 365eba2..0de2295 100644
--- a/icing/result/snippet-retriever_test.cc
+++ b/icing/result/snippet-retriever_test.cc
@@ -22,7 +22,6 @@
#include "gtest/gtest.h"
#include "icing/document-builder.h"
#include "icing/file/mock-filesystem.h"
-#include "icing/helpers/icu/icu-data-file-helper.h"
#include "icing/portable/equals-proto.h"
#include "icing/portable/platform.h"
#include "icing/proto/document.pb.h"
@@ -37,6 +36,7 @@
#include "icing/store/key-mapper.h"
#include "icing/testing/common-matchers.h"
#include "icing/testing/fake-clock.h"
+#include "icing/testing/icu-data-file-helper.h"
#include "icing/testing/jni-test-helpers.h"
#include "icing/testing/snippet-helpers.h"
#include "icing/testing/test-data.h"
diff --git a/icing/schema/schema-store.cc b/icing/schema/schema-store.cc
index 67528ab..b5c976f 100644
--- a/icing/schema/schema-store.cc
+++ b/icing/schema/schema-store.cc
@@ -268,7 +268,7 @@ libtextclassifier3::Status SchemaStore::UpdateHeader(const Crc32& checksum) {
libtextclassifier3::Status SchemaStore::ResetSchemaTypeMapper() {
// TODO(b/139734457): Replace ptr.reset()->Delete->Create flow with Reset().
schema_type_mapper_.reset();
- // TODO(b/144458732): Implement a more robust version of TC_RETURN_IF_ERROR
+ // TODO(b/216487496): Implement a more robust version of TC_RETURN_IF_ERROR
// that can support error logging.
libtextclassifier3::Status status = KeyMapper<SchemaTypeId>::Delete(
filesystem_, MakeSchemaTypeMapperFilename(base_dir_));
@@ -464,11 +464,8 @@ libtextclassifier3::Status SchemaStore::PersistToDisk() {
SchemaStoreStorageInfoProto SchemaStore::GetStorageInfo() const {
SchemaStoreStorageInfoProto storage_info;
int64_t directory_size = filesystem_.GetDiskUsage(base_dir_.c_str());
- if (directory_size != Filesystem::kBadFileSize) {
- storage_info.set_schema_store_size(directory_size);
- } else {
- storage_info.set_schema_store_size(-1);
- }
+ storage_info.set_schema_store_size(
+ Filesystem::SanitizeFileSize(directory_size));
ICING_ASSIGN_OR_RETURN(const SchemaProto* schema, GetSchema(), storage_info);
storage_info.set_num_schema_types(schema->types_size());
int total_sections = 0;
diff --git a/icing/store/document-log-creator.cc b/icing/store/document-log-creator.cc
index 5e0426e..5e23a8e 100644
--- a/icing/store/document-log-creator.cc
+++ b/icing/store/document-log-creator.cc
@@ -72,19 +72,20 @@ DocumentLogCreator::Create(const Filesystem* filesystem,
bool v1_exists =
filesystem->FileExists(MakeDocumentLogFilenameV1(base_dir).c_str());
- bool regen_derived_files = false;
+ bool new_file = false;
+ int preexisting_file_version = kCurrentVersion;
if (v0_exists && !v1_exists) {
ICING_RETURN_IF_ERROR(MigrateFromV0ToV1(filesystem, base_dir));
// Need to regenerate derived files since documents may be written to a
// different file offset in the log.
- regen_derived_files = true;
+ preexisting_file_version = 0;
} else if (!v1_exists) {
// First time initializing a v1 log. There are no existing derived files at
// this point, so we should generate some. "regenerate" here also means
// "generate for the first time", i.e. we shouldn't expect there to be any
// existing derived files.
- regen_derived_files = true;
+ new_file = true;
}
ICING_ASSIGN_OR_RETURN(
@@ -96,7 +97,7 @@ DocumentLogCreator::Create(const Filesystem* filesystem,
/*compress_in=*/true)));
CreateResult create_result = {std::move(log_create_result),
- regen_derived_files};
+ preexisting_file_version, new_file};
return create_result;
}
diff --git a/icing/store/document-log-creator.h b/icing/store/document-log-creator.h
index 51cf497..be8feed 100644
--- a/icing/store/document-log-creator.h
+++ b/icing/store/document-log-creator.h
@@ -30,14 +30,20 @@ namespace lib {
// be necessary.
class DocumentLogCreator {
public:
+ // Version 0 refers to FileBackedProtoLog
+ // Version 1 refers to PortableFileBackedProtoLog with kFileFormatVersion = 0
+ static constexpr int32_t kCurrentVersion = 1;
struct CreateResult {
// The create result passed up from the PortableFileBackedProtoLog::Create.
// Contains the document log.
PortableFileBackedProtoLog<DocumentWrapper>::CreateResult log_create_result;
- // Whether the caller needs to also regenerate/generate any derived files
- // based off of the initialized document log.
- bool regen_derived_files;
+ // The version number of the pre-existing document log file.
+ // If there is no document log file, it will be set to kCurrentVersion.
+ int preexisting_file_version;
+
+ // Whether the created file is new.
+ bool new_file;
};
// Creates the document log in the base_dir. Will create one if it doesn't
diff --git a/icing/store/document-store.cc b/icing/store/document-store.cc
index 226a96b..a2ae5f9 100644
--- a/icing/store/document-store.cc
+++ b/icing/store/document-store.cc
@@ -164,6 +164,32 @@ int64_t CalculateExpirationTimestampMs(int64_t creation_timestamp_ms,
return expiration_timestamp_ms;
}
+InitializeStatsProto::RecoveryCause GetRecoveryCause(
+ const DocumentLogCreator::CreateResult& create_result,
+ bool force_recovery_and_revalidate_documents) {
+ if (force_recovery_and_revalidate_documents) {
+ return InitializeStatsProto::SCHEMA_CHANGES_OUT_OF_SYNC;
+ } else if (create_result.log_create_result.has_data_loss()) {
+ return InitializeStatsProto::DATA_LOSS;
+ } else if (create_result.preexisting_file_version !=
+ DocumentLogCreator::kCurrentVersion) {
+ return InitializeStatsProto::LEGACY_DOCUMENT_LOG_FORMAT;
+ }
+ return InitializeStatsProto::NONE;
+}
+
+InitializeStatsProto::DocumentStoreDataStatus GetDataStatus(
+ DataLoss data_loss) {
+ switch (data_loss) {
+ case DataLoss::PARTIAL:
+ return InitializeStatsProto::PARTIAL_LOSS;
+ case DataLoss::COMPLETE:
+ return InitializeStatsProto::COMPLETE_LOSS;
+ case DataLoss::NONE:
+ return InitializeStatsProto::NO_DATA_LOSS;
+ }
+}
+
} // namespace
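The two helpers above centralize the recovery bookkeeping that Initialize previously computed inline. A compilable sketch of the same precedence order (an assumption: simplified enums and structs stand in for the InitializeStatsProto values and DocumentLogCreator::CreateResult):

#include <iostream>

enum class RecoveryCause {
  NONE,
  SCHEMA_CHANGES_OUT_OF_SYNC,
  DATA_LOSS,
  LEGACY_DOCUMENT_LOG_FORMAT,
};
constexpr int kCurrentVersion = 1;

struct CreateResult {
  bool has_data_loss = false;
  int preexisting_file_version = kCurrentVersion;
};

// Same precedence as GetRecoveryCause above: forced revalidation wins,
// then data loss, then a legacy (v0) log format; otherwise no recovery.
RecoveryCause GetCause(const CreateResult& result, bool force_recovery) {
  if (force_recovery) return RecoveryCause::SCHEMA_CHANGES_OUT_OF_SYNC;
  if (result.has_data_loss) return RecoveryCause::DATA_LOSS;
  if (result.preexisting_file_version != kCurrentVersion) {
    return RecoveryCause::LEGACY_DOCUMENT_LOG_FORMAT;
  }
  return RecoveryCause::NONE;
}

int main() {
  // A v0 log with no data loss still triggers recovery, via migration.
  CreateResult v0_log{/*has_data_loss=*/false, /*preexisting_file_version=*/0};
  std::cout << (GetCause(v0_log, false) ==
                RecoveryCause::LEGACY_DOCUMENT_LOG_FORMAT)
            << "\n";  // 1
}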
DocumentStore::DocumentStore(const Filesystem* filesystem,
@@ -236,44 +262,34 @@ libtextclassifier3::StatusOr<DataLoss> DocumentStore::Initialize(
std::move(create_result_or).ValueOrDie();
document_log_ = std::move(create_result.log_create_result.proto_log);
-
- if (create_result.regen_derived_files ||
- force_recovery_and_revalidate_documents ||
- create_result.log_create_result.has_data_loss()) {
+ InitializeStatsProto::RecoveryCause recovery_cause =
+ GetRecoveryCause(create_result, force_recovery_and_revalidate_documents);
+
+ if (recovery_cause != InitializeStatsProto::NONE || create_result.new_file) {
+ ICING_LOG(WARNING) << "Starting Document Store Recovery with cause="
+ << recovery_cause << ", and create result { new_file="
+ << create_result.new_file << ", preexisting_file_version="
+ << create_result.preexisting_file_version << ", data_loss="
+ << create_result.log_create_result.data_loss << "} and kCurrentVersion="
+ << DocumentLogCreator::kCurrentVersion;
// We can't rely on any existing derived files. Recreate them from scratch.
// Currently happens if:
// 1) This is a new log and we don't have derived files yet
// 2) Client wanted us to force a regeneration.
// 3) Log has some data loss, can't rely on existing derived data.
- if (create_result.log_create_result.has_data_loss() &&
- initialize_stats != nullptr) {
- ICING_LOG(WARNING)
- << "Data loss in document log, regenerating derived files.";
- initialize_stats->set_document_store_recovery_cause(
- InitializeStatsProto::DATA_LOSS);
-
- if (create_result.log_create_result.data_loss == DataLoss::PARTIAL) {
- // Ground truth is partially lost.
- initialize_stats->set_document_store_data_status(
- InitializeStatsProto::PARTIAL_LOSS);
- } else {
- // Ground truth is completely lost.
- initialize_stats->set_document_store_data_status(
- InitializeStatsProto::COMPLETE_LOSS);
- }
- }
-
std::unique_ptr<Timer> document_recovery_timer = clock_.GetNewTimer();
libtextclassifier3::Status status =
RegenerateDerivedFiles(force_recovery_and_revalidate_documents);
if (initialize_stats != nullptr &&
- (force_recovery_and_revalidate_documents ||
- create_result.log_create_result.has_data_loss())) {
+ recovery_cause != InitializeStatsProto::NONE) {
// Only consider it a recovery if the client forced a recovery or there
// was data loss. Otherwise, this could just be the first time we're
// initializing and generating derived files.
initialize_stats->set_document_store_recovery_latency_ms(
document_recovery_timer->GetElapsedMilliseconds());
+ initialize_stats->set_document_store_recovery_cause(recovery_cause);
+ initialize_stats->set_document_store_data_status(
+ GetDataStatus(create_result.log_create_result.data_loss));
}
if (!status.ok()) {
ICING_LOG(ERROR)
@@ -282,13 +298,13 @@ libtextclassifier3::StatusOr<DataLoss> DocumentStore::Initialize(
}
} else {
if (!InitializeExistingDerivedFiles().ok()) {
- ICING_VLOG(1)
+ ICING_LOG(WARNING)
<< "Couldn't find derived files or failed to initialize them, "
"regenerating derived files for DocumentStore.";
std::unique_ptr<Timer> document_recovery_timer = clock_.GetNewTimer();
libtextclassifier3::Status status = RegenerateDerivedFiles(
- /*force_recovery_and_revalidate_documents*/ false);
- if (initialize_stats != nullptr && num_documents() > 0) {
+ /*force_recovery_and_revalidate_documents=*/false);
+ if (initialize_stats != nullptr) {
initialize_stats->set_document_store_recovery_cause(
InitializeStatsProto::IO_ERROR);
initialize_stats->set_document_store_recovery_latency_ms(
@@ -530,7 +546,7 @@ libtextclassifier3::Status DocumentStore::RegenerateDerivedFiles(
libtextclassifier3::Status DocumentStore::ResetDocumentKeyMapper() {
// TODO(b/139734457): Replace ptr.reset()->Delete->Create flow with Reset().
document_key_mapper_.reset();
- // TODO(b/144458732): Implement a more robust version of TC_RETURN_IF_ERROR
+ // TODO(b/216487496): Implement a more robust version of TC_RETURN_IF_ERROR
// that can support error logging.
libtextclassifier3::Status status =
KeyMapper<DocumentId>::Delete(*filesystem_, base_dir_);
@@ -540,7 +556,7 @@ libtextclassifier3::Status DocumentStore::ResetDocumentKeyMapper() {
return status;
}
- // TODO(b/144458732): Implement a more robust version of TC_ASSIGN_OR_RETURN
+ // TODO(b/216487496): Implement a more robust version of TC_ASSIGN_OR_RETURN
// that can support error logging.
auto document_key_mapper_or =
KeyMapper<DocumentId>::Create(*filesystem_, base_dir_, kUriMapperMaxSize);
@@ -556,7 +572,7 @@ libtextclassifier3::Status DocumentStore::ResetDocumentKeyMapper() {
libtextclassifier3::Status DocumentStore::ResetDocumentIdMapper() {
// TODO(b/139734457): Replace ptr.reset()->Delete->Create flow with Reset().
document_id_mapper_.reset();
- // TODO(b/144458732): Implement a more robust version of TC_RETURN_IF_ERROR
+ // TODO(b/216487496): Implement a more robust version of TC_RETURN_IF_ERROR
// that can support error logging.
libtextclassifier3::Status status = FileBackedVector<int64_t>::Delete(
*filesystem_, MakeDocumentIdMapperFilename(base_dir_));
@@ -565,7 +581,7 @@ libtextclassifier3::Status DocumentStore::ResetDocumentIdMapper() {
<< "Failed to delete old document_id mapper";
return status;
}
- // TODO(b/144458732): Implement a more robust version of TC_ASSIGN_OR_RETURN
+ // TODO(b/216487496): Implement a more robust version of TC_ASSIGN_OR_RETURN
// that can support error logging.
auto document_id_mapper_or = FileBackedVector<int64_t>::Create(
*filesystem_, MakeDocumentIdMapperFilename(base_dir_),
@@ -618,7 +634,7 @@ libtextclassifier3::Status DocumentStore::ResetFilterCache() {
libtextclassifier3::Status DocumentStore::ResetNamespaceMapper() {
// TODO(b/139734457): Replace ptr.reset()->Delete->Create flow with Reset().
namespace_mapper_.reset();
- // TODO(b/144458732): Implement a more robust version of TC_RETURN_IF_ERROR
+ // TODO(b/216487496): Implement a more robust version of TC_RETURN_IF_ERROR
// that can support error logging.
libtextclassifier3::Status status = KeyMapper<NamespaceId>::Delete(
*filesystem_, MakeNamespaceMapperFilename(base_dir_));
@@ -638,7 +654,7 @@ libtextclassifier3::Status DocumentStore::ResetNamespaceMapper() {
libtextclassifier3::Status DocumentStore::ResetCorpusMapper() {
// TODO(b/139734457): Replace ptr.reset()->Delete->Create flow with Reset().
corpus_mapper_.reset();
- // TODO(b/144458732): Implement a more robust version of TC_RETURN_IF_ERROR
+ // TODO(b/216487496): Implement a more robust version of TC_RETURN_IF_ERROR
// that can support error logging.
libtextclassifier3::Status status = KeyMapper<CorpusId>::Delete(
*filesystem_, MakeCorpusMapperFilename(base_dir_));
@@ -1749,5 +1765,63 @@ libtextclassifier3::Status DocumentStore::SetUsageScores(
return usage_store_->SetUsageScores(document_id, usage_scores);
}
+libtextclassifier3::StatusOr<
+ google::protobuf::RepeatedPtrField<DocumentDebugInfoProto::CorpusInfo>>
+DocumentStore::CollectCorpusInfo() const {
+ google::protobuf::RepeatedPtrField<DocumentDebugInfoProto::CorpusInfo>
+ corpus_info;
+ libtextclassifier3::StatusOr<const SchemaProto*> schema_proto_or =
+ schema_store_->GetSchema();
+ if (!schema_proto_or.ok()) {
+ return corpus_info;
+ }
+ // Maps from CorpusId to the corresponding protocol buffer in the result.
+ std::unordered_map<CorpusId, DocumentDebugInfoProto::CorpusInfo*> info_map;
+ std::unordered_map<NamespaceId, std::string> namespace_id_to_namespace =
+ namespace_mapper_->GetValuesToKeys();
+ const SchemaProto* schema_proto = schema_proto_or.ValueOrDie();
+ for (DocumentId document_id = 0; document_id < filter_cache_->num_elements();
+ ++document_id) {
+ if (!InternalDoesDocumentExist(document_id)) {
+ continue;
+ }
+ ICING_ASSIGN_OR_RETURN(const DocumentFilterData* filter_data,
+ filter_cache_->Get(document_id));
+ ICING_ASSIGN_OR_RETURN(const DocumentAssociatedScoreData* score_data,
+ score_cache_->Get(document_id));
+ const std::string& name_space =
+ namespace_id_to_namespace[filter_data->namespace_id()];
+ const std::string& schema =
+ schema_proto->types()[filter_data->schema_type_id()].schema_type();
+ auto iter = info_map.find(score_data->corpus_id());
+ if (iter == info_map.end()) {
+ DocumentDebugInfoProto::CorpusInfo* entry = corpus_info.Add();
+ entry->set_namespace_(name_space);
+ entry->set_schema(schema);
+ iter = info_map.insert({score_data->corpus_id(), entry}).first;
+ }
+ iter->second->set_total_documents(iter->second->total_documents() + 1);
+ iter->second->set_total_token(iter->second->total_token() +
+ score_data->length_in_tokens());
+ }
+ return corpus_info;
+}
+
+libtextclassifier3::StatusOr<DocumentDebugInfoProto>
+DocumentStore::GetDebugInfo(int verbosity) const {
+ DocumentDebugInfoProto debug_info;
+ *debug_info.mutable_document_storage_info() = GetStorageInfo();
+ ICING_ASSIGN_OR_RETURN(Crc32 crc, ComputeChecksum());
+ debug_info.set_crc(crc.Get());
+ if (verbosity > 0) {
+ ICING_ASSIGN_OR_RETURN(google::protobuf::RepeatedPtrField<
+ DocumentDebugInfoProto::CorpusInfo>
+ corpus_info,
+ CollectCorpusInfo());
+ *debug_info.mutable_corpus_info() = std::move(corpus_info);
+ }
+ return debug_info;
+}
+
} // namespace lib
} // namespace icing
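CollectCorpusInfo is a group-by over live documents: one pass, one entry per corpus, counters bumped in place. The same shape with a plain map (an assumption: strings stand in for CorpusId and the proto RepeatedPtrField):

#include <cstdio>
#include <map>
#include <string>
#include <vector>

struct Doc {
  std::string corpus;  // stand-in for the (namespace, schema type) pair
  int length_in_tokens;
};

struct CorpusInfo {
  int total_documents = 0;
  int total_token = 0;
};

// An entry is created on first sight and its counters are bumped in
// place, like the info_map in CollectCorpusInfo above.
std::map<std::string, CorpusInfo> Collect(const std::vector<Doc>& docs) {
  std::map<std::string, CorpusInfo> info;
  for (const Doc& doc : docs) {
    CorpusInfo& entry = info[doc.corpus];
    entry.total_documents++;
    entry.total_token += doc.length_in_tokens;
  }
  return info;
}

int main() {
  auto info = Collect({{"namespace1/email", 5},
                       {"namespace2/email", 3},
                       {"namespace2/email", 1}});
  std::printf("%d docs, %d tokens\n",
              info["namespace2/email"].total_documents,
              info["namespace2/email"].total_token);  // 2 docs, 4 tokens
}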
diff --git a/icing/store/document-store.h b/icing/store/document-store.h
index c85c989..e6d2e5c 100644
--- a/icing/store/document-store.h
+++ b/icing/store/document-store.h
@@ -27,6 +27,7 @@
#include "icing/file/file-backed-vector.h"
#include "icing/file/filesystem.h"
#include "icing/file/portable-file-backed-proto-log.h"
+#include "icing/proto/debug.pb.h"
#include "icing/proto/document.pb.h"
#include "icing/proto/document_wrapper.pb.h"
#include "icing/proto/logging.pb.h"
@@ -422,6 +423,17 @@ class DocumentStore {
// INTERNAL_ERROR on compute error
libtextclassifier3::StatusOr<Crc32> ComputeChecksum() const;
+ // Get debug information for the document store.
+ // verbosity <= 0, simplest debug information
+ // verbosity > 0, also returns the total number of documents and tokens in
+ // each (namespace, schema type) pair.
+ //
+ // Returns:
+ // DocumentDebugInfoProto on success
+ // INTERNAL_ERROR on IO errors, crc compute error
+ libtextclassifier3::StatusOr<DocumentDebugInfoProto> GetDebugInfo(
+ int verbosity) const;
+
private:
// Use DocumentStore::Create() to instantiate.
DocumentStore(const Filesystem* filesystem, std::string_view base_dir,
@@ -696,6 +708,13 @@ class DocumentStore {
// the document_id_mapper somehow became larger than the filter cache.
DocumentStorageInfoProto CalculateDocumentStatusCounts(
DocumentStorageInfoProto storage_info) const;
+
+ // Returns:
+ // - on success, a RepeatedPtrField of the collected CorpusInfo.
+ // - OUT_OF_RANGE on an internal inconsistency; this should never happen.
+ libtextclassifier3::StatusOr<google::protobuf::RepeatedPtrField<
+ DocumentDebugInfoProto::CorpusInfo>>
+ CollectCorpusInfo() const;
};
} // namespace lib
diff --git a/icing/store/document-store_test.cc b/icing/store/document-store_test.cc
index 78d2f9c..96d11bf 100644
--- a/icing/store/document-store_test.cc
+++ b/icing/store/document-store_test.cc
@@ -29,7 +29,6 @@
#include "icing/file/filesystem.h"
#include "icing/file/memory-mapped-file.h"
#include "icing/file/mock-filesystem.h"
-#include "icing/helpers/icu/icu-data-file-helper.h"
#include "icing/portable/equals-proto.h"
#include "icing/portable/platform.h"
#include "icing/proto/document.pb.h"
@@ -45,6 +44,7 @@
#include "icing/store/namespace-id.h"
#include "icing/testing/common-matchers.h"
#include "icing/testing/fake-clock.h"
+#include "icing/testing/icu-data-file-helper.h"
#include "icing/testing/test-data.h"
#include "icing/testing/tmp-directory.h"
#include "icing/tokenization/language-segmenter-factory.h"
@@ -3170,15 +3170,6 @@ TEST_F(DocumentStoreTest, DetectCompleteDataLoss) {
ASSERT_THAT(create_result.data_loss, Eq(DataLoss::COMPLETE));
}
-// TODO(b/185845269) Re-enable this test by copying over a full valid set of
-// document store files. Right now this test only includes the score_cache and
-// the document store header.
-//
-// This causes a problem now because this cl changes behavior to not consider an
-// InitializeExistingDerivedFiles failure to be a recovery if there is nothing
-// to recover because the doocument store is empty.
-#define DISABLE_BACKWARDS_COMPAT_TEST
-#ifndef DISABLE_BACKWARDS_COMPAT_TEST
TEST_F(DocumentStoreTest, LoadScoreCacheAndInitializeSuccessfully) {
// The directory testdata/score_cache_without_length_in_tokens/document_store
// contains only the scoring_cache and the document_store_header (holding the
@@ -3194,29 +3185,26 @@ TEST_F(DocumentStoreTest, LoadScoreCacheAndInitializeSuccessfully) {
// Get src files
std::string document_store_without_length_in_tokens;
- if (IsAndroidPlatform() || IsIosPlatform()) {
+ if (IsAndroidArm() || IsIosPlatform()) {
document_store_without_length_in_tokens = GetTestFilePath(
"icing/testdata/score_cache_without_length_in_tokens/"
"document_store_android_ios_compatible");
+ } else if (IsAndroidX86()) {
+ document_store_without_length_in_tokens = GetTestFilePath(
+ "icing/testdata/score_cache_without_length_in_tokens/"
+ "document_store_android_x86");
} else {
document_store_without_length_in_tokens = GetTestFilePath(
"icing/testdata/score_cache_without_length_in_tokens/"
"document_store");
}
- std::vector<std::string> document_store_files;
Filesystem filesystem;
- filesystem.ListDirectory(document_store_without_length_in_tokens.c_str(),
- &document_store_files);
-
- ICING_LOG(INFO) << "Copying files " << document_store_without_length_in_tokens
- << ' ' << document_store_files.size();
- for (size_t i = 0; i != document_store_files.size(); i++) {
- std::string src = absl_ports::StrCat(
- document_store_without_length_in_tokens, "/", document_store_files[i]);
- std::string dst =
- absl_ports::StrCat(document_store_dir_, "/", document_store_files[i]);
- ASSERT_THAT(filesystem_.CopyFile(src.c_str(), dst.c_str()), true);
- }
+ ICING_LOG(INFO) << "Copying files "
+ << document_store_without_length_in_tokens;
+ ASSERT_THAT(
+ filesystem.CopyDirectory(document_store_without_length_in_tokens.c_str(),
+ document_store_dir_.c_str(), /*recursive=*/true),
+ true);
InitializeStatsProto initialize_stats;
ICING_ASSERT_OK_AND_ASSIGN(
@@ -3227,12 +3215,11 @@ TEST_F(DocumentStoreTest, LoadScoreCacheAndInitializeSuccessfully) {
&initialize_stats));
std::unique_ptr<DocumentStore> doc_store =
std::move(create_result.document_store);
- // The store_cache trigger regeneration because its element size is
- // inconsistent: expected 20 (current new size), actual 12 (as per the v0
- // score_cache).
- EXPECT_TRUE(initialize_stats.has_document_store_recovery_cause());
+ // The document log uses the legacy v0 format, so a migration is
+ // needed, which also triggers regeneration.
+ EXPECT_EQ(initialize_stats.document_store_recovery_cause(),
+ InitializeStatsProto::LEGACY_DOCUMENT_LOG_FORMAT);
}
-#endif // DISABLE_BACKWARDS_COMPAT_TEST
TEST_F(DocumentStoreTest, DocumentStoreStorageInfo) {
ICING_ASSERT_OK_AND_ASSIGN(
@@ -3422,18 +3409,22 @@ TEST_F(DocumentStoreTest, InitializeForceRecoveryUpdatesTypeIds) {
{
// Create the document store the second time and force recovery
+ InitializeStatsProto initialize_stats;
ICING_ASSERT_OK_AND_ASSIGN(
DocumentStore::CreateResult create_result,
- DocumentStore::Create(
- &filesystem_, document_store_dir_, &fake_clock_, schema_store.get(),
- /*force_recovery_and_revalidate_documents=*/true));
+ DocumentStore::Create(&filesystem_, document_store_dir_, &fake_clock_,
+ schema_store.get(),
+ /*force_recovery_and_revalidate_documents=*/true,
+ &initialize_stats));
std::unique_ptr<DocumentStore> doc_store =
std::move(create_result.document_store);
// Ensure that the type id of the email document has been correctly updated.
ICING_ASSERT_OK_AND_ASSIGN(DocumentFilterData filter_data,
doc_store->GetDocumentFilterData(docid));
- ASSERT_THAT(filter_data.schema_type_id(), Eq(1));
+ EXPECT_THAT(filter_data.schema_type_id(), Eq(1));
+ EXPECT_THAT(initialize_stats.document_store_recovery_cause(),
+ Eq(InitializeStatsProto::SCHEMA_CHANGES_OUT_OF_SYNC));
}
}
@@ -3840,7 +3831,8 @@ TEST_F(DocumentStoreTest, MigrateToPortableFileBackedProtoLog) {
// Check that we didn't lose anything. A migration also doesn't technically
// count as a recovery.
EXPECT_THAT(create_result.data_loss, Eq(DataLoss::NONE));
- EXPECT_FALSE(initialize_stats.has_document_store_recovery_cause());
+ EXPECT_EQ(initialize_stats.document_store_recovery_cause(),
+ InitializeStatsProto::LEGACY_DOCUMENT_LOG_FORMAT);
// Document 1 and 3 were put normally, and document 2 was deleted in our
// testdata files.
@@ -3862,6 +3854,164 @@ TEST_F(DocumentStoreTest, MigrateToPortableFileBackedProtoLog) {
IsOkAndHolds(EqualsProto(document3)));
}
+TEST_F(DocumentStoreTest, GetDebugInfo) {
+ SchemaProto schema =
+ SchemaBuilder()
+ .AddType(SchemaTypeConfigBuilder()
+ .SetType("email")
+ .AddProperty(
+ PropertyConfigBuilder()
+ .SetName("subject")
+ .SetDataTypeString(MATCH_EXACT, TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_OPTIONAL))
+ .AddProperty(
+ PropertyConfigBuilder()
+ .SetName("body")
+ .SetDataTypeString(MATCH_EXACT, TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_OPTIONAL)))
+ .AddType(SchemaTypeConfigBuilder().SetType("person").AddProperty(
+ PropertyConfigBuilder()
+ .SetName("name")
+ .SetDataTypeString(MATCH_EXACT, TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_OPTIONAL)))
+ .Build();
+ std::string schema_store_dir = schema_store_dir_ + "_custom";
+ filesystem_.DeleteDirectoryRecursively(schema_store_dir.c_str());
+ filesystem_.CreateDirectoryRecursively(schema_store_dir.c_str());
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<SchemaStore> schema_store,
+ SchemaStore::Create(&filesystem_, schema_store_dir, &fake_clock_));
+
+ ICING_ASSERT_OK(schema_store->SetSchema(schema));
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ DocumentStore::CreateResult create_result,
+ DocumentStore::Create(&filesystem_, document_store_dir_, &fake_clock_,
+ schema_store.get()));
+ std::unique_ptr<DocumentStore> document_store =
+ std::move(create_result.document_store);
+
+ DocumentProto document1 = DocumentBuilder()
+ .SetKey("namespace1", "email/1")
+ .SetSchema("email")
+ .AddStringProperty("subject", "aa bb cc")
+ .AddStringProperty("body", "dd ee")
+ .SetCreationTimestampMs(1)
+ .Build();
+ ICING_ASSERT_OK(document_store->Put(document1, 5));
+
+ DocumentProto document2 = DocumentBuilder()
+ .SetKey("namespace2", "email/2")
+ .SetSchema("email")
+ .AddStringProperty("subject", "aa bb")
+ .AddStringProperty("body", "cc")
+ .SetCreationTimestampMs(1)
+ .Build();
+ ICING_ASSERT_OK(document_store->Put(document2, 3));
+
+ DocumentProto document3 = DocumentBuilder()
+ .SetKey("namespace2", "email/3")
+ .SetSchema("email")
+ .AddStringProperty("subject", "aa")
+ .AddStringProperty("body", "")
+ .SetCreationTimestampMs(1)
+ .Build();
+ ICING_ASSERT_OK(document_store->Put(document3, 1));
+
+ DocumentProto document4 = DocumentBuilder()
+ .SetKey("namespace1", "person/1")
+ .SetSchema("person")
+ .AddStringProperty("name", "test test")
+ .SetCreationTimestampMs(1)
+ .Build();
+ ICING_ASSERT_OK(document_store->Put(document4, 2));
+
+ ICING_ASSERT_OK_AND_ASSIGN(DocumentDebugInfoProto out1,
+ document_store->GetDebugInfo(/*verbosity=*/1));
+ EXPECT_THAT(out1.crc(), Gt(0));
+ EXPECT_THAT(out1.document_storage_info().num_alive_documents(), Eq(4));
+ EXPECT_THAT(out1.document_storage_info().num_deleted_documents(), Eq(0));
+ EXPECT_THAT(out1.document_storage_info().num_expired_documents(), Eq(0));
+
+ DocumentDebugInfoProto::CorpusInfo info1, info2, info3;
+ info1.set_namespace_("namespace1");
+ info1.set_schema("email");
+ info1.set_total_documents(1); // document1
+ info1.set_total_token(5);
+
+ info2.set_namespace_("namespace2");
+ info2.set_schema("email");
+ info2.set_total_documents(2); // document2 and document3
+ info2.set_total_token(4); // 3 + 1
+
+ info3.set_namespace_("namespace1");
+ info3.set_schema("person");
+ info3.set_total_documents(1); // document4
+ info3.set_total_token(2);
+
+ EXPECT_THAT(out1.corpus_info(),
+ UnorderedElementsAre(EqualsProto(info1), EqualsProto(info2),
+ EqualsProto(info3)));
+
+ // Delete document3.
+ ICING_ASSERT_OK(document_store->Delete("namespace2", "email/3"));
+ ICING_ASSERT_OK_AND_ASSIGN(DocumentDebugInfoProto out2,
+ document_store->GetDebugInfo(/*verbosity=*/1));
+ EXPECT_THAT(out2.crc(), Gt(0));
+ EXPECT_THAT(out2.crc(), Not(Eq(out1.crc())));
+ EXPECT_THAT(out2.document_storage_info().num_alive_documents(), Eq(3));
+ EXPECT_THAT(out2.document_storage_info().num_deleted_documents(), Eq(1));
+ EXPECT_THAT(out2.document_storage_info().num_expired_documents(), Eq(0));
+ info2.set_total_documents(1); // document2
+ info2.set_total_token(3);
+ EXPECT_THAT(out2.corpus_info(),
+ UnorderedElementsAre(EqualsProto(info1), EqualsProto(info2),
+ EqualsProto(info3)));
+
+ ICING_ASSERT_OK_AND_ASSIGN(DocumentDebugInfoProto out3,
+ document_store->GetDebugInfo(/*verbosity=*/0));
+ EXPECT_THAT(out3.corpus_info(), IsEmpty());
+}
+
+TEST_F(DocumentStoreTest, GetDebugInfoWithoutSchema) {
+ std::string schema_store_dir = schema_store_dir_ + "_custom";
+ filesystem_.DeleteDirectoryRecursively(schema_store_dir.c_str());
+ filesystem_.CreateDirectoryRecursively(schema_store_dir.c_str());
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<SchemaStore> schema_store,
+ SchemaStore::Create(&filesystem_, schema_store_dir, &fake_clock_));
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ DocumentStore::CreateResult create_result,
+ DocumentStore::Create(&filesystem_, document_store_dir_, &fake_clock_,
+ schema_store.get()));
+ std::unique_ptr<DocumentStore> document_store =
+ std::move(create_result.document_store);
+ ICING_ASSERT_OK_AND_ASSIGN(DocumentDebugInfoProto out,
+ document_store->GetDebugInfo(/*verbosity=*/1));
+ EXPECT_THAT(out.crc(), Gt(0));
+ EXPECT_THAT(out.document_storage_info().num_alive_documents(), Eq(0));
+ EXPECT_THAT(out.document_storage_info().num_deleted_documents(), Eq(0));
+ EXPECT_THAT(out.document_storage_info().num_expired_documents(), Eq(0));
+ EXPECT_THAT(out.corpus_info(), IsEmpty());
+}
+
+TEST_F(DocumentStoreTest, GetDebugInfoForEmptyDocumentStore) {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ DocumentStore::CreateResult create_result,
+ DocumentStore::Create(&filesystem_, document_store_dir_, &fake_clock_,
+ schema_store_.get()));
+ std::unique_ptr<DocumentStore> document_store =
+ std::move(create_result.document_store);
+ ICING_ASSERT_OK_AND_ASSIGN(DocumentDebugInfoProto out,
+ document_store->GetDebugInfo(/*verbosity=*/1));
+ EXPECT_THAT(out.crc(), Gt(0));
+ EXPECT_THAT(out.document_storage_info().num_alive_documents(), Eq(0));
+ EXPECT_THAT(out.document_storage_info().num_deleted_documents(), Eq(0));
+ EXPECT_THAT(out.document_storage_info().num_expired_documents(), Eq(0));
+ EXPECT_THAT(out.corpus_info(), IsEmpty());
+}
+
} // namespace
} // namespace lib
diff --git a/icing/helpers/icu/icu-data-file-helper.cc b/icing/testing/icu-data-file-helper.cc
index 6607c40..aaeb738 100644
--- a/icing/helpers/icu/icu-data-file-helper.cc
+++ b/icing/testing/icu-data-file-helper.cc
@@ -12,7 +12,7 @@
// See the License for the specific language governing permissions and
// limitations under the License.
-#include "icing/helpers/icu/icu-data-file-helper.h"
+#include "icing/testing/icu-data-file-helper.h"
#include <sys/mman.h>
diff --git a/icing/helpers/icu/icu-data-file-helper.h b/icing/testing/icu-data-file-helper.h
index 90f5bc7..d0276e7 100644
--- a/icing/helpers/icu/icu-data-file-helper.h
+++ b/icing/testing/icu-data-file-helper.h
@@ -12,8 +12,8 @@
// See the License for the specific language governing permissions and
// limitations under the License.
-#ifndef ICING_HELPERS_ICU_ICU_DATA_FILE_HELPER
-#define ICING_HELPERS_ICU_ICU_DATA_FILE_HELPER
+#ifndef ICING_TESTING_ICU_DATA_FILE_HELPER
+#define ICING_TESTING_ICU_DATA_FILE_HELPER
#include "icing/text_classifier/lib3/utils/base/status.h"
@@ -40,4 +40,4 @@ libtextclassifier3::Status SetUpICUDataFile(
} // namespace lib
} // namespace icing
-#endif // ICING_HELPERS_ICU_ICU_DATA_FILE_HELPER
+#endif // ICING_TESTING_ICU_DATA_FILE_HELPER
diff --git a/icing/testing/random-string.h b/icing/testing/random-string.h
index 3165bf6..fd8d87b 100644
--- a/icing/testing/random-string.h
+++ b/icing/testing/random-string.h
@@ -15,6 +15,7 @@
#ifndef ICING_TESTING_RANDOM_STRING_H_
#define ICING_TESTING_RANDOM_STRING_H_
+#include <algorithm>
#include <random>
#include <string>
diff --git a/icing/tokenization/icu/icu-language-segmenter.cc b/icing/tokenization/icu/icu-language-segmenter.cc
index 598ede7..8e0f789 100644
--- a/icing/tokenization/icu/icu-language-segmenter.cc
+++ b/icing/tokenization/icu/icu-language-segmenter.cc
@@ -59,34 +59,35 @@ class IcuLanguageSegmenterIterator : public LanguageSegmenter::Iterator {
~IcuLanguageSegmenterIterator() {
ubrk_close(break_iterator_);
- utext_close(&u_text_);
+ utext_close(u_text_);
}
// Advances to the next term. Returns false if it has reached the end.
bool Advance() override {
- // Prerequisite check
- if (term_end_index_exclusive_ == UBRK_DONE) {
- return false;
- }
+ while (true) {
+ // Prerequisite check
+ if (term_end_index_exclusive_ == UBRK_DONE) {
+ return false;
+ }
- if (term_end_index_exclusive_ == 0) {
- // First Advance() call
- term_start_index_ = ubrk_first(break_iterator_);
- } else {
- term_start_index_ = term_end_index_exclusive_;
- }
- term_end_index_exclusive_ = ubrk_next(break_iterator_);
+ if (term_end_index_exclusive_ == 0) {
+ // First Advance() call
+ term_start_index_ = ubrk_first(break_iterator_);
+ } else {
+ term_start_index_ = term_end_index_exclusive_;
+ }
+ term_end_index_exclusive_ = ubrk_next(break_iterator_);
- // Reached the end
- if (term_end_index_exclusive_ == UBRK_DONE) {
- MarkAsDone();
- return false;
- }
+ // Reached the end
+ if (term_end_index_exclusive_ == UBRK_DONE) {
+ MarkAsDone();
+ return false;
+ }
- if (!IsValidSegment()) {
- return Advance();
+ if (IsValidSegment()) {
+ return true;
+ }
}
- return true;
}
// Returns the current term. It can be called only when Advance() returns
@@ -253,7 +254,7 @@ class IcuLanguageSegmenterIterator : public LanguageSegmenter::Iterator {
: break_iterator_(nullptr),
text_(text),
locale_(locale),
- u_text_(UTEXT_INITIALIZER),
+ u_text_(nullptr),
offset_iterator_(text),
term_start_index_(0),
term_end_index_exclusive_(0) {}
@@ -261,10 +262,13 @@ class IcuLanguageSegmenterIterator : public LanguageSegmenter::Iterator {
// Returns true on success
bool Initialize() {
UErrorCode status = U_ZERO_ERROR;
- utext_openUTF8(&u_text_, text_.data(), text_.length(), &status);
+ u_text_ = utext_openUTF8(nullptr, text_.data(), text_.length(), &status);
+ if (u_text_ == nullptr) {
+ return false;
+ }
break_iterator_ = ubrk_open(UBRK_WORD, locale_.data(), /*text=*/nullptr,
/*textLength=*/0, &status);
- ubrk_setUText(break_iterator_, &u_text_, &status);
+ ubrk_setUText(break_iterator_, u_text_, &status);
return !U_FAILURE(status);
}
@@ -322,8 +326,8 @@ class IcuLanguageSegmenterIterator : public LanguageSegmenter::Iterator {
std::string_view locale_;
// A thin wrapper around the input UTF8 text, needed by break_iterator_.
- // utext_close() must be called after using.
- UText u_text_;
+ // Allocated by calling utext_openUTF8() and freed by calling utext_close().
+ UText* u_text_;
// Offset iterator. This iterator is not guaranteed to point to any particular
// character, but is guaranteed to point to a valid UTF character sequence.
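Rewriting Advance() as a loop, as above, removes the self-recursion that let the fuzzer overflow the stack on long runs of invalid segments. A toy iterator showing the loop shape (an assumption: a bool vector stands in for the ICU break iterator):

#include <cstdio>
#include <utility>
#include <vector>

class SegmentIterator {
 public:
  explicit SegmentIterator(std::vector<bool> valid)
      : valid_(std::move(valid)) {}

  // Iterative form: loop until a valid segment or the end, instead of the
  // old tail call `return Advance();` that grew the stack per skip.
  bool Advance() {
    while (true) {
      if (pos_ + 1 >= static_cast<int>(valid_.size())) {
        return false;  // reached the end
      }
      ++pos_;
      if (valid_[pos_]) {
        return true;  // found a valid segment
      }
    }
  }

 private:
  std::vector<bool> valid_;
  int pos_ = -1;
};

int main() {
  // Thousands of invalid segments no longer mean thousands of stack frames.
  std::vector<bool> segments(100000, false);
  segments.back() = true;
  SegmentIterator it(std::move(segments));
  std::printf("%d\n", it.Advance());  // 1
}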
diff --git a/icing/tokenization/icu/icu-language-segmenter_test.cc b/icing/tokenization/icu/icu-language-segmenter_test.cc
index 3090087..fe0b96e 100644
--- a/icing/tokenization/icu/icu-language-segmenter_test.cc
+++ b/icing/tokenization/icu/icu-language-segmenter_test.cc
@@ -21,8 +21,8 @@
#include "gmock/gmock.h"
#include "gtest/gtest.h"
#include "icing/absl_ports/str_cat.h"
-#include "icing/helpers/icu/icu-data-file-helper.h"
#include "icing/testing/common-matchers.h"
+#include "icing/testing/icu-data-file-helper.h"
#include "icing/testing/icu-i18n-test-utils.h"
#include "icing/testing/jni-test-helpers.h"
#include "icing/testing/test-data.h"
diff --git a/icing/tokenization/language-segmenter-iterator_test.cc b/icing/tokenization/language-segmenter-iterator_test.cc
index d293581..3aff45c 100644
--- a/icing/tokenization/language-segmenter-iterator_test.cc
+++ b/icing/tokenization/language-segmenter-iterator_test.cc
@@ -15,9 +15,9 @@
#include "gmock/gmock.h"
#include "gtest/gtest.h"
#include "icing/absl_ports/str_cat.h"
-#include "icing/helpers/icu/icu-data-file-helper.h"
#include "icing/portable/platform.h"
#include "icing/testing/common-matchers.h"
+#include "icing/testing/icu-data-file-helper.h"
#include "icing/testing/jni-test-helpers.h"
#include "icing/testing/test-data.h"
#include "icing/tokenization/language-segmenter-factory.h"
diff --git a/icing/tokenization/language-segmenter_benchmark.cc b/icing/tokenization/language-segmenter_benchmark.cc
index bd86169..6f7d4df 100644
--- a/icing/tokenization/language-segmenter_benchmark.cc
+++ b/icing/tokenization/language-segmenter_benchmark.cc
@@ -14,8 +14,8 @@
#include "testing/base/public/benchmark.h"
#include "gmock/gmock.h"
-#include "icing/helpers/icu/icu-data-file-helper.h"
#include "icing/testing/common-matchers.h"
+#include "icing/testing/icu-data-file-helper.h"
#include "icing/testing/test-data.h"
#include "icing/tokenization/language-segmenter-factory.h"
#include "icing/tokenization/language-segmenter.h"
diff --git a/icing/tokenization/plain-tokenizer.cc b/icing/tokenization/plain-tokenizer.cc
index df98bee..7a1949f 100644
--- a/icing/tokenization/plain-tokenizer.cc
+++ b/icing/tokenization/plain-tokenizer.cc
@@ -66,9 +66,9 @@ class PlainTokenIterator : public Tokenizer::Iterator {
Token GetToken() const override {
if (current_term_.empty()) {
- return Token(Token::INVALID);
+ return Token(Token::Type::INVALID);
}
- return Token(Token::REGULAR, current_term_);
+ return Token(Token::Type::REGULAR, current_term_);
}
libtextclassifier3::StatusOr<CharacterIterator> CalculateTokenStart()
diff --git a/icing/tokenization/plain-tokenizer_test.cc b/icing/tokenization/plain-tokenizer_test.cc
index f76ba9f..c48b51e 100644
--- a/icing/tokenization/plain-tokenizer_test.cc
+++ b/icing/tokenization/plain-tokenizer_test.cc
@@ -18,9 +18,9 @@
#include "gmock/gmock.h"
#include "icing/absl_ports/str_cat.h"
-#include "icing/helpers/icu/icu-data-file-helper.h"
#include "icing/portable/platform.h"
#include "icing/testing/common-matchers.h"
+#include "icing/testing/icu-data-file-helper.h"
#include "icing/testing/icu-i18n-test-utils.h"
#include "icing/testing/jni-test-helpers.h"
#include "icing/testing/test-data.h"
@@ -68,26 +68,27 @@ TEST_F(PlainTokenizerTest, Simple) {
EXPECT_THAT(plain_tokenizer->TokenizeAll(""), IsOkAndHolds(IsEmpty()));
- EXPECT_THAT(plain_tokenizer->TokenizeAll("Hello World"),
- IsOkAndHolds(ElementsAre(EqualsToken(Token::REGULAR, "Hello"),
- EqualsToken(Token::REGULAR, "World"))));
+ EXPECT_THAT(
+ plain_tokenizer->TokenizeAll("Hello World"),
+ IsOkAndHolds(ElementsAre(EqualsToken(Token::Type::REGULAR, "Hello"),
+ EqualsToken(Token::Type::REGULAR, "World"))));
EXPECT_THAT(
plain_tokenizer->TokenizeAll(
"Lorem ipsum dolor sit amet, consectetur adipiscing elit. "
"Duis efficitur iaculis auctor."),
- IsOkAndHolds(ElementsAre(EqualsToken(Token::REGULAR, "Lorem"),
- EqualsToken(Token::REGULAR, "ipsum"),
- EqualsToken(Token::REGULAR, "dolor"),
- EqualsToken(Token::REGULAR, "sit"),
- EqualsToken(Token::REGULAR, "amet"),
- EqualsToken(Token::REGULAR, "consectetur"),
- EqualsToken(Token::REGULAR, "adipiscing"),
- EqualsToken(Token::REGULAR, "elit"),
- EqualsToken(Token::REGULAR, "Duis"),
- EqualsToken(Token::REGULAR, "efficitur"),
- EqualsToken(Token::REGULAR, "iaculis"),
- EqualsToken(Token::REGULAR, "auctor"))));
+ IsOkAndHolds(ElementsAre(EqualsToken(Token::Type::REGULAR, "Lorem"),
+ EqualsToken(Token::Type::REGULAR, "ipsum"),
+ EqualsToken(Token::Type::REGULAR, "dolor"),
+ EqualsToken(Token::Type::REGULAR, "sit"),
+ EqualsToken(Token::Type::REGULAR, "amet"),
+ EqualsToken(Token::Type::REGULAR, "consectetur"),
+ EqualsToken(Token::Type::REGULAR, "adipiscing"),
+ EqualsToken(Token::Type::REGULAR, "elit"),
+ EqualsToken(Token::Type::REGULAR, "Duis"),
+ EqualsToken(Token::Type::REGULAR, "efficitur"),
+ EqualsToken(Token::Type::REGULAR, "iaculis"),
+ EqualsToken(Token::Type::REGULAR, "auctor"))));
}
TEST_F(PlainTokenizerTest, Whitespace) {
@@ -107,16 +108,18 @@ TEST_F(PlainTokenizerTest, Whitespace) {
// 0x0009 is horizontal tab, considered as a whitespace
std::string text_with_horizontal_tab =
absl_ports::StrCat("Hello", UCharToString(0x0009), "World");
- EXPECT_THAT(plain_tokenizer->TokenizeAll(text_with_horizontal_tab),
- IsOkAndHolds(ElementsAre(EqualsToken(Token::REGULAR, "Hello"),
- EqualsToken(Token::REGULAR, "World"))));
+ EXPECT_THAT(
+ plain_tokenizer->TokenizeAll(text_with_horizontal_tab),
+ IsOkAndHolds(ElementsAre(EqualsToken(Token::Type::REGULAR, "Hello"),
+ EqualsToken(Token::Type::REGULAR, "World"))));
// 0x000B is vertical tab, considered as a whitespace
std::string text_with_vertical_tab =
absl_ports::StrCat("Hello", UCharToString(0x000B), "World");
- EXPECT_THAT(plain_tokenizer->TokenizeAll(text_with_vertical_tab),
- IsOkAndHolds(ElementsAre(EqualsToken(Token::REGULAR, "Hello"),
- EqualsToken(Token::REGULAR, "World"))));
+ EXPECT_THAT(
+ plain_tokenizer->TokenizeAll(text_with_vertical_tab),
+ IsOkAndHolds(ElementsAre(EqualsToken(Token::Type::REGULAR, "Hello"),
+ EqualsToken(Token::Type::REGULAR, "World"))));
}
TEST_F(PlainTokenizerTest, Punctuation) {
@@ -131,38 +134,39 @@ TEST_F(PlainTokenizerTest, Punctuation) {
language_segmenter.get()));
// Half-width punctuation marks are filtered out.
- EXPECT_THAT(plain_tokenizer->TokenizeAll(
- "Hello, World! Hello: World. \"Hello\" World?"),
- IsOkAndHolds(ElementsAre(EqualsToken(Token::REGULAR, "Hello"),
- EqualsToken(Token::REGULAR, "World"),
- EqualsToken(Token::REGULAR, "Hello"),
- EqualsToken(Token::REGULAR, "World"),
- EqualsToken(Token::REGULAR, "Hello"),
- EqualsToken(Token::REGULAR, "World"))));
+ EXPECT_THAT(
+ plain_tokenizer->TokenizeAll(
+ "Hello, World! Hello: World. \"Hello\" World?"),
+ IsOkAndHolds(ElementsAre(EqualsToken(Token::Type::REGULAR, "Hello"),
+ EqualsToken(Token::Type::REGULAR, "World"),
+ EqualsToken(Token::Type::REGULAR, "Hello"),
+ EqualsToken(Token::Type::REGULAR, "World"),
+ EqualsToken(Token::Type::REGULAR, "Hello"),
+ EqualsToken(Token::Type::REGULAR, "World"))));
// Full-width punctuation marks are filtered out.
std::vector<std::string_view> exp_tokens;
if (IsCfStringTokenization()) {
EXPECT_THAT(
plain_tokenizer->TokenizeAll("你好,世界!你好:世界。“你好”世界?"),
- IsOkAndHolds(ElementsAre(EqualsToken(Token::REGULAR, "你"),
- EqualsToken(Token::REGULAR, "好"),
- EqualsToken(Token::REGULAR, "世界"),
- EqualsToken(Token::REGULAR, "你"),
- EqualsToken(Token::REGULAR, "好"),
- EqualsToken(Token::REGULAR, "世界"),
- EqualsToken(Token::REGULAR, "你"),
- EqualsToken(Token::REGULAR, "好"),
- EqualsToken(Token::REGULAR, "世界"))));
+ IsOkAndHolds(ElementsAre(EqualsToken(Token::Type::REGULAR, "你"),
+ EqualsToken(Token::Type::REGULAR, "好"),
+ EqualsToken(Token::Type::REGULAR, "世界"),
+ EqualsToken(Token::Type::REGULAR, "你"),
+ EqualsToken(Token::Type::REGULAR, "好"),
+ EqualsToken(Token::Type::REGULAR, "世界"),
+ EqualsToken(Token::Type::REGULAR, "你"),
+ EqualsToken(Token::Type::REGULAR, "好"),
+ EqualsToken(Token::Type::REGULAR, "世界"))));
} else {
EXPECT_THAT(
plain_tokenizer->TokenizeAll("你好,世界!你好:世界。“你好”世界?"),
- IsOkAndHolds(ElementsAre(EqualsToken(Token::REGULAR, "你好"),
- EqualsToken(Token::REGULAR, "世界"),
- EqualsToken(Token::REGULAR, "你好"),
- EqualsToken(Token::REGULAR, "世界"),
- EqualsToken(Token::REGULAR, "你好"),
- EqualsToken(Token::REGULAR, "世界"))));
+ IsOkAndHolds(ElementsAre(EqualsToken(Token::Type::REGULAR, "你好"),
+ EqualsToken(Token::Type::REGULAR, "世界"),
+ EqualsToken(Token::Type::REGULAR, "你好"),
+ EqualsToken(Token::Type::REGULAR, "世界"),
+ EqualsToken(Token::Type::REGULAR, "你好"),
+ EqualsToken(Token::Type::REGULAR, "世界"))));
}
}
@@ -180,14 +184,16 @@ TEST_F(PlainTokenizerTest, SpecialCharacters) {
// Right now we don't have special logic for these characters, just output
// them as tokens.
- EXPECT_THAT(plain_tokenizer->TokenizeAll("1+1"),
- IsOkAndHolds(ElementsAre(EqualsToken(Token::REGULAR, "1"),
- EqualsToken(Token::REGULAR, "+"),
- EqualsToken(Token::REGULAR, "1"))));
+ EXPECT_THAT(
+ plain_tokenizer->TokenizeAll("1+1"),
+ IsOkAndHolds(ElementsAre(EqualsToken(Token::Type::REGULAR, "1"),
+ EqualsToken(Token::Type::REGULAR, "+"),
+ EqualsToken(Token::Type::REGULAR, "1"))));
- EXPECT_THAT(plain_tokenizer->TokenizeAll("$50"),
- IsOkAndHolds(ElementsAre(EqualsToken(Token::REGULAR, "$"),
- EqualsToken(Token::REGULAR, "50"))));
+ EXPECT_THAT(
+ plain_tokenizer->TokenizeAll("$50"),
+ IsOkAndHolds(ElementsAre(EqualsToken(Token::Type::REGULAR, "$"),
+ EqualsToken(Token::Type::REGULAR, "50"))));
}
TEST_F(PlainTokenizerTest, CJKT) {
@@ -203,12 +209,13 @@ TEST_F(PlainTokenizerTest, CJKT) {
tokenizer_factory::CreateIndexingTokenizer(
StringIndexingConfig::TokenizerType::PLAIN,
language_segmenter.get()));
- EXPECT_THAT(plain_tokenizer->TokenizeAll("我每天走路去上班。"),
- IsOkAndHolds(ElementsAre(EqualsToken(Token::REGULAR, "我"),
- EqualsToken(Token::REGULAR, "每天"),
- EqualsToken(Token::REGULAR, "走路"),
- EqualsToken(Token::REGULAR, "去"),
- EqualsToken(Token::REGULAR, "上班"))));
+ EXPECT_THAT(
+ plain_tokenizer->TokenizeAll("我每天走路去上班。"),
+ IsOkAndHolds(ElementsAre(EqualsToken(Token::Type::REGULAR, "我"),
+ EqualsToken(Token::Type::REGULAR, "每天"),
+ EqualsToken(Token::Type::REGULAR, "走路"),
+ EqualsToken(Token::Type::REGULAR, "去"),
+ EqualsToken(Token::Type::REGULAR, "上班"))));
// Japanese
options = language_segmenter_factory::SegmenterOptions(ULOC_JAPANESE,
jni_cache_.get());
@@ -220,41 +227,44 @@ TEST_F(PlainTokenizerTest, CJKT) {
StringIndexingConfig::TokenizerType::PLAIN,
language_segmenter.get()));
if (IsCfStringTokenization()) {
- EXPECT_THAT(plain_tokenizer->TokenizeAll("私は毎日仕事に歩いています。"),
- IsOkAndHolds(ElementsAre(EqualsToken(Token::REGULAR, "私"),
- EqualsToken(Token::REGULAR, "は"),
- EqualsToken(Token::REGULAR, "毎日"),
- EqualsToken(Token::REGULAR, "仕事"),
- EqualsToken(Token::REGULAR, "に"),
- EqualsToken(Token::REGULAR, "歩い"),
- EqualsToken(Token::REGULAR, "て"),
- EqualsToken(Token::REGULAR, "い"),
- EqualsToken(Token::REGULAR, "ます"))));
+ EXPECT_THAT(
+ plain_tokenizer->TokenizeAll("私は毎日仕事に歩いています。"),
+ IsOkAndHolds(ElementsAre(EqualsToken(Token::Type::REGULAR, "私"),
+ EqualsToken(Token::Type::REGULAR, "は"),
+ EqualsToken(Token::Type::REGULAR, "毎日"),
+ EqualsToken(Token::Type::REGULAR, "仕事"),
+ EqualsToken(Token::Type::REGULAR, "に"),
+ EqualsToken(Token::Type::REGULAR, "歩い"),
+ EqualsToken(Token::Type::REGULAR, "て"),
+ EqualsToken(Token::Type::REGULAR, "い"),
+ EqualsToken(Token::Type::REGULAR, "ます"))));
} else {
- EXPECT_THAT(plain_tokenizer->TokenizeAll("私は毎日仕事に歩いています。"),
- IsOkAndHolds(ElementsAre(EqualsToken(Token::REGULAR, "私"),
- EqualsToken(Token::REGULAR, "は"),
- EqualsToken(Token::REGULAR, "毎日"),
- EqualsToken(Token::REGULAR, "仕事"),
- EqualsToken(Token::REGULAR, "に"),
- EqualsToken(Token::REGULAR, "歩"),
- EqualsToken(Token::REGULAR, "い"),
- EqualsToken(Token::REGULAR, "てい"),
- EqualsToken(Token::REGULAR, "ます"))));
+ EXPECT_THAT(
+ plain_tokenizer->TokenizeAll("私は毎日仕事に歩いています。"),
+ IsOkAndHolds(ElementsAre(EqualsToken(Token::Type::REGULAR, "私"),
+ EqualsToken(Token::Type::REGULAR, "は"),
+ EqualsToken(Token::Type::REGULAR, "毎日"),
+ EqualsToken(Token::Type::REGULAR, "仕事"),
+ EqualsToken(Token::Type::REGULAR, "に"),
+ EqualsToken(Token::Type::REGULAR, "歩"),
+ EqualsToken(Token::Type::REGULAR, "い"),
+ EqualsToken(Token::Type::REGULAR, "てい"),
+ EqualsToken(Token::Type::REGULAR, "ます"))));
}
// Khmer
- EXPECT_THAT(plain_tokenizer->TokenizeAll("ញុំដើរទៅធ្វើការរាល់ថ្ងៃ។"),
- IsOkAndHolds(ElementsAre(EqualsToken(Token::REGULAR, "ញុំ"),
- EqualsToken(Token::REGULAR, "ដើរទៅ"),
- EqualsToken(Token::REGULAR, "ធ្វើការ"),
- EqualsToken(Token::REGULAR, "រាល់ថ្ងៃ"))));
- // Korean
EXPECT_THAT(
- plain_tokenizer->TokenizeAll("나는 매일 출근합니다."),
- IsOkAndHolds(ElementsAre(EqualsToken(Token::REGULAR, "나는"),
- EqualsToken(Token::REGULAR, "매일"),
- EqualsToken(Token::REGULAR, "출근합니다"))));
+ plain_tokenizer->TokenizeAll("ញុំដើរទៅធ្វើការរាល់ថ្ងៃ។"),
+ IsOkAndHolds(ElementsAre(EqualsToken(Token::Type::REGULAR, "ញុំ"),
+ EqualsToken(Token::Type::REGULAR, "ដើរទៅ"),
+ EqualsToken(Token::Type::REGULAR, "ធ្វើការ"),
+ EqualsToken(Token::Type::REGULAR, "រាល់ថ្ងៃ"))));
+ // Korean
+ EXPECT_THAT(plain_tokenizer->TokenizeAll("나는 매일 출근합니다."),
+ IsOkAndHolds(ElementsAre(
+ EqualsToken(Token::Type::REGULAR, "나는"),
+ EqualsToken(Token::Type::REGULAR, "매일"),
+ EqualsToken(Token::Type::REGULAR, "출근합니다"))));
// Thai
// DIFFERENCE!! Disagreement over how to segment "ทุกวัน" (iOS groups).
@@ -264,19 +274,20 @@ TEST_F(PlainTokenizerTest, CJKT) {
std::vector<Token> tokens,
plain_tokenizer->TokenizeAll("ฉันเดินไปทำงานทุกวัน"));
- EXPECT_THAT(tokens, ElementsAre(EqualsToken(Token::REGULAR, "ฉัน"),
- EqualsToken(Token::REGULAR, "เดิน"),
- EqualsToken(Token::REGULAR, "ไป"),
- EqualsToken(Token::REGULAR, "ทำงาน"),
- EqualsToken(Token::REGULAR, "ทุกวัน")));
+ EXPECT_THAT(tokens, ElementsAre(EqualsToken(Token::Type::REGULAR, "ฉัน"),
+ EqualsToken(Token::Type::REGULAR, "เดิน"),
+ EqualsToken(Token::Type::REGULAR, "ไป"),
+ EqualsToken(Token::Type::REGULAR, "ทำงาน"),
+ EqualsToken(Token::Type::REGULAR, "ทุกวัน")));
} else {
- EXPECT_THAT(plain_tokenizer->TokenizeAll("ฉันเดินไปทำงานทุกวัน"),
- IsOkAndHolds(ElementsAre(EqualsToken(Token::REGULAR, "ฉัน"),
- EqualsToken(Token::REGULAR, "เดิน"),
- EqualsToken(Token::REGULAR, "ไป"),
- EqualsToken(Token::REGULAR, "ทำงาน"),
- EqualsToken(Token::REGULAR, "ทุก"),
- EqualsToken(Token::REGULAR, "วัน"))));
+ EXPECT_THAT(
+ plain_tokenizer->TokenizeAll("ฉันเดินไปทำงานทุกวัน"),
+ IsOkAndHolds(ElementsAre(EqualsToken(Token::Type::REGULAR, "ฉัน"),
+ EqualsToken(Token::Type::REGULAR, "เดิน"),
+ EqualsToken(Token::Type::REGULAR, "ไป"),
+ EqualsToken(Token::Type::REGULAR, "ทำงาน"),
+ EqualsToken(Token::Type::REGULAR, "ทุก"),
+ EqualsToken(Token::Type::REGULAR, "วัน"))));
}
}
@@ -295,7 +306,7 @@ TEST_F(PlainTokenizerTest, ResetToTokenStartingAfterSimple) {
auto iterator = plain_tokenizer->Tokenize(kText).ValueOrDie();
EXPECT_TRUE(iterator->ResetToTokenStartingAfter(0));
- EXPECT_THAT(iterator->GetToken(), EqualsToken(Token::REGULAR, "b"));
+ EXPECT_THAT(iterator->GetToken(), EqualsToken(Token::Type::REGULAR, "b"));
EXPECT_FALSE(iterator->ResetToTokenStartingAfter(2));
}
@@ -315,7 +326,7 @@ TEST_F(PlainTokenizerTest, ResetToTokenEndingBeforeSimple) {
auto iterator = plain_tokenizer->Tokenize(kText).ValueOrDie();
EXPECT_TRUE(iterator->ResetToTokenEndingBefore(2));
- EXPECT_THAT(iterator->GetToken(), EqualsToken(Token::REGULAR, "f"));
+ EXPECT_THAT(iterator->GetToken(), EqualsToken(Token::Type::REGULAR, "f"));
EXPECT_FALSE(iterator->ResetToTokenEndingBefore(0));
}
@@ -332,11 +343,12 @@ TEST_F(PlainTokenizerTest, ResetToTokenStartingAfter) {
language_segmenter.get()));
constexpr std::string_view kText = " foo . bar baz.. bat ";
- EXPECT_THAT(plain_tokenizer->TokenizeAll(kText),
- IsOkAndHolds(ElementsAre(EqualsToken(Token::REGULAR, "foo"),
- EqualsToken(Token::REGULAR, "bar"),
- EqualsToken(Token::REGULAR, "baz"),
- EqualsToken(Token::REGULAR, "bat"))));
+ EXPECT_THAT(
+ plain_tokenizer->TokenizeAll(kText),
+ IsOkAndHolds(ElementsAre(EqualsToken(Token::Type::REGULAR, "foo"),
+ EqualsToken(Token::Type::REGULAR, "bar"),
+ EqualsToken(Token::Type::REGULAR, "baz"),
+ EqualsToken(Token::Type::REGULAR, "bat"))));
std::vector<std::string> expected_text = {
"foo", // 0: " foo . bar"
"bar", // 1: "foo . bar "
@@ -359,12 +371,12 @@ TEST_F(PlainTokenizerTest, ResetToTokenStartingAfter) {
auto iterator = plain_tokenizer->Tokenize(kText).ValueOrDie();
EXPECT_TRUE(iterator->Advance());
- EXPECT_THAT(iterator->GetToken(), EqualsToken(Token::REGULAR, "foo"));
+ EXPECT_THAT(iterator->GetToken(), EqualsToken(Token::Type::REGULAR, "foo"));
for (int i = 0; i < kText.length(); ++i) {
if (i < expected_text.size()) {
EXPECT_TRUE(iterator->ResetToTokenStartingAfter(i));
EXPECT_THAT(iterator->GetToken(),
- EqualsToken(Token::REGULAR, expected_text[i]));
+ EqualsToken(Token::Type::REGULAR, expected_text[i]));
} else {
EXPECT_FALSE(iterator->ResetToTokenStartingAfter(i));
}
@@ -383,11 +395,12 @@ TEST_F(PlainTokenizerTest, ResetToTokenEndingBefore) {
language_segmenter.get()));
constexpr std::string_view kText = " foo . bar baz.. bat ";
- EXPECT_THAT(plain_tokenizer->TokenizeAll(kText),
- IsOkAndHolds(ElementsAre(EqualsToken(Token::REGULAR, "foo"),
- EqualsToken(Token::REGULAR, "bar"),
- EqualsToken(Token::REGULAR, "baz"),
- EqualsToken(Token::REGULAR, "bat"))));
+ EXPECT_THAT(
+ plain_tokenizer->TokenizeAll(kText),
+ IsOkAndHolds(ElementsAre(EqualsToken(Token::Type::REGULAR, "foo"),
+ EqualsToken(Token::Type::REGULAR, "bar"),
+ EqualsToken(Token::Type::REGULAR, "baz"),
+ EqualsToken(Token::Type::REGULAR, "bat"))));
std::vector<std::string> expected_text = {
"bat", // 20: "baz.. bat "
"baz", // 19: " baz.. bat"
@@ -410,13 +423,14 @@ TEST_F(PlainTokenizerTest, ResetToTokenEndingBefore) {
auto iterator = plain_tokenizer->Tokenize(kText).ValueOrDie();
EXPECT_TRUE(iterator->Advance());
- EXPECT_THAT(iterator->GetToken(), EqualsToken(Token::REGULAR, "foo"));
+ EXPECT_THAT(iterator->GetToken(), EqualsToken(Token::Type::REGULAR, "foo"));
for (int i = kText.length() - 1; i >= 0; --i) {
int expected_index = kText.length() - 1 - i;
if (expected_index < expected_text.size()) {
EXPECT_TRUE(iterator->ResetToTokenEndingBefore(i));
- EXPECT_THAT(iterator->GetToken(),
- EqualsToken(Token::REGULAR, expected_text[expected_index]));
+ EXPECT_THAT(
+ iterator->GetToken(),
+ EqualsToken(Token::Type::REGULAR, expected_text[expected_index]));
} else {
EXPECT_FALSE(iterator->ResetToTokenEndingBefore(i));
}
diff --git a/icing/tokenization/raw-query-tokenizer.cc b/icing/tokenization/raw-query-tokenizer.cc
index 2d461ee..8a27103 100644
--- a/icing/tokenization/raw-query-tokenizer.cc
+++ b/icing/tokenization/raw-query-tokenizer.cc
@@ -422,7 +422,7 @@ std::pair<TermType, std::string_view> GetTerm(std::string_view text,
// and [(cat OR)]. This helps assert extra rule 3: "OR" is ignored if there's no
// valid token on its right.
void RemoveLastTokenIfOrOperator(std::vector<Token>* tokens) {
- if (!tokens->empty() && tokens->back().type == Token::QUERY_OR) {
+ if (!tokens->empty() && tokens->back().type == Token::Type::QUERY_OR) {
tokens->pop_back();
}
}
@@ -436,11 +436,11 @@ libtextclassifier3::Status OutputOrOperatorToken(std::vector<Token>* tokens) {
}
Token::Type last_token_type = tokens->back().type;
switch (last_token_type) {
- case Token::REGULAR:
- case Token::QUERY_RIGHT_PARENTHESES:
- tokens->emplace_back(Token::QUERY_OR);
+ case Token::Type::REGULAR:
+ case Token::Type::QUERY_RIGHT_PARENTHESES:
+ tokens->emplace_back(Token::Type::QUERY_OR);
break;
- case Token::QUERY_OR:
+ case Token::Type::QUERY_OR:
// Ignores "OR" because there's already an "OR", e.g. "term1 OR OR term2"
break;
default:
@@ -481,21 +481,21 @@ libtextclassifier3::Status OutputToken(State new_state,
GetErrorMessage(ERROR_NON_ASCII_AS_PROPERTY_NAME));
}
}
- tokens->emplace_back(Token::QUERY_PROPERTY, current_term);
+ tokens->emplace_back(Token::Type::QUERY_PROPERTY, current_term);
} else {
- tokens->emplace_back(Token::REGULAR, current_term);
+ tokens->emplace_back(Token::Type::REGULAR, current_term);
}
break;
case LEFT_PARENTHESES:
- tokens->emplace_back(Token::QUERY_LEFT_PARENTHESES);
+ tokens->emplace_back(Token::Type::QUERY_LEFT_PARENTHESES);
break;
case RIGHT_PARENTHESES:
// Ignores "OR" if it's followed by right parentheses.
RemoveLastTokenIfOrOperator(tokens);
- tokens->emplace_back(Token::QUERY_RIGHT_PARENTHESES);
+ tokens->emplace_back(Token::Type::QUERY_RIGHT_PARENTHESES);
break;
case EXCLUSION_OPERATOR:
- tokens->emplace_back(Token::QUERY_EXCLUSION);
+ tokens->emplace_back(Token::Type::QUERY_EXCLUSION);
break;
case OR_OPERATOR:
return OutputOrOperatorToken(tokens);
@@ -648,7 +648,7 @@ class RawQueryTokenIterator : public Tokenizer::Iterator {
Token GetToken() const override {
if (current_ < 0 || current_ >= tokens_.size()) {
- return Token(Token::INVALID);
+ return Token(Token::Type::INVALID);
}
return tokens_.at(current_);
}
diff --git a/icing/tokenization/raw-query-tokenizer_test.cc b/icing/tokenization/raw-query-tokenizer_test.cc
index 500efa0..c6d981d 100644
--- a/icing/tokenization/raw-query-tokenizer_test.cc
+++ b/icing/tokenization/raw-query-tokenizer_test.cc
@@ -16,9 +16,9 @@
#include "gmock/gmock.h"
#include "gtest/gtest.h"
-#include "icing/helpers/icu/icu-data-file-helper.h"
#include "icing/portable/platform.h"
#include "icing/testing/common-matchers.h"
+#include "icing/testing/icu-data-file-helper.h"
#include "icing/testing/test-data.h"
#include "icing/tokenization/language-segmenter-factory.h"
#include "icing/tokenization/tokenizer-factory.h"
@@ -59,13 +59,15 @@ TEST_F(RawQueryTokenizerTest, Simple) {
tokenizer_factory::CreateQueryTokenizer(tokenizer_factory::RAW_QUERY,
language_segmenter.get()));
- EXPECT_THAT(raw_query_tokenizer->TokenizeAll("Hello World!"),
- IsOkAndHolds(ElementsAre(EqualsToken(Token::REGULAR, "Hello"),
- EqualsToken(Token::REGULAR, "World"))));
+ EXPECT_THAT(
+ raw_query_tokenizer->TokenizeAll("Hello World!"),
+ IsOkAndHolds(ElementsAre(EqualsToken(Token::Type::REGULAR, "Hello"),
+ EqualsToken(Token::Type::REGULAR, "World"))));
- EXPECT_THAT(raw_query_tokenizer->TokenizeAll("hElLo WORLD"),
- IsOkAndHolds(ElementsAre(EqualsToken(Token::REGULAR, "hElLo"),
- EqualsToken(Token::REGULAR, "WORLD"))));
+ EXPECT_THAT(
+ raw_query_tokenizer->TokenizeAll("hElLo WORLD"),
+ IsOkAndHolds(ElementsAre(EqualsToken(Token::Type::REGULAR, "hElLo"),
+ EqualsToken(Token::Type::REGULAR, "WORLD"))));
}
TEST_F(RawQueryTokenizerTest, Parentheses) {
@@ -80,82 +82,82 @@ TEST_F(RawQueryTokenizerTest, Parentheses) {
EXPECT_THAT(raw_query_tokenizer->TokenizeAll("()"),
IsOkAndHolds(ElementsAre(
- EqualsToken(Token::QUERY_LEFT_PARENTHESES, ""),
- EqualsToken(Token::QUERY_RIGHT_PARENTHESES, ""))));
+ EqualsToken(Token::Type::QUERY_LEFT_PARENTHESES, ""),
+ EqualsToken(Token::Type::QUERY_RIGHT_PARENTHESES, ""))));
EXPECT_THAT(raw_query_tokenizer->TokenizeAll("( )"),
IsOkAndHolds(ElementsAre(
- EqualsToken(Token::QUERY_LEFT_PARENTHESES, ""),
- EqualsToken(Token::QUERY_RIGHT_PARENTHESES, ""))));
+ EqualsToken(Token::Type::QUERY_LEFT_PARENTHESES, ""),
+ EqualsToken(Token::Type::QUERY_RIGHT_PARENTHESES, ""))));
EXPECT_THAT(raw_query_tokenizer->TokenizeAll("(term1 term2)"),
IsOkAndHolds(ElementsAre(
- EqualsToken(Token::QUERY_LEFT_PARENTHESES, ""),
- EqualsToken(Token::REGULAR, "term1"),
- EqualsToken(Token::REGULAR, "term2"),
- EqualsToken(Token::QUERY_RIGHT_PARENTHESES, ""))));
+ EqualsToken(Token::Type::QUERY_LEFT_PARENTHESES, ""),
+ EqualsToken(Token::Type::REGULAR, "term1"),
+ EqualsToken(Token::Type::REGULAR, "term2"),
+ EqualsToken(Token::Type::QUERY_RIGHT_PARENTHESES, ""))));
EXPECT_THAT(raw_query_tokenizer->TokenizeAll("((term1 term2) (term3 term4))"),
IsOkAndHolds(ElementsAre(
- EqualsToken(Token::QUERY_LEFT_PARENTHESES, ""),
- EqualsToken(Token::QUERY_LEFT_PARENTHESES, ""),
- EqualsToken(Token::REGULAR, "term1"),
- EqualsToken(Token::REGULAR, "term2"),
- EqualsToken(Token::QUERY_RIGHT_PARENTHESES, ""),
- EqualsToken(Token::QUERY_LEFT_PARENTHESES, ""),
- EqualsToken(Token::REGULAR, "term3"),
- EqualsToken(Token::REGULAR, "term4"),
- EqualsToken(Token::QUERY_RIGHT_PARENTHESES, ""),
- EqualsToken(Token::QUERY_RIGHT_PARENTHESES, ""))));
+ EqualsToken(Token::Type::QUERY_LEFT_PARENTHESES, ""),
+ EqualsToken(Token::Type::QUERY_LEFT_PARENTHESES, ""),
+ EqualsToken(Token::Type::REGULAR, "term1"),
+ EqualsToken(Token::Type::REGULAR, "term2"),
+ EqualsToken(Token::Type::QUERY_RIGHT_PARENTHESES, ""),
+ EqualsToken(Token::Type::QUERY_LEFT_PARENTHESES, ""),
+ EqualsToken(Token::Type::REGULAR, "term3"),
+ EqualsToken(Token::Type::REGULAR, "term4"),
+ EqualsToken(Token::Type::QUERY_RIGHT_PARENTHESES, ""),
+ EqualsToken(Token::Type::QUERY_RIGHT_PARENTHESES, ""))));
EXPECT_THAT(raw_query_tokenizer->TokenizeAll("term1(term2)"),
IsOkAndHolds(ElementsAre(
- EqualsToken(Token::REGULAR, "term1"),
- EqualsToken(Token::QUERY_LEFT_PARENTHESES, ""),
- EqualsToken(Token::REGULAR, "term2"),
- EqualsToken(Token::QUERY_RIGHT_PARENTHESES, ""))));
+ EqualsToken(Token::Type::REGULAR, "term1"),
+ EqualsToken(Token::Type::QUERY_LEFT_PARENTHESES, ""),
+ EqualsToken(Token::Type::REGULAR, "term2"),
+ EqualsToken(Token::Type::QUERY_RIGHT_PARENTHESES, ""))));
- EXPECT_THAT(
- raw_query_tokenizer->TokenizeAll("(term1)term2"),
- IsOkAndHolds(ElementsAre(EqualsToken(Token::QUERY_LEFT_PARENTHESES, ""),
- EqualsToken(Token::REGULAR, "term1"),
- EqualsToken(Token::QUERY_RIGHT_PARENTHESES, ""),
- EqualsToken(Token::REGULAR, "term2"))));
+ EXPECT_THAT(raw_query_tokenizer->TokenizeAll("(term1)term2"),
+ IsOkAndHolds(ElementsAre(
+ EqualsToken(Token::Type::QUERY_LEFT_PARENTHESES, ""),
+ EqualsToken(Token::Type::REGULAR, "term1"),
+ EqualsToken(Token::Type::QUERY_RIGHT_PARENTHESES, ""),
+ EqualsToken(Token::Type::REGULAR, "term2"))));
EXPECT_THAT(raw_query_tokenizer->TokenizeAll("(term1)(term2)"),
IsOkAndHolds(ElementsAre(
- EqualsToken(Token::QUERY_LEFT_PARENTHESES, ""),
- EqualsToken(Token::REGULAR, "term1"),
- EqualsToken(Token::QUERY_RIGHT_PARENTHESES, ""),
- EqualsToken(Token::QUERY_LEFT_PARENTHESES, ""),
- EqualsToken(Token::REGULAR, "term2"),
- EqualsToken(Token::QUERY_RIGHT_PARENTHESES, ""))));
-
- EXPECT_THAT(
- raw_query_tokenizer->TokenizeAll("(term1)-term2"),
- IsOkAndHolds(ElementsAre(EqualsToken(Token::QUERY_LEFT_PARENTHESES, ""),
- EqualsToken(Token::REGULAR, "term1"),
- EqualsToken(Token::QUERY_RIGHT_PARENTHESES, ""),
- EqualsToken(Token::QUERY_EXCLUSION, ""),
- EqualsToken(Token::REGULAR, "term2"))));
+ EqualsToken(Token::Type::QUERY_LEFT_PARENTHESES, ""),
+ EqualsToken(Token::Type::REGULAR, "term1"),
+ EqualsToken(Token::Type::QUERY_RIGHT_PARENTHESES, ""),
+ EqualsToken(Token::Type::QUERY_LEFT_PARENTHESES, ""),
+ EqualsToken(Token::Type::REGULAR, "term2"),
+ EqualsToken(Token::Type::QUERY_RIGHT_PARENTHESES, ""))));
+
+ EXPECT_THAT(raw_query_tokenizer->TokenizeAll("(term1)-term2"),
+ IsOkAndHolds(ElementsAre(
+ EqualsToken(Token::Type::QUERY_LEFT_PARENTHESES, ""),
+ EqualsToken(Token::Type::REGULAR, "term1"),
+ EqualsToken(Token::Type::QUERY_RIGHT_PARENTHESES, ""),
+ EqualsToken(Token::Type::QUERY_EXCLUSION, ""),
+ EqualsToken(Token::Type::REGULAR, "term2"))));
- EXPECT_THAT(
- raw_query_tokenizer->TokenizeAll("(term1)OR term2"),
- IsOkAndHolds(ElementsAre(EqualsToken(Token::QUERY_LEFT_PARENTHESES, ""),
- EqualsToken(Token::REGULAR, "term1"),
- EqualsToken(Token::QUERY_RIGHT_PARENTHESES, ""),
- EqualsToken(Token::QUERY_OR, ""),
- EqualsToken(Token::REGULAR, "term2"))));
+ EXPECT_THAT(raw_query_tokenizer->TokenizeAll("(term1)OR term2"),
+ IsOkAndHolds(ElementsAre(
+ EqualsToken(Token::Type::QUERY_LEFT_PARENTHESES, ""),
+ EqualsToken(Token::Type::REGULAR, "term1"),
+ EqualsToken(Token::Type::QUERY_RIGHT_PARENTHESES, ""),
+ EqualsToken(Token::Type::QUERY_OR, ""),
+ EqualsToken(Token::Type::REGULAR, "term2"))));
EXPECT_THAT(raw_query_tokenizer->TokenizeAll("(term1)OR(term2)"),
IsOkAndHolds(ElementsAre(
- EqualsToken(Token::QUERY_LEFT_PARENTHESES, ""),
- EqualsToken(Token::REGULAR, "term1"),
- EqualsToken(Token::QUERY_RIGHT_PARENTHESES, ""),
- EqualsToken(Token::QUERY_OR, ""),
- EqualsToken(Token::QUERY_LEFT_PARENTHESES, ""),
- EqualsToken(Token::REGULAR, "term2"),
- EqualsToken(Token::QUERY_RIGHT_PARENTHESES, ""))));
+ EqualsToken(Token::Type::QUERY_LEFT_PARENTHESES, ""),
+ EqualsToken(Token::Type::REGULAR, "term1"),
+ EqualsToken(Token::Type::QUERY_RIGHT_PARENTHESES, ""),
+ EqualsToken(Token::Type::QUERY_OR, ""),
+ EqualsToken(Token::Type::QUERY_LEFT_PARENTHESES, ""),
+ EqualsToken(Token::Type::REGULAR, "term2"),
+ EqualsToken(Token::Type::QUERY_RIGHT_PARENTHESES, ""))));
EXPECT_THAT(raw_query_tokenizer->TokenizeAll("(term1):term2"),
StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT,
@@ -180,44 +182,49 @@ TEST_F(RawQueryTokenizerTest, Exclustion) {
tokenizer_factory::CreateQueryTokenizer(tokenizer_factory::RAW_QUERY,
language_segmenter.get()));
- EXPECT_THAT(raw_query_tokenizer->TokenizeAll("-term1"),
- IsOkAndHolds(ElementsAre(EqualsToken(Token::QUERY_EXCLUSION, ""),
- EqualsToken(Token::REGULAR, "term1"))));
+ EXPECT_THAT(
+ raw_query_tokenizer->TokenizeAll("-term1"),
+ IsOkAndHolds(ElementsAre(EqualsToken(Token::Type::QUERY_EXCLUSION, ""),
+ EqualsToken(Token::Type::REGULAR, "term1"))));
EXPECT_THAT(raw_query_tokenizer->TokenizeAll("(-term1)"),
IsOkAndHolds(ElementsAre(
- EqualsToken(Token::QUERY_LEFT_PARENTHESES, ""),
- EqualsToken(Token::QUERY_EXCLUSION, ""),
- EqualsToken(Token::REGULAR, "term1"),
- EqualsToken(Token::QUERY_RIGHT_PARENTHESES, ""))));
+ EqualsToken(Token::Type::QUERY_LEFT_PARENTHESES, ""),
+ EqualsToken(Token::Type::QUERY_EXCLUSION, ""),
+ EqualsToken(Token::Type::REGULAR, "term1"),
+ EqualsToken(Token::Type::QUERY_RIGHT_PARENTHESES, ""))));
// Exclusion operator is ignored
- EXPECT_THAT(raw_query_tokenizer->TokenizeAll("- term1"),
- IsOkAndHolds(ElementsAre(EqualsToken(Token::REGULAR, "term1"))));
+ EXPECT_THAT(
+ raw_query_tokenizer->TokenizeAll("- term1"),
+ IsOkAndHolds(ElementsAre(EqualsToken(Token::Type::REGULAR, "term1"))));
// Exclusion operator is ignored
- EXPECT_THAT(raw_query_tokenizer->TokenizeAll("term1- term2"),
- IsOkAndHolds(ElementsAre(EqualsToken(Token::REGULAR, "term1"),
- EqualsToken(Token::REGULAR, "term2"))));
+ EXPECT_THAT(
+ raw_query_tokenizer->TokenizeAll("term1- term2"),
+ IsOkAndHolds(ElementsAre(EqualsToken(Token::Type::REGULAR, "term1"),
+ EqualsToken(Token::Type::REGULAR, "term2"))));
// Exclusion operator is ignored
EXPECT_THAT(raw_query_tokenizer->TokenizeAll("(term1 -)"),
IsOkAndHolds(ElementsAre(
- EqualsToken(Token::QUERY_LEFT_PARENTHESES, ""),
- EqualsToken(Token::REGULAR, "term1"),
- EqualsToken(Token::QUERY_RIGHT_PARENTHESES, ""))));
+ EqualsToken(Token::Type::QUERY_LEFT_PARENTHESES, ""),
+ EqualsToken(Token::Type::REGULAR, "term1"),
+ EqualsToken(Token::Type::QUERY_RIGHT_PARENTHESES, ""))));
// First exclusion operator is ignored
- EXPECT_THAT(raw_query_tokenizer->TokenizeAll("--term1"),
- IsOkAndHolds(ElementsAre(EqualsToken(Token::QUERY_EXCLUSION, ""),
- EqualsToken(Token::REGULAR, "term1"))));
+ EXPECT_THAT(
+ raw_query_tokenizer->TokenizeAll("--term1"),
+ IsOkAndHolds(ElementsAre(EqualsToken(Token::Type::QUERY_EXCLUSION, ""),
+ EqualsToken(Token::Type::REGULAR, "term1"))));
// First "-" is exclusion operator, second is not and will be discarded.
// In other words, exclusion only applies to the term right after it.
- EXPECT_THAT(raw_query_tokenizer->TokenizeAll("-term1-term2"),
- IsOkAndHolds(ElementsAre(EqualsToken(Token::QUERY_EXCLUSION, ""),
- EqualsToken(Token::REGULAR, "term1"),
- EqualsToken(Token::REGULAR, "term2"))));
+ EXPECT_THAT(
+ raw_query_tokenizer->TokenizeAll("-term1-term2"),
+ IsOkAndHolds(ElementsAre(EqualsToken(Token::Type::QUERY_EXCLUSION, ""),
+ EqualsToken(Token::Type::REGULAR, "term1"),
+ EqualsToken(Token::Type::REGULAR, "term2"))));
EXPECT_THAT(raw_query_tokenizer->TokenizeAll("-(term1)"),
StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT,
@@ -249,73 +256,75 @@ TEST_F(RawQueryTokenizerTest, PropertyRestriction) {
tokenizer_factory::CreateQueryTokenizer(tokenizer_factory::RAW_QUERY,
language_segmenter.get()));
- EXPECT_THAT(
- raw_query_tokenizer->TokenizeAll("property1:term1"),
- IsOkAndHolds(ElementsAre(EqualsToken(Token::QUERY_PROPERTY, "property1"),
- EqualsToken(Token::REGULAR, "term1"))));
+ EXPECT_THAT(raw_query_tokenizer->TokenizeAll("property1:term1"),
+ IsOkAndHolds(ElementsAre(
+ EqualsToken(Token::Type::QUERY_PROPERTY, "property1"),
+ EqualsToken(Token::Type::REGULAR, "term1"))));
EXPECT_THAT(raw_query_tokenizer->TokenizeAll("(property1:term1)"),
IsOkAndHolds(ElementsAre(
- EqualsToken(Token::QUERY_LEFT_PARENTHESES, ""),
- EqualsToken(Token::QUERY_PROPERTY, "property1"),
- EqualsToken(Token::REGULAR, "term1"),
- EqualsToken(Token::QUERY_RIGHT_PARENTHESES, ""))));
+ EqualsToken(Token::Type::QUERY_LEFT_PARENTHESES, ""),
+ EqualsToken(Token::Type::QUERY_PROPERTY, "property1"),
+ EqualsToken(Token::Type::REGULAR, "term1"),
+ EqualsToken(Token::Type::QUERY_RIGHT_PARENTHESES, ""))));
// Colon is ignored
- EXPECT_THAT(raw_query_tokenizer->TokenizeAll(":term1"),
- IsOkAndHolds(ElementsAre(EqualsToken(Token::REGULAR, "term1"))));
+ EXPECT_THAT(
+ raw_query_tokenizer->TokenizeAll(":term1"),
+ IsOkAndHolds(ElementsAre(EqualsToken(Token::Type::REGULAR, "term1"))));
// Colon is ignored
EXPECT_THAT(raw_query_tokenizer->TokenizeAll("(:term1)"),
IsOkAndHolds(ElementsAre(
- EqualsToken(Token::QUERY_LEFT_PARENTHESES, ""),
- EqualsToken(Token::REGULAR, "term1"),
- EqualsToken(Token::QUERY_RIGHT_PARENTHESES, ""))));
+ EqualsToken(Token::Type::QUERY_LEFT_PARENTHESES, ""),
+ EqualsToken(Token::Type::REGULAR, "term1"),
+ EqualsToken(Token::Type::QUERY_RIGHT_PARENTHESES, ""))));
// Colon is ignored
- EXPECT_THAT(raw_query_tokenizer->TokenizeAll("term1:"),
- IsOkAndHolds(ElementsAre(EqualsToken(Token::REGULAR, "term1"))));
+ EXPECT_THAT(
+ raw_query_tokenizer->TokenizeAll("term1:"),
+ IsOkAndHolds(ElementsAre(EqualsToken(Token::Type::REGULAR, "term1"))));
// property name can be a path
EXPECT_THAT(raw_query_tokenizer->TokenizeAll("email.title:hello"),
- IsOkAndHolds(
- ElementsAre(EqualsToken(Token::QUERY_PROPERTY, "email.title"),
- EqualsToken(Token::REGULAR, "hello"))));
+ IsOkAndHolds(ElementsAre(
+ EqualsToken(Token::Type::QUERY_PROPERTY, "email.title"),
+ EqualsToken(Token::Type::REGULAR, "hello"))));
// The first colon ":" triggers property restriction, the second colon is used
// as a word connector per ICU's rule
// (https://unicode.org/reports/tr29/#Word_Boundaries).
- EXPECT_THAT(
- raw_query_tokenizer->TokenizeAll("property:foo:bar"),
- IsOkAndHolds(ElementsAre(EqualsToken(Token::QUERY_PROPERTY, "property"),
- EqualsToken(Token::REGULAR, "foo:bar"))));
+ EXPECT_THAT(raw_query_tokenizer->TokenizeAll("property:foo:bar"),
+ IsOkAndHolds(ElementsAre(
+ EqualsToken(Token::Type::QUERY_PROPERTY, "property"),
+ EqualsToken(Token::Type::REGULAR, "foo:bar"))));
// Property restriction only applies to the term right after it.
// Note: "term1:term2" is not a term but 2 terms because word connectors
// don't apply to numbers and alphabets.
- EXPECT_THAT(
- raw_query_tokenizer->TokenizeAll("property1:term1:term2"),
- IsOkAndHolds(ElementsAre(EqualsToken(Token::QUERY_PROPERTY, "property1"),
- EqualsToken(Token::REGULAR, "term1"),
- EqualsToken(Token::REGULAR, "term2"))));
+ EXPECT_THAT(raw_query_tokenizer->TokenizeAll("property1:term1:term2"),
+ IsOkAndHolds(ElementsAre(
+ EqualsToken(Token::Type::QUERY_PROPERTY, "property1"),
+ EqualsToken(Token::Type::REGULAR, "term1"),
+ EqualsToken(Token::Type::REGULAR, "term2"))));
- EXPECT_THAT(
- raw_query_tokenizer->TokenizeAll("property1:今天:天气"),
- IsOkAndHolds(ElementsAre(EqualsToken(Token::QUERY_PROPERTY, "property1"),
- EqualsToken(Token::REGULAR, "今天"),
- EqualsToken(Token::REGULAR, "天气"))));
+ EXPECT_THAT(raw_query_tokenizer->TokenizeAll("property1:今天:天气"),
+ IsOkAndHolds(ElementsAre(
+ EqualsToken(Token::Type::QUERY_PROPERTY, "property1"),
+ EqualsToken(Token::Type::REGULAR, "今天"),
+ EqualsToken(Token::Type::REGULAR, "天气"))));
- EXPECT_THAT(
- raw_query_tokenizer->TokenizeAll("property1:term1-"),
- IsOkAndHolds(ElementsAre(EqualsToken(Token::QUERY_PROPERTY, "property1"),
- EqualsToken(Token::REGULAR, "term1"))));
+ EXPECT_THAT(raw_query_tokenizer->TokenizeAll("property1:term1-"),
+ IsOkAndHolds(ElementsAre(
+ EqualsToken(Token::Type::QUERY_PROPERTY, "property1"),
+ EqualsToken(Token::Type::REGULAR, "term1"))));
// Multiple continuous colons will still be recognized as a property
// restriction operator
- EXPECT_THAT(
- raw_query_tokenizer->TokenizeAll("property1::term1"),
- IsOkAndHolds(ElementsAre(EqualsToken(Token::QUERY_PROPERTY, "property1"),
- EqualsToken(Token::REGULAR, "term1"))));
+ EXPECT_THAT(raw_query_tokenizer->TokenizeAll("property1::term1"),
+ IsOkAndHolds(ElementsAre(
+ EqualsToken(Token::Type::QUERY_PROPERTY, "property1"),
+ EqualsToken(Token::Type::REGULAR, "term1"))));
EXPECT_THAT(
raw_query_tokenizer->TokenizeAll("property1:(term1)"),
@@ -345,105 +354,109 @@ TEST_F(RawQueryTokenizerTest, OR) {
tokenizer_factory::CreateQueryTokenizer(tokenizer_factory::RAW_QUERY,
language_segmenter.get()));
- EXPECT_THAT(raw_query_tokenizer->TokenizeAll("term1 OR term2"),
- IsOkAndHolds(ElementsAre(EqualsToken(Token::REGULAR, "term1"),
- EqualsToken(Token::QUERY_OR, ""),
- EqualsToken(Token::REGULAR, "term2"))));
+ EXPECT_THAT(
+ raw_query_tokenizer->TokenizeAll("term1 OR term2"),
+ IsOkAndHolds(ElementsAre(EqualsToken(Token::Type::REGULAR, "term1"),
+ EqualsToken(Token::Type::QUERY_OR, ""),
+ EqualsToken(Token::Type::REGULAR, "term2"))));
// Two continuous "OR"s are treated as one
- EXPECT_THAT(raw_query_tokenizer->TokenizeAll("term1 OR OR term2"),
- IsOkAndHolds(ElementsAre(EqualsToken(Token::REGULAR, "term1"),
- EqualsToken(Token::QUERY_OR, ""),
- EqualsToken(Token::REGULAR, "term2"))));
-
EXPECT_THAT(
- raw_query_tokenizer->TokenizeAll("(term1) OR term2"),
- IsOkAndHolds(ElementsAre(EqualsToken(Token::QUERY_LEFT_PARENTHESES, ""),
- EqualsToken(Token::REGULAR, "term1"),
- EqualsToken(Token::QUERY_RIGHT_PARENTHESES, ""),
- EqualsToken(Token::QUERY_OR, ""),
- EqualsToken(Token::REGULAR, "term2"))));
+ raw_query_tokenizer->TokenizeAll("term1 OR OR term2"),
+ IsOkAndHolds(ElementsAre(EqualsToken(Token::Type::REGULAR, "term1"),
+ EqualsToken(Token::Type::QUERY_OR, ""),
+ EqualsToken(Token::Type::REGULAR, "term2"))));
+
+ EXPECT_THAT(raw_query_tokenizer->TokenizeAll("(term1) OR term2"),
+ IsOkAndHolds(ElementsAre(
+ EqualsToken(Token::Type::QUERY_LEFT_PARENTHESES, ""),
+ EqualsToken(Token::Type::REGULAR, "term1"),
+ EqualsToken(Token::Type::QUERY_RIGHT_PARENTHESES, ""),
+ EqualsToken(Token::Type::QUERY_OR, ""),
+ EqualsToken(Token::Type::REGULAR, "term2"))));
EXPECT_THAT(raw_query_tokenizer->TokenizeAll("term1 OR (term2)"),
IsOkAndHolds(ElementsAre(
- EqualsToken(Token::REGULAR, "term1"),
- EqualsToken(Token::QUERY_OR, ""),
- EqualsToken(Token::QUERY_LEFT_PARENTHESES, ""),
- EqualsToken(Token::REGULAR, "term2"),
- EqualsToken(Token::QUERY_RIGHT_PARENTHESES, ""))));
+ EqualsToken(Token::Type::REGULAR, "term1"),
+ EqualsToken(Token::Type::QUERY_OR, ""),
+ EqualsToken(Token::Type::QUERY_LEFT_PARENTHESES, ""),
+ EqualsToken(Token::Type::REGULAR, "term2"),
+ EqualsToken(Token::Type::QUERY_RIGHT_PARENTHESES, ""))));
EXPECT_THAT(raw_query_tokenizer->TokenizeAll("((term1) OR (term2))"),
IsOkAndHolds(ElementsAre(
- EqualsToken(Token::QUERY_LEFT_PARENTHESES, ""),
- EqualsToken(Token::QUERY_LEFT_PARENTHESES, ""),
- EqualsToken(Token::REGULAR, "term1"),
- EqualsToken(Token::QUERY_RIGHT_PARENTHESES, ""),
- EqualsToken(Token::QUERY_OR, ""),
- EqualsToken(Token::QUERY_LEFT_PARENTHESES, ""),
- EqualsToken(Token::REGULAR, "term2"),
- EqualsToken(Token::QUERY_RIGHT_PARENTHESES, ""),
- EqualsToken(Token::QUERY_RIGHT_PARENTHESES, ""))));
+ EqualsToken(Token::Type::QUERY_LEFT_PARENTHESES, ""),
+ EqualsToken(Token::Type::QUERY_LEFT_PARENTHESES, ""),
+ EqualsToken(Token::Type::REGULAR, "term1"),
+ EqualsToken(Token::Type::QUERY_RIGHT_PARENTHESES, ""),
+ EqualsToken(Token::Type::QUERY_OR, ""),
+ EqualsToken(Token::Type::QUERY_LEFT_PARENTHESES, ""),
+ EqualsToken(Token::Type::REGULAR, "term2"),
+ EqualsToken(Token::Type::QUERY_RIGHT_PARENTHESES, ""),
+ EqualsToken(Token::Type::QUERY_RIGHT_PARENTHESES, ""))));
// Only "OR" (all in uppercase) is the operator
EXPECT_THAT(
raw_query_tokenizer->TokenizeAll("term1 or term2 Or term3 oR term4"),
- IsOkAndHolds(ElementsAre(EqualsToken(Token::REGULAR, "term1"),
- EqualsToken(Token::REGULAR, "or"),
- EqualsToken(Token::REGULAR, "term2"),
- EqualsToken(Token::REGULAR, "Or"),
- EqualsToken(Token::REGULAR, "term3"),
- EqualsToken(Token::REGULAR, "oR"),
- EqualsToken(Token::REGULAR, "term4"))));
+ IsOkAndHolds(ElementsAre(EqualsToken(Token::Type::REGULAR, "term1"),
+ EqualsToken(Token::Type::REGULAR, "or"),
+ EqualsToken(Token::Type::REGULAR, "term2"),
+ EqualsToken(Token::Type::REGULAR, "Or"),
+ EqualsToken(Token::Type::REGULAR, "term3"),
+ EqualsToken(Token::Type::REGULAR, "oR"),
+ EqualsToken(Token::Type::REGULAR, "term4"))));
// "OR" is ignored
- EXPECT_THAT(raw_query_tokenizer->TokenizeAll("OR term1"),
- IsOkAndHolds(ElementsAre(EqualsToken(Token::REGULAR, "term1"))));
+ EXPECT_THAT(
+ raw_query_tokenizer->TokenizeAll("OR term1"),
+ IsOkAndHolds(ElementsAre(EqualsToken(Token::Type::REGULAR, "term1"))));
// "OR" is ignored
- EXPECT_THAT(raw_query_tokenizer->TokenizeAll("term1 OR"),
- IsOkAndHolds(ElementsAre(EqualsToken(Token::REGULAR, "term1"))));
+ EXPECT_THAT(
+ raw_query_tokenizer->TokenizeAll("term1 OR"),
+ IsOkAndHolds(ElementsAre(EqualsToken(Token::Type::REGULAR, "term1"))));
// "OR" is ignored
EXPECT_THAT(raw_query_tokenizer->TokenizeAll("(OR term1)"),
IsOkAndHolds(ElementsAre(
- EqualsToken(Token::QUERY_LEFT_PARENTHESES, ""),
- EqualsToken(Token::REGULAR, "term1"),
- EqualsToken(Token::QUERY_RIGHT_PARENTHESES, ""))));
+ EqualsToken(Token::Type::QUERY_LEFT_PARENTHESES, ""),
+ EqualsToken(Token::Type::REGULAR, "term1"),
+ EqualsToken(Token::Type::QUERY_RIGHT_PARENTHESES, ""))));
// "OR" is ignored
EXPECT_THAT(raw_query_tokenizer->TokenizeAll("( OR term1)"),
IsOkAndHolds(ElementsAre(
- EqualsToken(Token::QUERY_LEFT_PARENTHESES, ""),
- EqualsToken(Token::REGULAR, "term1"),
- EqualsToken(Token::QUERY_RIGHT_PARENTHESES, ""))));
+ EqualsToken(Token::Type::QUERY_LEFT_PARENTHESES, ""),
+ EqualsToken(Token::Type::REGULAR, "term1"),
+ EqualsToken(Token::Type::QUERY_RIGHT_PARENTHESES, ""))));
// "OR" is ignored
EXPECT_THAT(raw_query_tokenizer->TokenizeAll("(term1 OR)"),
IsOkAndHolds(ElementsAre(
- EqualsToken(Token::QUERY_LEFT_PARENTHESES, ""),
- EqualsToken(Token::REGULAR, "term1"),
- EqualsToken(Token::QUERY_RIGHT_PARENTHESES, ""))));
+ EqualsToken(Token::Type::QUERY_LEFT_PARENTHESES, ""),
+ EqualsToken(Token::Type::REGULAR, "term1"),
+ EqualsToken(Token::Type::QUERY_RIGHT_PARENTHESES, ""))));
// "OR" is ignored
EXPECT_THAT(raw_query_tokenizer->TokenizeAll("(term1 OR )"),
IsOkAndHolds(ElementsAre(
- EqualsToken(Token::QUERY_LEFT_PARENTHESES, ""),
- EqualsToken(Token::REGULAR, "term1"),
- EqualsToken(Token::QUERY_RIGHT_PARENTHESES, ""))));
+ EqualsToken(Token::Type::QUERY_LEFT_PARENTHESES, ""),
+ EqualsToken(Token::Type::REGULAR, "term1"),
+ EqualsToken(Token::Type::QUERY_RIGHT_PARENTHESES, ""))));
// "OR" is ignored
EXPECT_THAT(raw_query_tokenizer->TokenizeAll("( OR )"),
IsOkAndHolds(ElementsAre(
- EqualsToken(Token::QUERY_LEFT_PARENTHESES, ""),
- EqualsToken(Token::QUERY_RIGHT_PARENTHESES, ""))));
+ EqualsToken(Token::Type::QUERY_LEFT_PARENTHESES, ""),
+ EqualsToken(Token::Type::QUERY_RIGHT_PARENTHESES, ""))));
EXPECT_THAT(raw_query_tokenizer->TokenizeAll("term1 OR(term2)"),
IsOkAndHolds(ElementsAre(
- EqualsToken(Token::REGULAR, "term1"),
- EqualsToken(Token::QUERY_OR, ""),
- EqualsToken(Token::QUERY_LEFT_PARENTHESES, ""),
- EqualsToken(Token::REGULAR, "term2"),
- EqualsToken(Token::QUERY_RIGHT_PARENTHESES, ""))));
+ EqualsToken(Token::Type::REGULAR, "term1"),
+ EqualsToken(Token::Type::QUERY_OR, ""),
+ EqualsToken(Token::Type::QUERY_LEFT_PARENTHESES, ""),
+ EqualsToken(Token::Type::REGULAR, "term2"),
+ EqualsToken(Token::Type::QUERY_RIGHT_PARENTHESES, ""))));
EXPECT_THAT(
raw_query_tokenizer->TokenizeAll("term1 OR-term2"),
@@ -472,31 +485,31 @@ TEST_F(RawQueryTokenizerTest, CJKT) {
if (IsCfStringTokenization()) {
EXPECT_THAT(
raw_query_tokenizer->TokenizeAll("-今天天气很好"),
- IsOkAndHolds(ElementsAre(EqualsToken(Token::QUERY_EXCLUSION, ""),
- EqualsToken(Token::REGULAR, "今天"),
- EqualsToken(Token::REGULAR, "天气"),
- EqualsToken(Token::REGULAR, "很"),
- EqualsToken(Token::REGULAR, "好"))));
+ IsOkAndHolds(ElementsAre(EqualsToken(Token::Type::QUERY_EXCLUSION, ""),
+ EqualsToken(Token::Type::REGULAR, "今天"),
+ EqualsToken(Token::Type::REGULAR, "天气"),
+ EqualsToken(Token::Type::REGULAR, "很"),
+ EqualsToken(Token::Type::REGULAR, "好"))));
} else {
EXPECT_THAT(
raw_query_tokenizer->TokenizeAll("-今天天气很好"),
- IsOkAndHolds(ElementsAre(EqualsToken(Token::QUERY_EXCLUSION, ""),
- EqualsToken(Token::REGULAR, "今天"),
- EqualsToken(Token::REGULAR, "天气"),
- EqualsToken(Token::REGULAR, "很好"))));
+ IsOkAndHolds(ElementsAre(EqualsToken(Token::Type::QUERY_EXCLUSION, ""),
+ EqualsToken(Token::Type::REGULAR, "今天"),
+ EqualsToken(Token::Type::REGULAR, "天气"),
+ EqualsToken(Token::Type::REGULAR, "很好"))));
}
if (IsCfStringTokenization()) {
EXPECT_THAT(raw_query_tokenizer->TokenizeAll("property1:你好"),
- IsOkAndHolds(
- ElementsAre(EqualsToken(Token::QUERY_PROPERTY, "property1"),
- EqualsToken(Token::REGULAR, "你"),
- EqualsToken(Token::REGULAR, "好"))));
+ IsOkAndHolds(ElementsAre(
+ EqualsToken(Token::Type::QUERY_PROPERTY, "property1"),
+ EqualsToken(Token::Type::REGULAR, "你"),
+ EqualsToken(Token::Type::REGULAR, "好"))));
} else {
EXPECT_THAT(raw_query_tokenizer->TokenizeAll("property1:你好"),
- IsOkAndHolds(
- ElementsAre(EqualsToken(Token::QUERY_PROPERTY, "property1"),
- EqualsToken(Token::REGULAR, "你好"))));
+ IsOkAndHolds(ElementsAre(
+ EqualsToken(Token::Type::QUERY_PROPERTY, "property1"),
+ EqualsToken(Token::Type::REGULAR, "你好"))));
}
EXPECT_THAT(
@@ -504,10 +517,11 @@ TEST_F(RawQueryTokenizerTest, CJKT) {
StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT,
HasSubstr("Characters in property name must all be ASCII")));
- EXPECT_THAT(raw_query_tokenizer->TokenizeAll("cat OR ねこ"),
- IsOkAndHolds(ElementsAre(EqualsToken(Token::REGULAR, "cat"),
- EqualsToken(Token::QUERY_OR, ""),
- EqualsToken(Token::REGULAR, "ねこ"))));
+ EXPECT_THAT(
+ raw_query_tokenizer->TokenizeAll("cat OR ねこ"),
+ IsOkAndHolds(ElementsAre(EqualsToken(Token::Type::REGULAR, "cat"),
+ EqualsToken(Token::Type::QUERY_OR, ""),
+ EqualsToken(Token::Type::REGULAR, "ねこ"))));
EXPECT_THAT(
raw_query_tokenizer->TokenizeAll("cat ORねこ"),
@@ -543,40 +557,45 @@ TEST_F(RawQueryTokenizerTest, OtherChars) {
language_segmenter.get()));
// Comma is ignored
- EXPECT_THAT(raw_query_tokenizer->TokenizeAll(",term1, ,"),
- IsOkAndHolds(ElementsAre(EqualsToken(Token::REGULAR, "term1"))));
+ EXPECT_THAT(
+ raw_query_tokenizer->TokenizeAll(",term1, ,"),
+ IsOkAndHolds(ElementsAre(EqualsToken(Token::Type::REGULAR, "term1"))));
EXPECT_THAT(raw_query_tokenizer->TokenizeAll("(,term1),"),
IsOkAndHolds(ElementsAre(
- EqualsToken(Token::QUERY_LEFT_PARENTHESES, ""),
- EqualsToken(Token::REGULAR, "term1"),
- EqualsToken(Token::QUERY_RIGHT_PARENTHESES, ""))));
+ EqualsToken(Token::Type::QUERY_LEFT_PARENTHESES, ""),
+ EqualsToken(Token::Type::REGULAR, "term1"),
+ EqualsToken(Token::Type::QUERY_RIGHT_PARENTHESES, ""))));
// Exclusion operator and comma are ignored
- EXPECT_THAT(raw_query_tokenizer->TokenizeAll("-,term1"),
- IsOkAndHolds(ElementsAre(EqualsToken(Token::REGULAR, "term1"))));
+ EXPECT_THAT(
+ raw_query_tokenizer->TokenizeAll("-,term1"),
+ IsOkAndHolds(ElementsAre(EqualsToken(Token::Type::REGULAR, "term1"))));
- EXPECT_THAT(raw_query_tokenizer->TokenizeAll("-term1,"),
- IsOkAndHolds(ElementsAre(EqualsToken(Token::QUERY_EXCLUSION, ""),
- EqualsToken(Token::REGULAR, "term1"))));
+ EXPECT_THAT(
+ raw_query_tokenizer->TokenizeAll("-term1,"),
+ IsOkAndHolds(ElementsAre(EqualsToken(Token::Type::QUERY_EXCLUSION, ""),
+ EqualsToken(Token::Type::REGULAR, "term1"))));
// Colon and comma are ignored
- EXPECT_THAT(raw_query_tokenizer->TokenizeAll("property1:,term1"),
- IsOkAndHolds(ElementsAre(EqualsToken(Token::REGULAR, "property1"),
- EqualsToken(Token::REGULAR, "term1"))));
-
EXPECT_THAT(
- raw_query_tokenizer->TokenizeAll("property1:term1,term2"),
- IsOkAndHolds(ElementsAre(EqualsToken(Token::QUERY_PROPERTY, "property1"),
- EqualsToken(Token::REGULAR, "term1"),
- EqualsToken(Token::REGULAR, "term2"))));
+ raw_query_tokenizer->TokenizeAll("property1:,term1"),
+ IsOkAndHolds(ElementsAre(EqualsToken(Token::Type::REGULAR, "property1"),
+ EqualsToken(Token::Type::REGULAR, "term1"))));
+
+ EXPECT_THAT(raw_query_tokenizer->TokenizeAll("property1:term1,term2"),
+ IsOkAndHolds(ElementsAre(
+ EqualsToken(Token::Type::QUERY_PROPERTY, "property1"),
+ EqualsToken(Token::Type::REGULAR, "term1"),
+ EqualsToken(Token::Type::REGULAR, "term2"))));
// This is a special case for OR, unknown chars are treated the same as
// whitespaces before and after OR.
- EXPECT_THAT(raw_query_tokenizer->TokenizeAll("term1,OR,term2"),
- IsOkAndHolds(ElementsAre(EqualsToken(Token::REGULAR, "term1"),
- EqualsToken(Token::QUERY_OR, ""),
- EqualsToken(Token::REGULAR, "term2"))));
+ EXPECT_THAT(
+ raw_query_tokenizer->TokenizeAll("term1,OR,term2"),
+ IsOkAndHolds(ElementsAre(EqualsToken(Token::Type::REGULAR, "term1"),
+ EqualsToken(Token::Type::QUERY_OR, ""),
+ EqualsToken(Token::Type::REGULAR, "term2"))));
}
TEST_F(RawQueryTokenizerTest, Mix) {
@@ -593,37 +612,38 @@ TEST_F(RawQueryTokenizerTest, Mix) {
EXPECT_THAT(raw_query_tokenizer->TokenizeAll(
"こんにちはgood afternoon, title:今天 OR (ในวันนี้ -B12)"),
IsOkAndHolds(ElementsAre(
- EqualsToken(Token::REGULAR, "こんにちは"),
- EqualsToken(Token::REGULAR, "good"),
- EqualsToken(Token::REGULAR, "afternoon"),
- EqualsToken(Token::QUERY_PROPERTY, "title"),
- EqualsToken(Token::REGULAR, "今天"),
- EqualsToken(Token::QUERY_OR, ""),
- EqualsToken(Token::QUERY_LEFT_PARENTHESES, ""),
- EqualsToken(Token::REGULAR, "ใน"),
- EqualsToken(Token::REGULAR, "วันนี้"),
- EqualsToken(Token::QUERY_EXCLUSION, ""),
- EqualsToken(Token::REGULAR, "B12"),
- EqualsToken(Token::QUERY_RIGHT_PARENTHESES, ""))));
+ EqualsToken(Token::Type::REGULAR, "こんにちは"),
+ EqualsToken(Token::Type::REGULAR, "good"),
+ EqualsToken(Token::Type::REGULAR, "afternoon"),
+ EqualsToken(Token::Type::QUERY_PROPERTY, "title"),
+ EqualsToken(Token::Type::REGULAR, "今天"),
+ EqualsToken(Token::Type::QUERY_OR, ""),
+ EqualsToken(Token::Type::QUERY_LEFT_PARENTHESES, ""),
+ EqualsToken(Token::Type::REGULAR, "ใน"),
+ EqualsToken(Token::Type::REGULAR, "วันนี้"),
+ EqualsToken(Token::Type::QUERY_EXCLUSION, ""),
+ EqualsToken(Token::Type::REGULAR, "B12"),
+ EqualsToken(Token::Type::QUERY_RIGHT_PARENTHESES, ""))));
} else {
ICING_ASSERT_OK_AND_ASSIGN(
std::vector<Token> tokens,
raw_query_tokenizer->TokenizeAll(
"こんにちはgood afternoon, title:今天 OR (ในวันนี้ -B12)"));
- EXPECT_THAT(tokens,
- ElementsAre(EqualsToken(Token::REGULAR, "こんにちは"),
- EqualsToken(Token::REGULAR, "good"),
- EqualsToken(Token::REGULAR, "afternoon"),
- EqualsToken(Token::QUERY_PROPERTY, "title"),
- EqualsToken(Token::REGULAR, "今天"),
- EqualsToken(Token::QUERY_OR, ""),
- EqualsToken(Token::QUERY_LEFT_PARENTHESES, ""),
- EqualsToken(Token::REGULAR, "ใน"),
- EqualsToken(Token::REGULAR, "วัน"),
- EqualsToken(Token::REGULAR, "นี้"),
- EqualsToken(Token::QUERY_EXCLUSION, ""),
- EqualsToken(Token::REGULAR, "B12"),
- EqualsToken(Token::QUERY_RIGHT_PARENTHESES, "")));
+ EXPECT_THAT(
+ tokens,
+ ElementsAre(EqualsToken(Token::Type::REGULAR, "こんにちは"),
+ EqualsToken(Token::Type::REGULAR, "good"),
+ EqualsToken(Token::Type::REGULAR, "afternoon"),
+ EqualsToken(Token::Type::QUERY_PROPERTY, "title"),
+ EqualsToken(Token::Type::REGULAR, "今天"),
+ EqualsToken(Token::Type::QUERY_OR, ""),
+ EqualsToken(Token::Type::QUERY_LEFT_PARENTHESES, ""),
+ EqualsToken(Token::Type::REGULAR, "ใน"),
+ EqualsToken(Token::Type::REGULAR, "วัน"),
+ EqualsToken(Token::Type::REGULAR, "นี้"),
+ EqualsToken(Token::Type::QUERY_EXCLUSION, ""),
+ EqualsToken(Token::Type::REGULAR, "B12"),
+ EqualsToken(Token::Type::QUERY_RIGHT_PARENTHESES, "")));
}
}
diff --git a/icing/tokenization/reverse_jni/reverse-jni-language-segmenter.cc b/icing/tokenization/reverse_jni/reverse-jni-language-segmenter.cc
index b936f2b..cb474c6 100644
--- a/icing/tokenization/reverse_jni/reverse-jni-language-segmenter.cc
+++ b/icing/tokenization/reverse_jni/reverse-jni-language-segmenter.cc
@@ -43,45 +43,46 @@ class ReverseJniLanguageSegmenterIterator : public LanguageSegmenter::Iterator {
// Advances to the next term. Returns false if it has reached the end.
bool Advance() override {
- // Prerequisite check
- if (IsDone()) {
- return false;
- }
+ while (true) {
+ // Prerequisite check
+ if (IsDone()) {
+ return false;
+ }
- if (term_end_exclusive_.utf16_index() == 0) {
- int first = break_iterator_->First();
- if (!term_start_.MoveToUtf16(first)) {
- // First is guaranteed to succeed and return a position within bonds. So
- // the only possible failure could be an invalid sequence. Mark as DONE
- // and return.
+ if (term_end_exclusive_.utf16_index() == 0) {
+ int first = break_iterator_->First();
+ if (!term_start_.MoveToUtf16(first)) {
+ // First is guaranteed to succeed and return a position within bounds.
+ // So the only possible failure could be an invalid sequence. Mark as
+ // DONE and return.
+ MarkAsDone();
+ return false;
+ }
+ } else {
+ term_start_ = term_end_exclusive_;
+ }
+
+ int next_utf16_index_exclusive = break_iterator_->Next();
+ // Reached the end
+ if (next_utf16_index_exclusive == ReverseJniBreakIterator::kDone) {
+ MarkAsDone();
+ return false;
+ }
+ if (!term_end_exclusive_.MoveToUtf16(next_utf16_index_exclusive)) {
+ // next_utf16_index_exclusive is guaranteed to be within bounds thanks to
+ // the check for kDone above. So the only possible failure could be an
+ // invalid sequence. Mark as DONE and return.
MarkAsDone();
return false;
}
- } else {
- term_start_ = term_end_exclusive_;
- }
-
- int next_utf16_index_exclusive = break_iterator_->Next();
- // Reached the end
- if (next_utf16_index_exclusive == ReverseJniBreakIterator::kDone) {
- MarkAsDone();
- return false;
- }
- if (!term_end_exclusive_.MoveToUtf16(next_utf16_index_exclusive)) {
- // next_utf16_index_exclusive is guaranteed to be within bonds thanks to
- // the check for kDone above. So the only possible failure could be an
- // invalid sequence. Mark as DONE and return.
- MarkAsDone();
- return false;
- }
- // Check if the current term is valid. We consider any term valid if its
- // first character is valid. If it's not valid, then we need to advance to
- // the next term.
- if (IsValidTerm()) {
- return true;
+ // Check if the current term is valid. We consider any term valid if its
+ // first character is valid. If it's not valid, then we need to advance to
+ // the next term.
+ if (IsValidTerm()) {
+ return true;
+ }
}
- return Advance();
}
// Returns the current term. It can be called only when Advance() returns
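
The Advance() rewrite above is the fuzz-test fix from the commit message: the old code skipped over invalid terms with a tail call (return Advance();), costing one stack frame per skipped term, so adversarial input could overflow the stack. The while (true) loop keeps the same control flow at constant stack depth. A minimal sketch of the transformation, where MoveToNextTerm() is a hypothetical stand-in for the First()/Next() bookkeeping in the real method:

bool Advance() {
  while (true) {
    // Prerequisite check, exactly as before.
    if (IsDone()) {
      return false;
    }
    // Hypothetical stand-in for the break_iterator_ positioning logic.
    if (!MoveToNextTerm()) {
      MarkAsDone();
      return false;
    }
    // A term is valid if its first character is valid. If not, loop around
    // and skip it; the recursive version paid a stack frame for this step.
    if (IsValidTerm()) {
      return true;
    }
  }
}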
diff --git a/icing/tokenization/token.h b/icing/tokenization/token.h
index b97ee0b..0c268be 100644
--- a/icing/tokenization/token.h
+++ b/icing/tokenization/token.h
@@ -21,7 +21,7 @@ namespace icing {
namespace lib {
struct Token {
- enum Type {
+ enum class Type {
// Common types
REGULAR, // A token without special meanings, the value of it will be
// indexed or searched directly
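
This enum-to-enum-class change in token.h is what drives the mechanical Token::REGULAR -> Token::Type::REGULAR updates across the test and tokenizer files above: scoped enumerators must be qualified with the enum's name, and they no longer convert implicitly to int. A small self-contained illustration (standard C++, nothing Icing-specific):

#include <cstdio>

struct Token {
  enum class Type { REGULAR, INVALID };
};

int main() {
  // Token::Type t = Token::REGULAR;     // error with enum class: the
  //                                     // enumerator must be qualified.
  Token::Type t = Token::Type::REGULAR;  // OK
  // int raw = t;                        // error: no implicit conversion.
  int raw = static_cast<int>(t);  // conversions must now be explicit
  std::printf("%d\n", raw);
  return 0;
}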
diff --git a/icing/tokenization/verbatim-tokenizer.cc b/icing/tokenization/verbatim-tokenizer.cc
index 173486a..dc54696 100644
--- a/icing/tokenization/verbatim-tokenizer.cc
+++ b/icing/tokenization/verbatim-tokenizer.cc
@@ -37,10 +37,10 @@ class VerbatimTokenIterator : public Tokenizer::Iterator {
Token GetToken() const override {
if (term_.empty() || !has_advanced_to_end_) {
- return Token(Token::INVALID);
+ return Token(Token::Type::INVALID);
}
- return Token(Token::VERBATIM, term_);
+ return Token(Token::Type::VERBATIM, term_);
}
libtextclassifier3::StatusOr<CharacterIterator> CalculateTokenStart()
diff --git a/icing/tokenization/verbatim-tokenizer_test.cc b/icing/tokenization/verbatim-tokenizer_test.cc
index b6bc5fa..e38c7aa 100644
--- a/icing/tokenization/verbatim-tokenizer_test.cc
+++ b/icing/tokenization/verbatim-tokenizer_test.cc
@@ -15,9 +15,9 @@
#include <string_view>
#include "gmock/gmock.h"
-#include "icing/helpers/icu/icu-data-file-helper.h"
#include "icing/portable/platform.h"
#include "icing/testing/common-matchers.h"
+#include "icing/testing/icu-data-file-helper.h"
#include "icing/testing/jni-test-helpers.h"
#include "icing/testing/test-data.h"
#include "icing/tokenization/language-segmenter-factory.h"
@@ -71,7 +71,7 @@ TEST_F(VerbatimTokenizerTest, Simple) {
EXPECT_THAT(
verbatim_tokenizer->TokenizeAll("foo bar"),
- IsOkAndHolds(ElementsAre(EqualsToken(Token::VERBATIM, "foo bar"))));
+ IsOkAndHolds(ElementsAre(EqualsToken(Token::Type::VERBATIM, "foo bar"))));
}
TEST_F(VerbatimTokenizerTest, Punctuation) {
@@ -80,9 +80,9 @@ TEST_F(VerbatimTokenizerTest, Punctuation) {
StringIndexingConfig::TokenizerType::VERBATIM,
language_segmenter_.get()));
- EXPECT_THAT(
- verbatim_tokenizer->TokenizeAll("Hello, world!"),
- IsOkAndHolds(ElementsAre(EqualsToken(Token::VERBATIM, "Hello, world!"))));
+ EXPECT_THAT(verbatim_tokenizer->TokenizeAll("Hello, world!"),
+ IsOkAndHolds(ElementsAre(
+ EqualsToken(Token::Type::VERBATIM, "Hello, world!"))));
}
TEST_F(VerbatimTokenizerTest, InvalidTokenBeforeAdvancing) {
@@ -95,7 +95,8 @@ TEST_F(VerbatimTokenizerTest, InvalidTokenBeforeAdvancing) {
auto token_iterator = verbatim_tokenizer->Tokenize(kText).ValueOrDie();
// We should get an invalid token if we get the token before advancing.
- EXPECT_THAT(token_iterator->GetToken(), EqualsToken(Token::INVALID, ""));
+ EXPECT_THAT(token_iterator->GetToken(),
+ EqualsToken(Token::Type::INVALID, ""));
}
TEST_F(VerbatimTokenizerTest, ResetToTokenEndingBefore) {
@@ -111,13 +112,13 @@ TEST_F(VerbatimTokenizerTest, ResetToTokenEndingBefore) {
// is larger than the final index (12) of the verbatim token.
EXPECT_TRUE(token_iterator->ResetToTokenEndingBefore(13));
EXPECT_THAT(token_iterator->GetToken(),
- EqualsToken(Token::VERBATIM, "Hello, world!"));
+ EqualsToken(Token::Type::VERBATIM, "Hello, world!"));
// Ensure our cached character iterator properly maintains the end of the
// verbatim token.
EXPECT_TRUE(token_iterator->ResetToTokenEndingBefore(13));
EXPECT_THAT(token_iterator->GetToken(),
- EqualsToken(Token::VERBATIM, "Hello, world!"));
+ EqualsToken(Token::Type::VERBATIM, "Hello, world!"));
// We should not be able to reset with an offset before or within
// the verbatim token's utf-32 length.
@@ -137,7 +138,7 @@ TEST_F(VerbatimTokenizerTest, ResetToTokenStartingAfter) {
// Get token without resetting
EXPECT_TRUE(token_iterator->Advance());
EXPECT_THAT(token_iterator->GetToken(),
- EqualsToken(Token::VERBATIM, "Hello, world!"));
+ EqualsToken(Token::Type::VERBATIM, "Hello, world!"));
// We expect a sole verbatim token, so it's not possible to reset after the
// start of the token.
@@ -147,7 +148,7 @@ TEST_F(VerbatimTokenizerTest, ResetToTokenStartingAfter) {
// negative.
EXPECT_TRUE(token_iterator->ResetToTokenStartingAfter(-1));
EXPECT_THAT(token_iterator->GetToken(),
- EqualsToken(Token::VERBATIM, "Hello, world!"));
+ EqualsToken(Token::Type::VERBATIM, "Hello, world!"));
}
TEST_F(VerbatimTokenizerTest, ResetToStart) {
@@ -162,12 +163,12 @@ TEST_F(VerbatimTokenizerTest, ResetToStart) {
// Get token without resetting
EXPECT_TRUE(token_iterator->Advance());
EXPECT_THAT(token_iterator->GetToken(),
- EqualsToken(Token::VERBATIM, "Hello, world!"));
+ EqualsToken(Token::Type::VERBATIM, "Hello, world!"));
// Retrieve token again after resetting to start
EXPECT_TRUE(token_iterator->ResetToStart());
EXPECT_THAT(token_iterator->GetToken(),
- EqualsToken(Token::VERBATIM, "Hello, world!"));
+ EqualsToken(Token::Type::VERBATIM, "Hello, world!"));
}
TEST_F(VerbatimTokenizerTest, CalculateTokenStart) {
diff --git a/icing/transform/icu/icu-normalizer_benchmark.cc b/icing/transform/icu/icu-normalizer_benchmark.cc
index 8d09be2..fdd4c70 100644
--- a/icing/transform/icu/icu-normalizer_benchmark.cc
+++ b/icing/transform/icu/icu-normalizer_benchmark.cc
@@ -14,8 +14,8 @@
#include "testing/base/public/benchmark.h"
#include "gmock/gmock.h"
-#include "icing/helpers/icu/icu-data-file-helper.h"
#include "icing/testing/common-matchers.h"
+#include "icing/testing/icu-data-file-helper.h"
#include "icing/testing/test-data.h"
#include "icing/transform/normalizer-factory.h"
#include "icing/transform/normalizer.h"
diff --git a/icing/transform/icu/icu-normalizer_test.cc b/icing/transform/icu/icu-normalizer_test.cc
index a46fcc7..143da17 100644
--- a/icing/transform/icu/icu-normalizer_test.cc
+++ b/icing/transform/icu/icu-normalizer_test.cc
@@ -16,8 +16,8 @@
#include "gmock/gmock.h"
#include "gtest/gtest.h"
-#include "icing/helpers/icu/icu-data-file-helper.h"
#include "icing/testing/common-matchers.h"
+#include "icing/testing/icu-data-file-helper.h"
#include "icing/testing/icu-i18n-test-utils.h"
#include "icing/testing/test-data.h"
#include "icing/transform/normalizer-factory.h"
diff --git a/proto/icing/proto/debug.proto b/proto/icing/proto/debug.proto
new file mode 100644
index 0000000..3f07539
--- /dev/null
+++ b/proto/icing/proto/debug.proto
@@ -0,0 +1,112 @@
+// Copyright 2022 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+syntax = "proto2";
+
+package icing.lib;
+
+import "icing/proto/status.proto";
+import "icing/proto/storage.proto";
+
+option java_package = "com.google.android.icing.proto";
+option java_multiple_files = true;
+option objc_class_prefix = "ICNG";
+
+// Next tag: 4
+message IndexDebugInfoProto {
+ // Storage information of the index.
+ optional IndexStorageInfoProto index_storage_info = 1;
+
+ message MainIndexDebugInfoProto {
+ // Information about the main lexicon.
+ // TODO(b/222349894) Convert the string output to a protocol buffer instead.
+ optional string lexicon_info = 1;
+
+ // Last added document id.
+ optional uint32 last_added_document_id = 2;
+
+ // If verbosity > 0, return information about the posting list storage.
+ // TODO(b/222349894) Convert the string output to a protocol buffer instead.
+ optional string flash_index_storage_info = 3;
+ }
+ optional MainIndexDebugInfoProto main_index_info = 2;
+
+ message LiteIndexDebugInfoProto {
+ // Current number of hits.
+ optional uint32 curr_size = 1;
+
+ // The maximum possible number of hits.
+ optional uint32 hit_buffer_size = 2;
+
+ // Last added document id.
+ optional uint32 last_added_document_id = 3;
+
+ // The first position in the hit buffer that is not sorted yet,
+ // or curr_size if all hits are sorted.
+ optional uint32 searchable_end = 4;
+
+ // The most recent checksum of the lite index, by calling
+ // LiteIndex::ComputeChecksum().
+ optional uint32 index_crc = 5;
+
+ // Information about the lite lexicon.
+ // TODO(b/222349894) Convert the string output to a protocol buffer instead.
+ optional string lexicon_info = 6;
+ }
+ optional LiteIndexDebugInfoProto lite_index_info = 3;
+}
+
+// Next tag: 4
+message DocumentDebugInfoProto {
+ // Storage information of the document store.
+ optional DocumentStorageInfoProto document_storage_info = 1;
+
+ // The most recent checksum of the document store, by calling
+ // DocumentStore::ComputeChecksum().
+ optional uint32 crc = 2;
+
+ message CorpusInfo {
+ optional string namespace = 1;
+ optional string schema = 2;
+ optional uint32 total_documents = 3;
+ optional uint32 total_token = 4;
+ }
+
+ // If verbosity > 0, return the total number of documents and tokens in each
+ // (namespace, schema type) pair.
+ // Note that deleted and expired documents are skipped in the output.
+ repeated CorpusInfo corpus_info = 3;
+}
+
+// Next tag: 3
+message DebugInfoProto {
+ // Debug information of the index.
+ optional IndexDebugInfoProto index_info = 1;
+
+ // Debug information of the document store.
+ optional DocumentDebugInfoProto document_info = 2;
+}
+
+// Next tag: 3
+message DebugInfoResultProto {
+ // Status code can be one of:
+ // OK
+ // FAILED_PRECONDITION
+ //
+ // See status.proto for more details.
+ optional StatusProto status = 1;
+
+ // Debug information for Icing.
+ optional DebugInfoProto debug_info = 2;
+}
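
To show how the new messages fit together: the sketch below fills the lite-index section of a DebugInfoProto. The field names come straight from the file above; the accessor style is standard protoc-generated C++, and the include path is an assumption, not something this patch defines.

#include "icing/proto/debug.pb.h"  // assumed path for the generated header

// Hypothetical helper: populate the lite-index section of a DebugInfoProto.
void FillLiteIndexDebugInfo(icing::lib::DebugInfoProto* debug_info) {
  auto* lite = debug_info->mutable_index_info()->mutable_lite_index_info();
  lite->set_curr_size(128);         // current number of hits
  lite->set_hit_buffer_size(4096);  // maximum possible number of hits
  lite->set_last_added_document_id(41);
  lite->set_searchable_end(128);    // == curr_size: every hit is sorted
}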
diff --git a/proto/icing/proto/document.proto b/proto/icing/proto/document.proto
index 2e8321b..1a501e7 100644
--- a/proto/icing/proto/document.proto
+++ b/proto/icing/proto/document.proto
@@ -209,7 +209,7 @@ message DeleteBySchemaTypeResultProto {
}
// Result of a call to IcingSearchEngine.DeleteByQuery
-// Next tag: 4
+// Next tag: 5
message DeleteByQueryResultProto {
// Status code can be one of:
// OK
@@ -226,5 +226,18 @@ message DeleteByQueryResultProto {
// Stats for delete execution performance.
optional DeleteByQueryStatsProto delete_by_query_stats = 3;
+ // Used by DeleteByQueryResultProto to return information about deleted
+ // documents.
+ message DocumentGroupInfo {
+ optional string namespace = 1;
+ optional string schema = 2;
+ repeated string uris = 3;
+ }
+
+ // Additional return message that shows the uris of the deleted documents, if
+ // users set return_deleted_document_info to true.
+ // The result is grouped by the corresponding namespace and type.
+ repeated DocumentGroupInfo deleted_documents = 4;
+
reserved 2;
}
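
The new repeated deleted_documents field lets a caller see exactly which documents a DeleteByQuery removed, grouped by (namespace, schema type). A hedged consumer sketch, assuming standard protoc C++ codegen; note that protoc appends an underscore to accessors that collide with C++ keywords, so the namespace field is read via namespace_():

#include <cstdio>
#include <string>

#include "icing/proto/document.pb.h"  // assumed path for the generated header

// Hypothetical consumer: log every URI removed by a DeleteByQuery call.
void LogDeletedDocuments(const icing::lib::DeleteByQueryResultProto& result) {
  for (const auto& group : result.deleted_documents()) {
    for (const std::string& uri : group.uris()) {
      std::printf("deleted %s/%s/%s\n", group.namespace_().c_str(),
                  group.schema().c_str(), uri.c_str());
    }
  }
}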
diff --git a/proto/icing/proto/logging.proto b/proto/icing/proto/logging.proto
index 2f1f271..0a7c4a6 100644
--- a/proto/icing/proto/logging.proto
+++ b/proto/icing/proto/logging.proto
@@ -46,6 +46,9 @@ message InitializeStatsProto {
// Random I/O errors.
IO_ERROR = 4;
+
+ // The document log is using the legacy format.
+ LEGACY_DOCUMENT_LOG_FORMAT = 5;
}
// Possible recovery causes for document store:
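
The new LEGACY_DOCUMENT_LOG_FORMAT value lets clients tell a planned one-time migration of the old document-log format apart from genuine corruption or I/O failure. A sketch of how a caller might branch on it; the document_store_recovery_cause() accessor is a guess for illustration and is not shown anywhere in this patch:

// Hypothetical: assumes InitializeStatsProto exposes the recovery cause
// through a document_store_recovery_cause() accessor.
bool IsExpectedRecovery(const icing::lib::InitializeStatsProto& stats) {
  return stats.document_store_recovery_cause() ==
         icing::lib::InitializeStatsProto::LEGACY_DOCUMENT_LOG_FORMAT;
}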
diff --git a/synced_AOSP_CL_number.txt b/synced_AOSP_CL_number.txt
index 26469ab..83c7099 100644
--- a/synced_AOSP_CL_number.txt
+++ b/synced_AOSP_CL_number.txt
@@ -1 +1 @@
-set(synced_AOSP_CL_number=417867838)
+set(synced_AOSP_CL_number=435140515)