aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorTim Barron <tjbarron@google.com>2023-03-06 17:26:23 -0800
committerTim Barron <tjbarron@google.com>2023-03-06 17:26:23 -0800
commit53032446fec369125a6dc47c9f66435e4a62410b (patch)
tree779ba434530b8af387bc1e2f9f37aca52e298b1d
parent49064c458678781fbf3db256751658728dc87740 (diff)
downloadicing-53032446fec369125a6dc47c9f66435e4a62410b.tar.gz
Update Icing from upstream.
Descriptions:
======================================================================
Refactor IndexProcessor
======================================================================
Rename Joinable Cache as Joinable Index
======================================================================
Implement Optimize and Clear for QualifiedIdTypeJoinableCache
======================================================================
Add JoinablePropertyMetadata reverse lookup
======================================================================
Allow code creating LanguageSegmenter::Iterators to declare AccessType
======================================================================
Further codifies the escape behavior in the parser test
======================================================================
Bug: 263890397
Bug: 268680462
Bug: 270102295
Change-Id: I3233733b40e985e11c4a6d75c1528cd6a72c1173
-rw-r--r--icing/icing-search-engine.cc58
-rw-r--r--icing/icing-search-engine.h7
-rw-r--r--icing/index/data-indexing-handler.h (renamed from icing/index/section-indexing-handler.h)25
-rw-r--r--icing/index/index-processor.cc29
-rw-r--r--icing/index/index-processor.h43
-rw-r--r--icing/index/index-processor_benchmark.cc73
-rw-r--r--icing/index/index-processor_test.cc94
-rw-r--r--icing/index/integer-section-indexing-handler.cc16
-rw-r--r--icing/index/integer-section-indexing-handler.h27
-rw-r--r--icing/index/numeric/integer-index.h2
-rw-r--r--icing/index/string-section-indexing-handler.cc14
-rw-r--r--icing/index/string-section-indexing-handler.h32
-rw-r--r--icing/join/qualified-id-type-joinable-cache_test.cc496
-rw-r--r--icing/join/qualified-id-type-joinable-index.cc (renamed from icing/join/qualified-id-type-joinable-cache.cc)174
-rw-r--r--icing/join/qualified-id-type-joinable-index.h (renamed from icing/join/qualified-id-type-joinable-cache.h)106
-rw-r--r--icing/join/qualified-id-type-joinable-index_test.cc739
-rw-r--r--icing/query/advanced_query_parser/lexer_test.cc13
-rw-r--r--icing/query/advanced_query_parser/query-visitor.cc6
-rw-r--r--icing/result/snippet-retriever.cc5
-rw-r--r--icing/schema/joinable-property-manager.cc51
-rw-r--r--icing/schema/joinable-property-manager.h42
-rw-r--r--icing/schema/joinable-property-manager_test.cc88
-rw-r--r--icing/schema/schema-store.cc16
-rw-r--r--icing/schema/schema-store.h35
-rw-r--r--icing/schema/schema-store_test.cc5
-rw-r--r--icing/tokenization/icu/icu-language-segmenter.cc5
-rw-r--r--icing/tokenization/icu/icu-language-segmenter.h2
-rw-r--r--icing/tokenization/icu/icu-language-segmenter_test.cc182
-rw-r--r--icing/tokenization/language-segmenter-iterator_test.cc51
-rw-r--r--icing/tokenization/language-segmenter.h7
-rw-r--r--icing/tokenization/language-segmenter_benchmark.cc15
-rw-r--r--icing/tokenization/plain-tokenizer.cc10
-rw-r--r--icing/tokenization/plain-tokenizer.h3
-rw-r--r--icing/tokenization/plain-tokenizer_test.cc33
-rw-r--r--icing/tokenization/raw-query-tokenizer.cc3
-rw-r--r--icing/tokenization/raw-query-tokenizer.h2
-rw-r--r--icing/tokenization/raw-query-tokenizer_test.cc7
-rw-r--r--icing/tokenization/reverse_jni/reverse-jni-break-iterator.cc21
-rw-r--r--icing/tokenization/reverse_jni/reverse-jni-break-iterator.h14
-rw-r--r--icing/tokenization/reverse_jni/reverse-jni-language-segmenter.cc18
-rw-r--r--icing/tokenization/reverse_jni/reverse-jni-language-segmenter.h3
-rw-r--r--icing/tokenization/reverse_jni/reverse-jni-language-segmenter_test.cc184
-rw-r--r--icing/tokenization/rfc822-tokenizer.cc7
-rw-r--r--icing/tokenization/rfc822-tokenizer.h3
-rw-r--r--icing/tokenization/rfc822-tokenizer_test.cc16
-rw-r--r--icing/tokenization/tokenizer.h21
-rw-r--r--icing/tokenization/verbatim-tokenizer.cc7
-rw-r--r--icing/tokenization/verbatim-tokenizer.h3
-rw-r--r--icing/tokenization/verbatim-tokenizer_test.cc31
-rw-r--r--icing/util/tokenized-document.cc6
-rw-r--r--synced_AOSP_CL_number.txt2
51 files changed, 1918 insertions, 934 deletions
diff --git a/icing/icing-search-engine.cc b/icing/icing-search-engine.cc
index 5321d42..1b193af 100644
--- a/icing/icing-search-engine.cc
+++ b/icing/icing-search-engine.cc
@@ -32,11 +32,14 @@
#include "icing/file/destructible-file.h"
#include "icing/file/file-backed-proto.h"
#include "icing/file/filesystem.h"
+#include "icing/index/data-indexing-handler.h"
#include "icing/index/hit/doc-hit-info.h"
#include "icing/index/index-processor.h"
#include "icing/index/index.h"
+#include "icing/index/integer-section-indexing-handler.h"
#include "icing/index/iterator/doc-hit-info-iterator.h"
#include "icing/index/numeric/integer-index.h"
+#include "icing/index/string-section-indexing-handler.h"
#include "icing/join/join-processor.h"
#include "icing/legacy/index/icing-filesystem.h"
#include "icing/portable/endian.h"
@@ -982,16 +985,15 @@ PutResultProto IcingSearchEngine::Put(DocumentProto&& document) {
}
DocumentId document_id = document_id_or.ValueOrDie();
- auto index_processor_or = IndexProcessor::Create(
- normalizer_.get(), index_.get(), integer_index_.get(), clock_.get());
- if (!index_processor_or.ok()) {
- TransformStatus(index_processor_or.status(), result_status);
+ auto data_indexing_handlers_or = CreateDataIndexingHandlers();
+ if (!data_indexing_handlers_or.ok()) {
+ TransformStatus(data_indexing_handlers_or.status(), result_status);
return result_proto;
}
- std::unique_ptr<IndexProcessor> index_processor =
- std::move(index_processor_or).ValueOrDie();
+ IndexProcessor index_processor(
+ std::move(data_indexing_handlers_or).ValueOrDie(), clock_.get());
- auto index_status = index_processor->IndexDocument(
+ auto index_status = index_processor.IndexDocument(
tokenized_document, document_id, put_document_stats);
// Getting an internal error from the index could possibly mean that the index
// is broken. Try to rebuild the index to recover.
@@ -2119,19 +2121,18 @@ IcingSearchEngine::RestoreIndexIfNeeded() {
return {libtextclassifier3::Status::OK, false, false};
}
+ auto data_indexing_handlers_or = CreateDataIndexingHandlers();
+ if (!data_indexing_handlers_or.ok()) {
+ return {data_indexing_handlers_or.status(),
+ truncate_result.index_needed_restoration,
+ truncate_result.integer_index_needed_restoration};
+ }
// By using recovery_mode for IndexProcessor, we're able to replay documents
// starting from a smaller document id, and it will skip documents that have
// already been indexed.
- auto index_processor_or = IndexProcessor::Create(
- normalizer_.get(), index_.get(), integer_index_.get(), clock_.get(),
+ IndexProcessor index_processor(
+ std::move(data_indexing_handlers_or).ValueOrDie(), clock_.get(),
/*recovery_mode=*/true);
- if (!index_processor_or.ok()) {
- return {index_processor_or.status(),
- truncate_result.index_needed_restoration,
- truncate_result.integer_index_needed_restoration};
- }
- std::unique_ptr<IndexProcessor> index_processor =
- std::move(index_processor_or).ValueOrDie();
ICING_VLOG(1) << "Restoring index by replaying documents from document id "
<< truncate_result.first_document_to_reindex
@@ -2168,7 +2169,7 @@ IcingSearchEngine::RestoreIndexIfNeeded() {
std::move(tokenized_document_or).ValueOrDie());
libtextclassifier3::Status status =
- index_processor->IndexDocument(tokenized_document, document_id);
+ index_processor.IndexDocument(tokenized_document, document_id);
if (!status.ok()) {
if (!absl_ports::IsDataLoss(status)) {
// Real error. Stop recovering and pass it up.
@@ -2209,6 +2210,29 @@ libtextclassifier3::StatusOr<bool> IcingSearchEngine::LostPreviousSchema() {
return document_store_->last_added_document_id() != kInvalidDocumentId;
}
+libtextclassifier3::StatusOr<std::vector<std::unique_ptr<DataIndexingHandler>>>
+IcingSearchEngine::CreateDataIndexingHandlers() {
+ std::vector<std::unique_ptr<DataIndexingHandler>> handlers;
+
+ // Term index handler
+ ICING_ASSIGN_OR_RETURN(std::unique_ptr<StringSectionIndexingHandler>
+ string_section_indexing_handler,
+ StringSectionIndexingHandler::Create(
+ clock_.get(), normalizer_.get(), index_.get()));
+ handlers.push_back(std::move(string_section_indexing_handler));
+
+ // Integer index handler
+ ICING_ASSIGN_OR_RETURN(std::unique_ptr<IntegerSectionIndexingHandler>
+ integer_section_indexing_handler,
+ IntegerSectionIndexingHandler::Create(
+ clock_.get(), integer_index_.get()));
+ handlers.push_back(std::move(integer_section_indexing_handler));
+
+ // TODO(b/263890397): add QualifiedIdJoinablePropertyIndexingHandler
+
+ return handlers;
+}
+
libtextclassifier3::StatusOr<IcingSearchEngine::TruncateIndexResult>
IcingSearchEngine::TruncateIndicesTo(DocumentId last_stored_document_id) {
// Attempt to truncate term index.
diff --git a/icing/icing-search-engine.h b/icing/icing-search-engine.h
index da447d5..678fc77 100644
--- a/icing/icing-search-engine.h
+++ b/icing/icing-search-engine.h
@@ -26,6 +26,7 @@
#include "icing/absl_ports/mutex.h"
#include "icing/absl_ports/thread_annotations.h"
#include "icing/file/filesystem.h"
+#include "icing/index/data-indexing-handler.h"
#include "icing/index/index.h"
#include "icing/index/numeric/numeric-index.h"
#include "icing/jni/jni-cache.h"
@@ -672,6 +673,12 @@ class IcingSearchEngine {
libtextclassifier3::StatusOr<bool> LostPreviousSchema()
ICING_EXCLUSIVE_LOCKS_REQUIRED(mutex_);
+ // Helper method to create all types of data indexing handlers. Currently
+ // this covers term and integer indexing; the handler for joinable qualified
+ // ids is not created yet (TODO(b/263890397)).
+ libtextclassifier3::StatusOr<
+ std::vector<std::unique_ptr<DataIndexingHandler>>>
+ CreateDataIndexingHandlers() ICING_EXCLUSIVE_LOCKS_REQUIRED(mutex_);
+
// Helper method to discard parts of (term, integer) indices if they contain
// data for document ids greater than last_stored_document_id.
//
diff --git a/icing/index/section-indexing-handler.h b/icing/index/data-indexing-handler.h
index 98efc8f..0061b79 100644
--- a/icing/index/section-indexing-handler.h
+++ b/icing/index/data-indexing-handler.h
@@ -12,8 +12,8 @@
// See the License for the specific language governing permissions and
// limitations under the License.
-#ifndef ICING_INDEX_SECTION_INDEXING_HANDLER_H_
-#define ICING_INDEX_SECTION_INDEXING_HANDLER_H_
+#ifndef ICING_INDEX_DATA_INDEXING_HANDLER_H_
+#define ICING_INDEX_DATA_INDEXING_HANDLER_H_
#include "icing/text_classifier/lib3/utils/base/status.h"
#include "icing/proto/logging.pb.h"
@@ -24,24 +24,23 @@
namespace icing {
namespace lib {
-// Parent class for indexing different types of sections in TokenizedDocument.
-class SectionIndexingHandler {
+// Parent class for indexing different types of data in TokenizedDocument.
+class DataIndexingHandler {
public:
- explicit SectionIndexingHandler(const Clock* clock) : clock_(*clock) {}
+ explicit DataIndexingHandler(const Clock* clock) : clock_(*clock) {}
- virtual ~SectionIndexingHandler() = default;
+ virtual ~DataIndexingHandler() = default;
- // Handles the indexing process: add data (hits) into the specific type index
- // (e.g. term index, integer index) for all contents in the corresponding type
- // of sections in tokenized_document.
+ // Handles the indexing process: add data into the specific type index (e.g.
+ // term index, integer index, qualified id type joinable index) for all
+ // contents in the corresponding type of data in tokenized_document.
// For example, IntegerSectionIndexingHandler::Handle should add data into
// integer index for all contents in tokenized_document.integer_sections.
//
// Also it should handle last added DocumentId properly (based on
// recovery_mode) to avoid adding previously indexed documents.
//
- // tokenized_document: document object with different types of tokenized
- // sections.
+ // tokenized_document: document object with different types of tokenized data.
// document_id: id of the document.
// recovery_mode: decides how to handle document_id <=
// last_added_document_id. If in recovery_mode, then
@@ -60,10 +59,10 @@ class SectionIndexingHandler {
bool recovery_mode, PutDocumentStatsProto* put_document_stats) = 0;
protected:
- const Clock& clock_;
+ const Clock& clock_; // Does not own.
};
} // namespace lib
} // namespace icing
-#endif // ICING_INDEX_SECTION_INDEXING_HANDLER_H_
+#endif // ICING_INDEX_DATA_INDEXING_HANDLER_H_
diff --git a/icing/index/index-processor.cc b/icing/index/index-processor.cc
index 86a0826..34988f5 100644
--- a/icing/index/index-processor.cc
+++ b/icing/index/index-processor.cc
@@ -21,44 +21,21 @@
#include <vector>
#include "icing/text_classifier/lib3/utils/base/status.h"
-#include "icing/index/index.h"
-#include "icing/index/integer-section-indexing-handler.h"
-#include "icing/index/numeric/numeric-index.h"
-#include "icing/index/string-section-indexing-handler.h"
+#include "icing/index/data-indexing-handler.h"
#include "icing/proto/logging.pb.h"
#include "icing/store/document-id.h"
-#include "icing/transform/normalizer.h"
#include "icing/util/status-macros.h"
#include "icing/util/tokenized-document.h"
namespace icing {
namespace lib {
-libtextclassifier3::StatusOr<std::unique_ptr<IndexProcessor>>
-IndexProcessor::Create(const Normalizer* normalizer, Index* index,
- NumericIndex<int64_t>* integer_index, const Clock* clock,
- bool recovery_mode) {
- ICING_RETURN_ERROR_IF_NULL(normalizer);
- ICING_RETURN_ERROR_IF_NULL(index);
- ICING_RETURN_ERROR_IF_NULL(integer_index);
- ICING_RETURN_ERROR_IF_NULL(clock);
-
- std::vector<std::unique_ptr<SectionIndexingHandler>> handlers;
- handlers.push_back(
- std::make_unique<StringSectionIndexingHandler>(clock, normalizer, index));
- handlers.push_back(
- std::make_unique<IntegerSectionIndexingHandler>(clock, integer_index));
-
- return std::unique_ptr<IndexProcessor>(
- new IndexProcessor(std::move(handlers), clock, recovery_mode));
-}
-
libtextclassifier3::Status IndexProcessor::IndexDocument(
const TokenizedDocument& tokenized_document, DocumentId document_id,
PutDocumentStatsProto* put_document_stats) {
// TODO(b/259744228): set overall index latency.
- for (auto& section_indexing_handler : section_indexing_handlers_) {
- ICING_RETURN_IF_ERROR(section_indexing_handler->Handle(
+ for (auto& data_indexing_handler : data_indexing_handlers_) {
+ ICING_RETURN_IF_ERROR(data_indexing_handler->Handle(
tokenized_document, document_id, recovery_mode_, put_document_stats));
}
diff --git a/icing/index/index-processor.h b/icing/index/index-processor.h
index 3d6b19a..9b96f00 100644
--- a/icing/index/index-processor.h
+++ b/icing/index/index-processor.h
@@ -20,12 +20,9 @@
#include <vector>
#include "icing/text_classifier/lib3/utils/base/status.h"
-#include "icing/index/index.h"
-#include "icing/index/numeric/numeric-index.h"
-#include "icing/index/section-indexing-handler.h"
+#include "icing/index/data-indexing-handler.h"
#include "icing/proto/logging.pb.h"
#include "icing/store/document-id.h"
-#include "icing/transform/normalizer.h"
#include "icing/util/tokenized-document.h"
namespace icing {
@@ -33,24 +30,12 @@ namespace lib {
class IndexProcessor {
public:
- // Factory function to create an IndexProcessor which does not take ownership
- // of any input components, and all pointers must refer to valid objects that
- // outlive the created IndexProcessor instance.
- //
- // - recovery_mode: a flag indicates that if IndexProcessor is used to restore
- // index. Since there are several indices (term, integer) being restored at
- // the same time, we start with the minimum last added DocumentId of all
- // indices and replay documents to re-index, so it is possible to get some
- // previously indexed documents in the recovery mode. Therefore, we should
- // skip them without returning an error in recovery mode.
- //
- // Returns:
- // An IndexProcessor on success
- // FAILED_PRECONDITION if any of the pointers is null.
- static libtextclassifier3::StatusOr<std::unique_ptr<IndexProcessor>> Create(
- const Normalizer* normalizer, Index* index,
- NumericIndex<int64_t>* integer_index_, const Clock* clock,
- bool recovery_mode = false);
+ explicit IndexProcessor(std::vector<std::unique_ptr<DataIndexingHandler>>&&
+ data_indexing_handlers,
+ const Clock* clock, bool recovery_mode = false)
+ : data_indexing_handlers_(std::move(data_indexing_handlers)),
+ clock_(*clock),
+ recovery_mode_(recovery_mode) {}
// Add tokenized document to the index, associated with document_id. If the
// number of tokens in the document exceeds max_tokens_per_document, then only
@@ -65,22 +50,14 @@ class IndexProcessor {
//
// Returns:
// - OK on success.
- // - Any SectionIndexingHandler errors.
+ // - Any DataIndexingHandler errors.
libtextclassifier3::Status IndexDocument(
const TokenizedDocument& tokenized_document, DocumentId document_id,
PutDocumentStatsProto* put_document_stats = nullptr);
private:
- explicit IndexProcessor(std::vector<std::unique_ptr<SectionIndexingHandler>>&&
- section_indexing_handlers,
- const Clock* clock, bool recovery_mode)
- : section_indexing_handlers_(std::move(section_indexing_handlers)),
- clock_(*clock),
- recovery_mode_(recovery_mode) {}
-
- std::vector<std::unique_ptr<SectionIndexingHandler>>
- section_indexing_handlers_;
- const Clock& clock_;
+ std::vector<std::unique_ptr<DataIndexingHandler>> data_indexing_handlers_;
+ const Clock& clock_; // Does not own.
bool recovery_mode_;
};
diff --git a/icing/index/index-processor_benchmark.cc b/icing/index/index-processor_benchmark.cc
index 6608e44..ee43364 100644
--- a/icing/index/index-processor_benchmark.cc
+++ b/icing/index/index-processor_benchmark.cc
@@ -12,14 +12,22 @@
// See the License for the specific language governing permissions and
// limitations under the License.
+#include <memory>
+#include <utility>
+#include <vector>
+
+#include "icing/text_classifier/lib3/utils/base/statusor.h"
#include "testing/base/public/benchmark.h"
#include "gmock/gmock.h"
#include "icing/document-builder.h"
#include "icing/file/filesystem.h"
+#include "icing/index/data-indexing-handler.h"
#include "icing/index/index-processor.h"
#include "icing/index/index.h"
-#include "icing/index/numeric/dummy-numeric-index.h"
+#include "icing/index/integer-section-indexing-handler.h"
+#include "icing/index/numeric/integer-index.h"
#include "icing/index/numeric/numeric-index.h"
+#include "icing/index/string-section-indexing-handler.h"
#include "icing/legacy/core/icing-string-util.h"
#include "icing/schema/schema-store.h"
#include "icing/schema/schema-util.h"
@@ -173,6 +181,24 @@ std::unique_ptr<SchemaStore> CreateSchemaStore(const Filesystem& filesystem,
return schema_store;
}
+libtextclassifier3::StatusOr<std::vector<std::unique_ptr<DataIndexingHandler>>>
+CreateDataIndexingHandlers(const Clock* clock, const Normalizer* normalizer,
+ Index* index, NumericIndex<int64_t>* integer_index) {
+ ICING_ASSIGN_OR_RETURN(
+ std::unique_ptr<StringSectionIndexingHandler>
+ string_section_indexing_handler,
+ StringSectionIndexingHandler::Create(clock, normalizer, index));
+ ICING_ASSIGN_OR_RETURN(
+ std::unique_ptr<IntegerSectionIndexingHandler>
+ integer_section_indexing_handler,
+ IntegerSectionIndexingHandler::Create(clock, integer_index));
+
+ std::vector<std::unique_ptr<DataIndexingHandler>> handlers;
+ handlers.push_back(std::move(string_section_indexing_handler));
+ handlers.push_back(std::move(integer_section_indexing_handler));
+ return handlers;
+}
+
void CleanUp(const Filesystem& filesystem, const std::string& base_dir) {
filesystem.DeleteDirectoryRecursively(base_dir.c_str());
}
@@ -198,7 +224,7 @@ void BM_IndexDocumentWithOneProperty(benchmark::State& state) {
CreateIndex(icing_filesystem, filesystem, index_dir);
ICING_ASSERT_OK_AND_ASSIGN(
std::unique_ptr<NumericIndex<int64_t>> integer_index,
- DummyNumericIndex<int64_t>::Create(filesystem, integer_index_dir));
+ IntegerIndex::Create(filesystem, integer_index_dir));
language_segmenter_factory::SegmenterOptions options(ULOC_US);
std::unique_ptr<LanguageSegmenter> language_segmenter =
language_segmenter_factory::Create(std::move(options)).ValueOrDie();
@@ -206,10 +232,14 @@ void BM_IndexDocumentWithOneProperty(benchmark::State& state) {
Clock clock;
std::unique_ptr<SchemaStore> schema_store =
CreateSchemaStore(filesystem, &clock, base_dir);
+
ICING_ASSERT_OK_AND_ASSIGN(
- std::unique_ptr<IndexProcessor> index_processor,
- IndexProcessor::Create(normalizer.get(), index.get(), integer_index.get(),
- &clock));
+ std::vector<std::unique_ptr<DataIndexingHandler>> handlers,
+ CreateDataIndexingHandlers(&clock, normalizer.get(), index.get(),
+ integer_index.get()));
+ auto index_processor =
+ std::make_unique<IndexProcessor>(std::move(handlers), &clock);
+
DocumentProto input_document = CreateDocumentWithOneProperty(state.range(0));
TokenizedDocument tokenized_document(std::move(
TokenizedDocument::Create(schema_store.get(), language_segmenter.get(),
@@ -268,7 +298,7 @@ void BM_IndexDocumentWithTenProperties(benchmark::State& state) {
CreateIndex(icing_filesystem, filesystem, index_dir);
ICING_ASSERT_OK_AND_ASSIGN(
std::unique_ptr<NumericIndex<int64_t>> integer_index,
- DummyNumericIndex<int64_t>::Create(filesystem, integer_index_dir));
+ IntegerIndex::Create(filesystem, integer_index_dir));
language_segmenter_factory::SegmenterOptions options(ULOC_US);
std::unique_ptr<LanguageSegmenter> language_segmenter =
language_segmenter_factory::Create(std::move(options)).ValueOrDie();
@@ -276,10 +306,13 @@ void BM_IndexDocumentWithTenProperties(benchmark::State& state) {
Clock clock;
std::unique_ptr<SchemaStore> schema_store =
CreateSchemaStore(filesystem, &clock, base_dir);
+
ICING_ASSERT_OK_AND_ASSIGN(
- std::unique_ptr<IndexProcessor> index_processor,
- IndexProcessor::Create(normalizer.get(), index.get(), integer_index.get(),
- &clock));
+ std::vector<std::unique_ptr<DataIndexingHandler>> handlers,
+ CreateDataIndexingHandlers(&clock, normalizer.get(), index.get(),
+ integer_index.get()));
+ auto index_processor =
+ std::make_unique<IndexProcessor>(std::move(handlers), &clock);
DocumentProto input_document =
CreateDocumentWithTenProperties(state.range(0));
@@ -340,7 +373,7 @@ void BM_IndexDocumentWithDiacriticLetters(benchmark::State& state) {
CreateIndex(icing_filesystem, filesystem, index_dir);
ICING_ASSERT_OK_AND_ASSIGN(
std::unique_ptr<NumericIndex<int64_t>> integer_index,
- DummyNumericIndex<int64_t>::Create(filesystem, integer_index_dir));
+ IntegerIndex::Create(filesystem, integer_index_dir));
language_segmenter_factory::SegmenterOptions options(ULOC_US);
std::unique_ptr<LanguageSegmenter> language_segmenter =
language_segmenter_factory::Create(std::move(options)).ValueOrDie();
@@ -348,10 +381,13 @@ void BM_IndexDocumentWithDiacriticLetters(benchmark::State& state) {
Clock clock;
std::unique_ptr<SchemaStore> schema_store =
CreateSchemaStore(filesystem, &clock, base_dir);
+
ICING_ASSERT_OK_AND_ASSIGN(
- std::unique_ptr<IndexProcessor> index_processor,
- IndexProcessor::Create(normalizer.get(), index.get(), integer_index.get(),
- &clock));
+ std::vector<std::unique_ptr<DataIndexingHandler>> handlers,
+ CreateDataIndexingHandlers(&clock, normalizer.get(), index.get(),
+ integer_index.get()));
+ auto index_processor =
+ std::make_unique<IndexProcessor>(std::move(handlers), &clock);
DocumentProto input_document =
CreateDocumentWithDiacriticLetters(state.range(0));
@@ -412,7 +448,7 @@ void BM_IndexDocumentWithHiragana(benchmark::State& state) {
CreateIndex(icing_filesystem, filesystem, index_dir);
ICING_ASSERT_OK_AND_ASSIGN(
std::unique_ptr<NumericIndex<int64_t>> integer_index,
- DummyNumericIndex<int64_t>::Create(filesystem, integer_index_dir));
+ IntegerIndex::Create(filesystem, integer_index_dir));
language_segmenter_factory::SegmenterOptions options(ULOC_US);
std::unique_ptr<LanguageSegmenter> language_segmenter =
language_segmenter_factory::Create(std::move(options)).ValueOrDie();
@@ -420,10 +456,13 @@ void BM_IndexDocumentWithHiragana(benchmark::State& state) {
Clock clock;
std::unique_ptr<SchemaStore> schema_store =
CreateSchemaStore(filesystem, &clock, base_dir);
+
ICING_ASSERT_OK_AND_ASSIGN(
- std::unique_ptr<IndexProcessor> index_processor,
- IndexProcessor::Create(normalizer.get(), index.get(), integer_index.get(),
- &clock));
+ std::vector<std::unique_ptr<DataIndexingHandler>> handlers,
+ CreateDataIndexingHandlers(&clock, normalizer.get(), index.get(),
+ integer_index.get()));
+ auto index_processor =
+ std::make_unique<IndexProcessor>(std::move(handlers), &clock);
DocumentProto input_document = CreateDocumentWithHiragana(state.range(0));
TokenizedDocument tokenized_document(std::move(
diff --git a/icing/index/index-processor_test.cc b/icing/index/index-processor_test.cc
index c22e8f0..3a9b4ee 100644
--- a/icing/index/index-processor_test.cc
+++ b/icing/index/index-processor_test.cc
@@ -30,12 +30,15 @@
#include "icing/absl_ports/str_join.h"
#include "icing/document-builder.h"
#include "icing/file/filesystem.h"
+#include "icing/index/data-indexing-handler.h"
#include "icing/index/hit/doc-hit-info.h"
#include "icing/index/index.h"
+#include "icing/index/integer-section-indexing-handler.h"
#include "icing/index/iterator/doc-hit-info-iterator-test-util.h"
#include "icing/index/iterator/doc-hit-info-iterator.h"
#include "icing/index/numeric/integer-index.h"
#include "icing/index/numeric/numeric-index.h"
+#include "icing/index/string-section-indexing-handler.h"
#include "icing/index/term-property-id.h"
#include "icing/legacy/index/icing-filesystem.h"
#include "icing/legacy/index/icing-mock-filesystem.h"
@@ -258,9 +261,21 @@ class IndexProcessorTest : public Test {
ICING_ASSERT_OK(schema_store_->SetSchema(schema));
ICING_ASSERT_OK_AND_ASSIGN(
- index_processor_,
- IndexProcessor::Create(normalizer_.get(), index_.get(),
- integer_index_.get(), &fake_clock_));
+ std::unique_ptr<StringSectionIndexingHandler>
+ string_section_indexing_handler,
+ StringSectionIndexingHandler::Create(&fake_clock_, normalizer_.get(),
+ index_.get()));
+ ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<IntegerSectionIndexingHandler>
+ integer_section_indexing_handler,
+ IntegerSectionIndexingHandler::Create(
+ &fake_clock_, integer_index_.get()));
+ std::vector<std::unique_ptr<DataIndexingHandler>> handlers;
+ handlers.push_back(std::move(string_section_indexing_handler));
+ handlers.push_back(std::move(integer_section_indexing_handler));
+
+ index_processor_ =
+ std::make_unique<IndexProcessor>(std::move(handlers), &fake_clock_);
+
mock_icing_filesystem_ = std::make_unique<IcingMockFilesystem>();
}
@@ -290,6 +305,7 @@ class IndexProcessorTest : public Test {
std::unique_ptr<LanguageSegmenter> lang_segmenter_;
std::unique_ptr<Normalizer> normalizer_;
std::unique_ptr<SchemaStore> schema_store_;
+
std::unique_ptr<IndexProcessor> index_processor_;
};
@@ -315,16 +331,6 @@ std::vector<DocHitInfoTermFrequencyPair> GetHitsWithTermFrequency(
return infos;
}
-TEST_F(IndexProcessorTest, CreationWithNullPointerShouldFail) {
- EXPECT_THAT(IndexProcessor::Create(/*normalizer=*/nullptr, index_.get(),
- integer_index_.get(), &fake_clock_),
- StatusIs(libtextclassifier3::StatusCode::FAILED_PRECONDITION));
-
- EXPECT_THAT(IndexProcessor::Create(normalizer_.get(), /*index=*/nullptr,
- integer_index_.get(), &fake_clock_),
- StatusIs(libtextclassifier3::StatusCode::FAILED_PRECONDITION));
-}
-
TEST_F(IndexProcessorTest, NoTermMatchTypeContent) {
DocumentProto document =
DocumentBuilder()
@@ -584,10 +590,15 @@ TEST_F(IndexProcessorTest, TooLongTokens) {
normalizer_factory::Create(
/*max_term_byte_size=*/4));
- ICING_ASSERT_OK_AND_ASSIGN(
- index_processor_,
- IndexProcessor::Create(normalizer.get(), index_.get(),
- integer_index_.get(), &fake_clock_));
+ ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<StringSectionIndexingHandler>
+ string_section_indexing_handler,
+ StringSectionIndexingHandler::Create(
+ &fake_clock_, normalizer.get(), index_.get()));
+ std::vector<std::unique_ptr<DataIndexingHandler>> handlers;
+ handlers.push_back(std::move(string_section_indexing_handler));
+
+ index_processor_ =
+ std::make_unique<IndexProcessor>(std::move(handlers), &fake_clock_);
DocumentProto document =
DocumentBuilder()
@@ -769,10 +780,20 @@ TEST_F(IndexProcessorTest, OutOfOrderDocumentIds) {
TEST_F(IndexProcessorTest, OutOfOrderDocumentIdsInRecoveryMode) {
ICING_ASSERT_OK_AND_ASSIGN(
- std::unique_ptr<IndexProcessor> index_processor,
- IndexProcessor::Create(normalizer_.get(), index_.get(),
- integer_index_.get(), &fake_clock_,
- /*recovery_mode=*/true));
+ std::unique_ptr<StringSectionIndexingHandler>
+ string_section_indexing_handler,
+ StringSectionIndexingHandler::Create(&fake_clock_, normalizer_.get(),
+ index_.get()));
+ ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<IntegerSectionIndexingHandler>
+ integer_section_indexing_handler,
+ IntegerSectionIndexingHandler::Create(
+ &fake_clock_, integer_index_.get()));
+ std::vector<std::unique_ptr<DataIndexingHandler>> handlers;
+ handlers.push_back(std::move(string_section_indexing_handler));
+ handlers.push_back(std::move(integer_section_indexing_handler));
+
+ IndexProcessor index_processor(std::move(handlers), &fake_clock_,
+ /*recovery_mode=*/true);
DocumentProto document =
DocumentBuilder()
@@ -785,7 +806,7 @@ TEST_F(IndexProcessorTest, OutOfOrderDocumentIdsInRecoveryMode) {
TokenizedDocument tokenized_document,
TokenizedDocument::Create(schema_store_.get(), lang_segmenter_.get(),
document));
- EXPECT_THAT(index_processor->IndexDocument(tokenized_document, kDocumentId1),
+ EXPECT_THAT(index_processor.IndexDocument(tokenized_document, kDocumentId1),
IsOk());
EXPECT_THAT(index_->last_added_document_id(), Eq(kDocumentId1));
@@ -808,7 +829,7 @@ TEST_F(IndexProcessorTest, OutOfOrderDocumentIdsInRecoveryMode) {
tokenized_document,
TokenizedDocument::Create(schema_store_.get(), lang_segmenter_.get(),
document));
- EXPECT_THAT(index_processor->IndexDocument(tokenized_document, kDocumentId0),
+ EXPECT_THAT(index_processor.IndexDocument(tokenized_document, kDocumentId0),
IsOk());
// Verify that both index_ and integer_index_ are unchanged.
EXPECT_THAT(index_->last_added_document_id(), Eq(kDocumentId1));
@@ -818,7 +839,7 @@ TEST_F(IndexProcessorTest, OutOfOrderDocumentIdsInRecoveryMode) {
IsOkAndHolds(integer_index_crc));
// As should indexing a document with document_id ==
// last_added_document_id.
- EXPECT_THAT(index_processor->IndexDocument(tokenized_document, kDocumentId1),
+ EXPECT_THAT(index_processor.IndexDocument(tokenized_document, kDocumentId1),
IsOk());
// Verify that both index_ and integer_index_ are unchanged.
EXPECT_THAT(index_->last_added_document_id(), Eq(kDocumentId1));
@@ -907,9 +928,16 @@ TEST_F(IndexProcessorTest, IndexingDocAutomaticMerge) {
index_, Index::Create(options, &filesystem_, &icing_filesystem_));
ICING_ASSERT_OK_AND_ASSIGN(
- index_processor_,
- IndexProcessor::Create(normalizer_.get(), index_.get(),
- integer_index_.get(), &fake_clock_));
+ std::unique_ptr<StringSectionIndexingHandler>
+ string_section_indexing_handler,
+ StringSectionIndexingHandler::Create(&fake_clock_, normalizer_.get(),
+ index_.get()));
+ std::vector<std::unique_ptr<DataIndexingHandler>> handlers;
+ handlers.push_back(std::move(string_section_indexing_handler));
+
+ index_processor_ =
+ std::make_unique<IndexProcessor>(std::move(handlers), &fake_clock_);
+
DocumentId doc_id = 0;
// Have determined experimentally that indexing 3373 documents with this text
// will cause the LiteIndex to fill up. Further indexing will fail unless the
@@ -964,9 +992,15 @@ TEST_F(IndexProcessorTest, IndexingDocMergeFailureResets) {
Index::Create(options, &filesystem_, mock_icing_filesystem_.get()));
ICING_ASSERT_OK_AND_ASSIGN(
- index_processor_,
- IndexProcessor::Create(normalizer_.get(), index_.get(),
- integer_index_.get(), &fake_clock_));
+ std::unique_ptr<StringSectionIndexingHandler>
+ string_section_indexing_handler,
+ StringSectionIndexingHandler::Create(&fake_clock_, normalizer_.get(),
+ index_.get()));
+ std::vector<std::unique_ptr<DataIndexingHandler>> handlers;
+ handlers.push_back(std::move(string_section_indexing_handler));
+
+ index_processor_ =
+ std::make_unique<IndexProcessor>(std::move(handlers), &fake_clock_);
// 3. Index one document. This should fit in the LiteIndex without requiring a
// merge.
diff --git a/icing/index/integer-section-indexing-handler.cc b/icing/index/integer-section-indexing-handler.cc
index 0ed01d3..d201a1a 100644
--- a/icing/index/integer-section-indexing-handler.cc
+++ b/icing/index/integer-section-indexing-handler.cc
@@ -14,8 +14,11 @@
#include "icing/index/integer-section-indexing-handler.h"
+#include <cstdint>
+#include <memory>
+
#include "icing/text_classifier/lib3/utils/base/status.h"
-#include "icing/schema/section-manager.h"
+#include "icing/text_classifier/lib3/utils/base/statusor.h"
#include "icing/schema/section.h"
#include "icing/store/document-id.h"
#include "icing/util/logging.h"
@@ -24,6 +27,17 @@
namespace icing {
namespace lib {
// Factory for IntegerSectionIndexingHandler. Returns an error status when either
// input pointer is null; otherwise returns a handler that borrows (does not
// own) the clock and the integer index.
+/* static */ libtextclassifier3::StatusOr<
+ std::unique_ptr<IntegerSectionIndexingHandler>>
+IntegerSectionIndexingHandler::Create(const Clock* clock,
+ NumericIndex<int64_t>* integer_index) {
+ ICING_RETURN_ERROR_IF_NULL(clock);
+ ICING_RETURN_ERROR_IF_NULL(integer_index);
+
// The constructor is declared private in the header, so the instance is built
// with a direct `new` and wrapped in a unique_ptr immediately.
+ return std::unique_ptr<IntegerSectionIndexingHandler>(
+ new IntegerSectionIndexingHandler(clock, integer_index));
+}
+
libtextclassifier3::Status IntegerSectionIndexingHandler::Handle(
const TokenizedDocument& tokenized_document, DocumentId document_id,
bool recovery_mode, PutDocumentStatsProto* put_document_stats) {
diff --git a/icing/index/integer-section-indexing-handler.h b/icing/index/integer-section-indexing-handler.h
index d75815c..42ce07e 100644
--- a/icing/index/integer-section-indexing-handler.h
+++ b/icing/index/integer-section-indexing-handler.h
@@ -15,9 +15,13 @@
#ifndef ICING_INDEX_INTEGER_SECTION_INDEXING_HANDLER_H_
#define ICING_INDEX_INTEGER_SECTION_INDEXING_HANDLER_H_
+#include <cstdint>
+#include <memory>
+
#include "icing/text_classifier/lib3/utils/base/status.h"
+#include "icing/text_classifier/lib3/utils/base/statusor.h"
+#include "icing/index/data-indexing-handler.h"
#include "icing/index/numeric/numeric-index.h"
-#include "icing/index/section-indexing-handler.h"
#include "icing/store/document-id.h"
#include "icing/util/clock.h"
#include "icing/util/tokenized-document.h"
@@ -25,11 +29,18 @@
namespace icing {
namespace lib {
-class IntegerSectionIndexingHandler : public SectionIndexingHandler {
+class IntegerSectionIndexingHandler : public DataIndexingHandler {
public:
- explicit IntegerSectionIndexingHandler(const Clock* clock,
- NumericIndex<int64_t>* integer_index)
- : SectionIndexingHandler(clock), integer_index_(*integer_index) {}
+ // Creates an IntegerSectionIndexingHandler instance which does not take
+ // ownership of any input components. All pointers must refer to valid objects
+ // that outlive the created IntegerSectionIndexingHandler instance.
+ //
+ // Returns:
+ // - An IntegerSectionIndexingHandler instance on success
+ // - FAILED_PRECONDITION_ERROR if any of the input pointers is null
+ static libtextclassifier3::StatusOr<
+ std::unique_ptr<IntegerSectionIndexingHandler>>
+ Create(const Clock* clock, NumericIndex<int64_t>* integer_index);
~IntegerSectionIndexingHandler() override = default;
@@ -46,7 +57,11 @@ class IntegerSectionIndexingHandler : public SectionIndexingHandler {
bool recovery_mode, PutDocumentStatsProto* put_document_stats) override;
private:
- NumericIndex<int64_t>& integer_index_;
+ explicit IntegerSectionIndexingHandler(const Clock* clock,
+ NumericIndex<int64_t>* integer_index)
+ : DataIndexingHandler(clock), integer_index_(*integer_index) {}
+
+ NumericIndex<int64_t>& integer_index_; // Does not own.
};
} // namespace lib
diff --git a/icing/index/numeric/integer-index.h b/icing/index/numeric/integer-index.h
index 98c26ef..050a143 100644
--- a/icing/index/numeric/integer-index.h
+++ b/icing/index/numeric/integer-index.h
@@ -216,7 +216,7 @@ class IntegerIndex : public NumericIndex<int64_t> {
// Returns:
// - OK on success
// - INTERNAL_ERROR on I/O error. This could potentially leave the storages
- // in an invalid state and the caller should handle it property (e.g.
+ // in an invalid state and the caller should handle it properly (e.g.
// discard and rebuild)
libtextclassifier3::Status TransferIndex(
const std::vector<DocumentId>& document_id_old_to_new,
diff --git a/icing/index/string-section-indexing-handler.cc b/icing/index/string-section-indexing-handler.cc
index 7cd0909..83a2687 100644
--- a/icing/index/string-section-indexing-handler.cc
+++ b/icing/index/string-section-indexing-handler.cc
@@ -20,6 +20,7 @@
#include <string_view>
#include "icing/text_classifier/lib3/utils/base/status.h"
+#include "icing/text_classifier/lib3/utils/base/statusor.h"
#include "icing/absl_ports/canonical_errors.h"
#include "icing/index/index.h"
#include "icing/legacy/core/icing-string-util.h"
@@ -34,6 +35,19 @@
namespace icing {
namespace lib {
// Factory for StringSectionIndexingHandler. Returns an error status when any of
// the three input pointers is null; otherwise returns a handler that borrows
// (does not own) the clock, normalizer and index.
+/* static */ libtextclassifier3::StatusOr<
+ std::unique_ptr<StringSectionIndexingHandler>>
+StringSectionIndexingHandler::Create(const Clock* clock,
+ const Normalizer* normalizer,
+ Index* index) {
+ ICING_RETURN_ERROR_IF_NULL(clock);
+ ICING_RETURN_ERROR_IF_NULL(normalizer);
+ ICING_RETURN_ERROR_IF_NULL(index);
+
// The constructor is declared private in the header, so the instance is built
// with a direct `new` and wrapped in a unique_ptr immediately.
+ return std::unique_ptr<StringSectionIndexingHandler>(
+ new StringSectionIndexingHandler(clock, normalizer, index));
+}
+
libtextclassifier3::Status StringSectionIndexingHandler::Handle(
const TokenizedDocument& tokenized_document, DocumentId document_id,
bool recovery_mode, PutDocumentStatsProto* put_document_stats) {
diff --git a/icing/index/string-section-indexing-handler.h b/icing/index/string-section-indexing-handler.h
index 36f6a05..6abfba5 100644
--- a/icing/index/string-section-indexing-handler.h
+++ b/icing/index/string-section-indexing-handler.h
@@ -15,9 +15,12 @@
#ifndef ICING_INDEX_STRING_SECTION_INDEXING_HANDLER_H_
#define ICING_INDEX_STRING_SECTION_INDEXING_HANDLER_H_
+#include <memory>
+
#include "icing/text_classifier/lib3/utils/base/status.h"
+#include "icing/text_classifier/lib3/utils/base/statusor.h"
+#include "icing/index/data-indexing-handler.h"
#include "icing/index/index.h"
-#include "icing/index/section-indexing-handler.h"
#include "icing/proto/logging.pb.h"
#include "icing/store/document-id.h"
#include "icing/transform/normalizer.h"
@@ -27,14 +30,18 @@
namespace icing {
namespace lib {
-class StringSectionIndexingHandler : public SectionIndexingHandler {
+class StringSectionIndexingHandler : public DataIndexingHandler {
public:
- explicit StringSectionIndexingHandler(const Clock* clock,
- const Normalizer* normalizer,
- Index* index)
- : SectionIndexingHandler(clock),
- normalizer_(*normalizer),
- index_(*index) {}
+ // Creates a StringSectionIndexingHandler instance which does not take
+ // ownership of any input components. All pointers must refer to valid objects
+ // that outlive the created StringSectionIndexingHandler instance.
+ //
+ // Returns:
+ // - A StringSectionIndexingHandler instance on success
+ // - FAILED_PRECONDITION_ERROR if any of the input pointers is null
+ static libtextclassifier3::StatusOr<
+ std::unique_ptr<StringSectionIndexingHandler>>
+ Create(const Clock* clock, const Normalizer* normalizer, Index* index);
~StringSectionIndexingHandler() override = default;
@@ -57,8 +64,13 @@ class StringSectionIndexingHandler : public SectionIndexingHandler {
bool recovery_mode, PutDocumentStatsProto* put_document_stats) override;
private:
- const Normalizer& normalizer_;
- Index& index_;
+ explicit StringSectionIndexingHandler(const Clock* clock,
+ const Normalizer* normalizer,
+ Index* index)
+ : DataIndexingHandler(clock), normalizer_(*normalizer), index_(*index) {}
+
+ const Normalizer& normalizer_; // Does not own.
+ Index& index_; // Does not own.
};
} // namespace lib
diff --git a/icing/join/qualified-id-type-joinable-cache_test.cc b/icing/join/qualified-id-type-joinable-cache_test.cc
deleted file mode 100644
index 088c878..0000000
--- a/icing/join/qualified-id-type-joinable-cache_test.cc
+++ /dev/null
@@ -1,496 +0,0 @@
-// Copyright (C) 2023 Google LLC
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "icing/join/qualified-id-type-joinable-cache.h"
-
-#include <memory>
-#include <string>
-
-#include "icing/text_classifier/lib3/utils/base/status.h"
-#include "gmock/gmock.h"
-#include "gtest/gtest.h"
-#include "icing/file/filesystem.h"
-#include "icing/file/persistent-storage.h"
-#include "icing/join/doc-join-info.h"
-#include "icing/store/document-id.h"
-#include "icing/store/persistent-hash-map-key-mapper.h"
-#include "icing/testing/common-matchers.h"
-#include "icing/testing/tmp-directory.h"
-#include "icing/util/crc32.h"
-
-namespace icing {
-namespace lib {
-
-namespace {
-
-using ::testing::Eq;
-using ::testing::HasSubstr;
-using ::testing::IsTrue;
-using ::testing::Ne;
-using ::testing::Not;
-
-using Crcs = PersistentStorage::Crcs;
-using Info = QualifiedIdTypeJoinableCache::Info;
-
-static constexpr int32_t kCorruptedValueOffset = 3;
-
-class QualifiedIdTypeJoinableCacheTest : public ::testing::Test {
- protected:
- void SetUp() override {
- base_dir_ = GetTestTempDir() + "/icing";
- ASSERT_THAT(filesystem_.CreateDirectoryRecursively(base_dir_.c_str()),
- IsTrue());
-
- working_path_ = base_dir_ + "/qualified_id_type_joinable_cache_test";
- }
-
- void TearDown() override {
- filesystem_.DeleteDirectoryRecursively(base_dir_.c_str());
- }
-
- Filesystem filesystem_;
- std::string base_dir_;
- std::string working_path_;
-};
-
-TEST_F(QualifiedIdTypeJoinableCacheTest, InvalidWorkingPath) {
- EXPECT_THAT(
- QualifiedIdTypeJoinableCache::Create(
- filesystem_, "/dev/null/qualified_id_type_joinable_cache_test"),
- StatusIs(libtextclassifier3::StatusCode::INTERNAL));
-}
-
-TEST_F(QualifiedIdTypeJoinableCacheTest, InitializeNewFiles) {
- {
- // Create new qualified id type joinable cache
- ASSERT_FALSE(filesystem_.DirectoryExists(working_path_.c_str()));
- ICING_ASSERT_OK_AND_ASSIGN(
- std::unique_ptr<QualifiedIdTypeJoinableCache> cache,
- QualifiedIdTypeJoinableCache::Create(filesystem_, working_path_));
-
- ICING_ASSERT_OK(cache->PersistToDisk());
- }
-
- // Metadata file should be initialized correctly for both info and crcs
- // sections.
- const std::string metadata_file_path = absl_ports::StrCat(
- working_path_, "/", QualifiedIdTypeJoinableCache::kFilePrefix, ".m");
- auto metadata_buffer = std::make_unique<uint8_t[]>(
- QualifiedIdTypeJoinableCache::kMetadataFileSize);
- ASSERT_THAT(
- filesystem_.PRead(metadata_file_path.c_str(), metadata_buffer.get(),
- QualifiedIdTypeJoinableCache::kMetadataFileSize,
- /*offset=*/0),
- IsTrue());
-
- // Check info section
- const Info* info = reinterpret_cast<const Info*>(
- metadata_buffer.get() +
- QualifiedIdTypeJoinableCache::kInfoMetadataBufferOffset);
- EXPECT_THAT(info->magic, Eq(Info::kMagic));
- EXPECT_THAT(info->last_added_document_id, Eq(kInvalidDocumentId));
-
- // Check crcs section
- const Crcs* crcs = reinterpret_cast<const Crcs*>(
- metadata_buffer.get() +
- QualifiedIdTypeJoinableCache::kCrcsMetadataBufferOffset);
- // There are some initial info in KeyMapper, so storages_crc should be
- // non-zero.
- EXPECT_THAT(crcs->component_crcs.storages_crc, Ne(0));
- EXPECT_THAT(crcs->component_crcs.info_crc,
- Eq(Crc32(std::string_view(reinterpret_cast<const char*>(info),
- sizeof(Info)))
- .Get()));
- EXPECT_THAT(crcs->all_crc,
- Eq(Crc32(std::string_view(
- reinterpret_cast<const char*>(&crcs->component_crcs),
- sizeof(Crcs::ComponentCrcs)))
- .Get()));
-}
-
-TEST_F(QualifiedIdTypeJoinableCacheTest,
- InitializationShouldFailWithoutPersistToDiskOrDestruction) {
- // Create new qualified id type joinable cache
- ICING_ASSERT_OK_AND_ASSIGN(
- std::unique_ptr<QualifiedIdTypeJoinableCache> cache,
- QualifiedIdTypeJoinableCache::Create(filesystem_, working_path_));
-
- // Insert some data.
- ICING_ASSERT_OK(
- cache->Put(DocJoinInfo(/*document_id=*/1, /*joinable_property_id=*/20),
- /*ref_document_id=*/0));
- ICING_ASSERT_OK(
- cache->Put(DocJoinInfo(/*document_id=*/3, /*joinable_property_id=*/20),
- /*ref_document_id=*/2));
- ICING_ASSERT_OK(
- cache->Put(DocJoinInfo(/*document_id=*/5, /*joinable_property_id=*/20),
- /*ref_document_id=*/4));
-
- // Without calling PersistToDisk, checksums will not be recomputed or synced
- // to disk, so initializing another instance on the same files should fail.
- EXPECT_THAT(QualifiedIdTypeJoinableCache::Create(filesystem_, working_path_),
- StatusIs(libtextclassifier3::StatusCode::FAILED_PRECONDITION));
-}
-
-TEST_F(QualifiedIdTypeJoinableCacheTest,
- InitializationShouldSucceedWithPersistToDisk) {
- // Create new qualified id type joinable cache
- ICING_ASSERT_OK_AND_ASSIGN(
- std::unique_ptr<QualifiedIdTypeJoinableCache> cache1,
- QualifiedIdTypeJoinableCache::Create(filesystem_, working_path_));
-
- // Insert some data.
- ICING_ASSERT_OK(
- cache1->Put(DocJoinInfo(/*document_id=*/1, /*joinable_property_id=*/20),
- /*ref_document_id=*/0));
- ICING_ASSERT_OK(
- cache1->Put(DocJoinInfo(/*document_id=*/3, /*joinable_property_id=*/20),
- /*ref_document_id=*/2));
- ICING_ASSERT_OK(
- cache1->Put(DocJoinInfo(/*document_id=*/5, /*joinable_property_id=*/20),
- /*ref_document_id=*/4));
-
- // After calling PersistToDisk, all checksums should be recomputed and synced
- // correctly to disk, so initializing another instance on the same files
- // should succeed, and we should be able to get the same contents.
- ICING_EXPECT_OK(cache1->PersistToDisk());
-
- ICING_ASSERT_OK_AND_ASSIGN(
- std::unique_ptr<QualifiedIdTypeJoinableCache> cache2,
- QualifiedIdTypeJoinableCache::Create(filesystem_, working_path_));
- EXPECT_THAT(
- cache2->Get(DocJoinInfo(/*document_id=*/1, /*joinable_property_id=*/20)),
- IsOkAndHolds(0));
- EXPECT_THAT(
- cache2->Get(DocJoinInfo(/*document_id=*/3, /*joinable_property_id=*/20)),
- IsOkAndHolds(2));
- EXPECT_THAT(
- cache2->Get(DocJoinInfo(/*document_id=*/5, /*joinable_property_id=*/20)),
- IsOkAndHolds(4));
-}
-
-TEST_F(QualifiedIdTypeJoinableCacheTest,
- InitializationShouldSucceedAfterDestruction) {
- {
- // Create new qualified id type joinable cache
- ICING_ASSERT_OK_AND_ASSIGN(
- std::unique_ptr<QualifiedIdTypeJoinableCache> cache,
- QualifiedIdTypeJoinableCache::Create(filesystem_, working_path_));
-
- // Insert some data.
- ICING_ASSERT_OK(
- cache->Put(DocJoinInfo(/*document_id=*/1, /*joinable_property_id=*/20),
- /*ref_document_id=*/0));
- ICING_ASSERT_OK(
- cache->Put(DocJoinInfo(/*document_id=*/3, /*joinable_property_id=*/20),
- /*ref_document_id=*/2));
- ICING_ASSERT_OK(
- cache->Put(DocJoinInfo(/*document_id=*/5, /*joinable_property_id=*/20),
- /*ref_document_id=*/4));
- }
-
- {
- // The previous instance went out of scope and was destructed. Although we
- // didn't call PersistToDisk explicitly, the destructor should invoke it and
- // thus initializing another instance on the same files should succeed, and
- // we should be able to get the same contents.
- ICING_ASSERT_OK_AND_ASSIGN(
- std::unique_ptr<QualifiedIdTypeJoinableCache> cache,
- QualifiedIdTypeJoinableCache::Create(filesystem_, working_path_));
- EXPECT_THAT(cache->Get(DocJoinInfo(/*document_id=*/1,
- /*joinable_property_id=*/20)),
- IsOkAndHolds(0));
- EXPECT_THAT(cache->Get(DocJoinInfo(/*document_id=*/3,
- /*joinable_property_id=*/20)),
- IsOkAndHolds(2));
- EXPECT_THAT(cache->Get(DocJoinInfo(/*document_id=*/5,
- /*joinable_property_id=*/20)),
- IsOkAndHolds(4));
- }
-}
-
-TEST_F(QualifiedIdTypeJoinableCacheTest,
- InitializeExistingFilesWithDifferentMagicShouldFail) {
- {
- // Create new qualified id type joinable cache
- ICING_ASSERT_OK_AND_ASSIGN(
- std::unique_ptr<QualifiedIdTypeJoinableCache> cache,
- QualifiedIdTypeJoinableCache::Create(filesystem_, working_path_));
- ICING_ASSERT_OK(
- cache->Put(DocJoinInfo(/*document_id=*/1, /*joinable_property_id=*/20),
- /*ref_document_id=*/0));
-
- ICING_ASSERT_OK(cache->PersistToDisk());
- }
-
- {
- // Manually change magic and update checksum
- const std::string metadata_file_path = absl_ports::StrCat(
- working_path_, "/", QualifiedIdTypeJoinableCache::kFilePrefix, ".m");
- ScopedFd metadata_sfd(filesystem_.OpenForWrite(metadata_file_path.c_str()));
- ASSERT_THAT(metadata_sfd.is_valid(), IsTrue());
-
- auto metadata_buffer = std::make_unique<uint8_t[]>(
- QualifiedIdTypeJoinableCache::kMetadataFileSize);
- ASSERT_THAT(
- filesystem_.PRead(metadata_sfd.get(), metadata_buffer.get(),
- QualifiedIdTypeJoinableCache::kMetadataFileSize,
- /*offset=*/0),
- IsTrue());
-
- // Manually change magic and update checksums.
- Crcs* crcs = reinterpret_cast<Crcs*>(
- metadata_buffer.get() +
- QualifiedIdTypeJoinableCache::kCrcsMetadataBufferOffset);
- Info* info = reinterpret_cast<Info*>(
- metadata_buffer.get() +
- QualifiedIdTypeJoinableCache::kInfoMetadataBufferOffset);
- info->magic += kCorruptedValueOffset;
- crcs->component_crcs.info_crc = info->ComputeChecksum().Get();
- crcs->all_crc = crcs->component_crcs.ComputeChecksum().Get();
- ASSERT_THAT(filesystem_.PWrite(
- metadata_sfd.get(), /*offset=*/0, metadata_buffer.get(),
- QualifiedIdTypeJoinableCache::kMetadataFileSize),
- IsTrue());
- }
-
- // Attempt to create the qualified id type joinable cache with different
- // magic. This should fail.
- EXPECT_THAT(QualifiedIdTypeJoinableCache::Create(filesystem_, working_path_),
- StatusIs(libtextclassifier3::StatusCode::FAILED_PRECONDITION,
- HasSubstr("Incorrect magic value")));
-}
-
-TEST_F(QualifiedIdTypeJoinableCacheTest,
- InitializeExistingFilesWithWrongAllCrcShouldFail) {
- {
- // Create new qualified id type joinable cache
- ICING_ASSERT_OK_AND_ASSIGN(
- std::unique_ptr<QualifiedIdTypeJoinableCache> cache,
- QualifiedIdTypeJoinableCache::Create(filesystem_, working_path_));
- ICING_ASSERT_OK(
- cache->Put(DocJoinInfo(/*document_id=*/1, /*joinable_property_id=*/20),
- /*ref_document_id=*/0));
-
- ICING_ASSERT_OK(cache->PersistToDisk());
- }
-
- {
- const std::string metadata_file_path = absl_ports::StrCat(
- working_path_, "/", QualifiedIdTypeJoinableCache::kFilePrefix, ".m");
- ScopedFd metadata_sfd(filesystem_.OpenForWrite(metadata_file_path.c_str()));
- ASSERT_THAT(metadata_sfd.is_valid(), IsTrue());
-
- auto metadata_buffer = std::make_unique<uint8_t[]>(
- QualifiedIdTypeJoinableCache::kMetadataFileSize);
- ASSERT_THAT(
- filesystem_.PRead(metadata_sfd.get(), metadata_buffer.get(),
- QualifiedIdTypeJoinableCache::kMetadataFileSize,
- /*offset=*/0),
- IsTrue());
-
- // Manually corrupt all_crc
- Crcs* crcs = reinterpret_cast<Crcs*>(
- metadata_buffer.get() +
- QualifiedIdTypeJoinableCache::kCrcsMetadataBufferOffset);
- crcs->all_crc += kCorruptedValueOffset;
-
- ASSERT_THAT(filesystem_.PWrite(
- metadata_sfd.get(), /*offset=*/0, metadata_buffer.get(),
- QualifiedIdTypeJoinableCache::kMetadataFileSize),
- IsTrue());
- }
-
- // Attempt to create the qualified id type joinable cache with metadata
- // containing corrupted all_crc. This should fail.
- EXPECT_THAT(QualifiedIdTypeJoinableCache::Create(filesystem_, working_path_),
- StatusIs(libtextclassifier3::StatusCode::FAILED_PRECONDITION,
- HasSubstr("Invalid all crc")));
-}
-
-TEST_F(QualifiedIdTypeJoinableCacheTest,
- InitializeExistingFilesWithCorruptedInfoShouldFail) {
- {
- // Create new qualified id type joinable cache
- ICING_ASSERT_OK_AND_ASSIGN(
- std::unique_ptr<QualifiedIdTypeJoinableCache> cache,
- QualifiedIdTypeJoinableCache::Create(filesystem_, working_path_));
- ICING_ASSERT_OK(
- cache->Put(DocJoinInfo(/*document_id=*/1, /*joinable_property_id=*/20),
- /*ref_document_id=*/0));
-
- ICING_ASSERT_OK(cache->PersistToDisk());
- }
-
- {
- const std::string metadata_file_path = absl_ports::StrCat(
- working_path_, "/", QualifiedIdTypeJoinableCache::kFilePrefix, ".m");
- ScopedFd metadata_sfd(filesystem_.OpenForWrite(metadata_file_path.c_str()));
- ASSERT_THAT(metadata_sfd.is_valid(), IsTrue());
-
- auto metadata_buffer = std::make_unique<uint8_t[]>(
- QualifiedIdTypeJoinableCache::kMetadataFileSize);
- ASSERT_THAT(
- filesystem_.PRead(metadata_sfd.get(), metadata_buffer.get(),
- QualifiedIdTypeJoinableCache::kMetadataFileSize,
- /*offset=*/0),
- IsTrue());
-
- // Modify info, but don't update the checksum. This would be similar to
- // corruption of info.
- Info* info = reinterpret_cast<Info*>(
- metadata_buffer.get() +
- QualifiedIdTypeJoinableCache::kInfoMetadataBufferOffset);
- info->last_added_document_id += kCorruptedValueOffset;
-
- ASSERT_THAT(filesystem_.PWrite(
- metadata_sfd.get(), /*offset=*/0, metadata_buffer.get(),
- QualifiedIdTypeJoinableCache::kMetadataFileSize),
- IsTrue());
- }
-
- // Attempt to create the qualified id type joinable cache with info that
- // doesn't match its checksum. This should fail.
- EXPECT_THAT(QualifiedIdTypeJoinableCache::Create(filesystem_, working_path_),
- StatusIs(libtextclassifier3::StatusCode::FAILED_PRECONDITION,
- HasSubstr("Invalid info crc")));
-}
-
-TEST_F(
- QualifiedIdTypeJoinableCacheTest,
- InitializeExistingFilesWithCorruptedDocumentToQualifiedIdMapperShouldFail) {
- {
- // Create new qualified id type joinable cache
- ICING_ASSERT_OK_AND_ASSIGN(
- std::unique_ptr<QualifiedIdTypeJoinableCache> cache,
- QualifiedIdTypeJoinableCache::Create(filesystem_, working_path_));
- ICING_ASSERT_OK(
- cache->Put(DocJoinInfo(/*document_id=*/1, /*joinable_property_id=*/20),
- /*ref_document_id=*/0));
-
- ICING_ASSERT_OK(cache->PersistToDisk());
- }
-
- {
- // Corrupt document_to_qualified_id_mapper manually.
- std::string mapper_working_path = absl_ports::StrCat(
- working_path_, "/", QualifiedIdTypeJoinableCache::kFilePrefix,
- "_mapper");
- ICING_ASSERT_OK_AND_ASSIGN(
- std::unique_ptr<PersistentHashMapKeyMapper<DocumentId>> mapper,
- PersistentHashMapKeyMapper<DocumentId>::Create(
- filesystem_, std::move(mapper_working_path)));
- ICING_ASSERT_OK_AND_ASSIGN(Crc32 old_crc, mapper->ComputeChecksum());
- ICING_ASSERT_OK(mapper->Put("foo", 12345));
- ICING_ASSERT_OK(mapper->PersistToDisk());
- ICING_ASSERT_OK_AND_ASSIGN(Crc32 new_crc, mapper->ComputeChecksum());
- ASSERT_THAT(old_crc, Not(Eq(new_crc)));
- }
-
- // Attempt to create the qualified id type joinable cache with corrupted
- // document_to_qualified_id_mapper. This should fail.
- EXPECT_THAT(QualifiedIdTypeJoinableCache::Create(filesystem_, working_path_),
- StatusIs(libtextclassifier3::StatusCode::FAILED_PRECONDITION,
- HasSubstr("Invalid storages crc")));
-}
-
-TEST_F(QualifiedIdTypeJoinableCacheTest, InvalidPut) {
- // Create new qualified id type joinable cache
- ICING_ASSERT_OK_AND_ASSIGN(
- std::unique_ptr<QualifiedIdTypeJoinableCache> cache,
- QualifiedIdTypeJoinableCache::Create(filesystem_, working_path_));
-
- DocJoinInfo default_invalid;
- EXPECT_THAT(cache->Put(default_invalid, /*ref_document_id=*/0),
- StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
-}
-
-TEST_F(QualifiedIdTypeJoinableCacheTest, InvalidGet) {
- // Create new qualified id type joinable cache
- ICING_ASSERT_OK_AND_ASSIGN(
- std::unique_ptr<QualifiedIdTypeJoinableCache> cache,
- QualifiedIdTypeJoinableCache::Create(filesystem_, working_path_));
-
- DocJoinInfo default_invalid;
- EXPECT_THAT(cache->Get(default_invalid),
- StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
-}
-
-TEST_F(QualifiedIdTypeJoinableCacheTest, PutAndGet) {
- DocJoinInfo target_info1(/*document_id=*/1, /*joinable_property_id=*/20);
- DocumentId ref_document1 = 0;
-
- DocJoinInfo target_info2(/*document_id=*/3, /*joinable_property_id=*/13);
- DocumentId ref_document2 = 2;
-
- DocJoinInfo target_info3(/*document_id=*/4, /*joinable_property_id=*/4);
- DocumentId ref_document3 = ref_document1;
-
- {
- // Create new qualified id type joinable cache
- ICING_ASSERT_OK_AND_ASSIGN(
- std::unique_ptr<QualifiedIdTypeJoinableCache> cache,
- QualifiedIdTypeJoinableCache::Create(filesystem_, working_path_));
-
- EXPECT_THAT(cache->Put(target_info1, /*ref_document_id=*/ref_document1),
- IsOk());
- EXPECT_THAT(cache->Put(target_info2, /*ref_document_id=*/ref_document2),
- IsOk());
- EXPECT_THAT(cache->Put(target_info3, /*ref_document_id=*/ref_document3),
- IsOk());
-
- EXPECT_THAT(cache->Get(target_info1), IsOkAndHolds(ref_document1));
- EXPECT_THAT(cache->Get(target_info2), IsOkAndHolds(ref_document2));
- EXPECT_THAT(cache->Get(target_info3), IsOkAndHolds(ref_document3));
-
- ICING_ASSERT_OK(cache->PersistToDisk());
- }
-
- // Verify we can get all of them after destructing and re-initializing.
- ICING_ASSERT_OK_AND_ASSIGN(
- std::unique_ptr<QualifiedIdTypeJoinableCache> cache,
- QualifiedIdTypeJoinableCache::Create(filesystem_, working_path_));
- EXPECT_THAT(cache->Get(target_info1), IsOkAndHolds(ref_document1));
- EXPECT_THAT(cache->Get(target_info2), IsOkAndHolds(ref_document2));
- EXPECT_THAT(cache->Get(target_info3), IsOkAndHolds(ref_document3));
-}
-
-TEST_F(QualifiedIdTypeJoinableCacheTest,
- GetShouldReturnNotFoundErrorIfNotExist) {
- DocJoinInfo target_info(/*document_id=*/1, /*joinable_property_id=*/20);
- DocumentId ref_document = 0;
-
- // Create new qualified id type joinable cache
- ICING_ASSERT_OK_AND_ASSIGN(
- std::unique_ptr<QualifiedIdTypeJoinableCache> cache,
- QualifiedIdTypeJoinableCache::Create(filesystem_, working_path_));
-
- // Verify entry is not found in the beginning.
- EXPECT_THAT(cache->Get(target_info),
- StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
-
- ICING_ASSERT_OK(cache->Put(target_info, /*ref_document_id=*/ref_document));
- ASSERT_THAT(cache->Get(target_info), IsOkAndHolds(ref_document));
-
- // Get another non-existing entry. This should get NOT_FOUND_ERROR.
- DocJoinInfo another_target_info(/*document_id=*/2,
- /*joinable_property_id=*/20);
- EXPECT_THAT(cache->Get(another_target_info),
- StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
-}
-
-} // namespace
-
-} // namespace lib
-} // namespace icing
diff --git a/icing/join/qualified-id-type-joinable-cache.cc b/icing/join/qualified-id-type-joinable-index.cc
index 4dc6e5a..231e78a 100644
--- a/icing/join/qualified-id-type-joinable-cache.cc
+++ b/icing/join/qualified-id-type-joinable-index.cc
@@ -12,16 +12,18 @@
// See the License for the specific language governing permissions and
// limitations under the License.
-#include "icing/join/qualified-id-type-joinable-cache.h"
+#include "icing/join/qualified-id-type-joinable-index.h"
#include <memory>
#include <string>
#include <string_view>
+#include <vector>
#include "icing/text_classifier/lib3/utils/base/status.h"
#include "icing/text_classifier/lib3/utils/base/statusor.h"
#include "icing/absl_ports/canonical_errors.h"
#include "icing/absl_ports/str_cat.h"
+#include "icing/file/destructible-directory.h"
#include "icing/file/filesystem.h"
#include "icing/join/doc-join-info.h"
#include "icing/store/document-id.h"
@@ -37,42 +39,50 @@ namespace lib {
namespace {
// Translates old_document_id through the old-to-new lookup table. Yields
// kInvalidDocumentId when the id lies at or past the end of the table.
// NOTE(review): the bounds check compares a DocumentId against size(); if
// DocumentId is a signed type, a negative id presumably converts to a large
// unsigned value and therefore also takes the invalid branch -- confirm.
+DocumentId GetNewDocumentId(
+ const std::vector<DocumentId>& document_id_old_to_new,
+ DocumentId old_document_id) {
+ if (old_document_id >= document_id_old_to_new.size()) {
+ return kInvalidDocumentId;
+ }
+ return document_id_old_to_new[old_document_id];
+}
+
// Builds the metadata file path: "<working_path>/<kFilePrefix>.m".
std::string GetMetadataFilePath(std::string_view working_path) {
return absl_ports::StrCat(working_path, "/",
- QualifiedIdTypeJoinableCache::kFilePrefix, ".m");
+ QualifiedIdTypeJoinableIndex::kFilePrefix, ".m");
}
// Builds the path of the document-to-qualified-id mapper storage:
// "<working_path>/<kFilePrefix>_mapper".
std::string GetDocumentToQualifiedIdMapperPath(std::string_view working_path) {
return absl_ports::StrCat(
- working_path, "/", QualifiedIdTypeJoinableCache::kFilePrefix, "_mapper");
+ working_path, "/", QualifiedIdTypeJoinableIndex::kFilePrefix, "_mapper");
}
} // namespace
// Opens the index stored under working_path. When either the metadata file or
// the mapper directory is absent, the working path is discarded and a fresh
// index is initialized; otherwise the existing files are loaded.
/* static */ libtextclassifier3::StatusOr<
- std::unique_ptr<QualifiedIdTypeJoinableCache>>
-QualifiedIdTypeJoinableCache::Create(const Filesystem& filesystem,
+ std::unique_ptr<QualifiedIdTypeJoinableIndex>>
+QualifiedIdTypeJoinableIndex::Create(const Filesystem& filesystem,
std::string working_path) {
if (!filesystem.FileExists(GetMetadataFilePath(working_path).c_str()) ||
!filesystem.DirectoryExists(
GetDocumentToQualifiedIdMapperPath(working_path).c_str())) {
// Discard working_path if any file/directory is missing, and reinitialize.
- ICING_RETURN_IF_ERROR(
- PersistentStorage::Discard(filesystem, working_path, kWorkingPathType));
+ ICING_RETURN_IF_ERROR(Discard(filesystem, working_path));
return InitializeNewFiles(filesystem, std::move(working_path));
}
return InitializeExistingFiles(filesystem, std::move(working_path));
}
// Attempts PersistToDisk() on destruction. A destructor cannot report a status,
// so a failure is only logged as a warning together with the working path.
-QualifiedIdTypeJoinableCache::~QualifiedIdTypeJoinableCache() {
+QualifiedIdTypeJoinableIndex::~QualifiedIdTypeJoinableIndex() {
if (!PersistToDisk().ok()) {
- ICING_LOG(WARNING) << "Failed to persist qualified id type joinable cache "
+ ICING_LOG(WARNING) << "Failed to persist qualified id type joinable index "
"to disk while destructing "
<< working_path_;
}
}
-libtextclassifier3::Status QualifiedIdTypeJoinableCache::Put(
+libtextclassifier3::Status QualifiedIdTypeJoinableIndex::Put(
const DocJoinInfo& doc_join_info, DocumentId ref_document_id) {
if (!doc_join_info.is_valid()) {
return absl_ports::InvalidArgumentError(
@@ -82,12 +92,12 @@ libtextclassifier3::Status QualifiedIdTypeJoinableCache::Put(
ICING_RETURN_IF_ERROR(document_to_qualified_id_mapper_->Put(
encode_util::EncodeIntToCString(doc_join_info.value()), ref_document_id));
- // TODO(b/263890397): add delete propagation
+ // TODO(b/268521214): add data into delete propagation storage
return libtextclassifier3::Status::OK;
}
-libtextclassifier3::StatusOr<DocumentId> QualifiedIdTypeJoinableCache::Get(
+libtextclassifier3::StatusOr<DocumentId> QualifiedIdTypeJoinableIndex::Get(
const DocJoinInfo& doc_join_info) const {
if (!doc_join_info.is_valid()) {
return absl_ports::InvalidArgumentError(
@@ -98,9 +108,79 @@ libtextclassifier3::StatusOr<DocumentId> QualifiedIdTypeJoinableCache::Get(
encode_util::EncodeIntToCString(doc_join_info.value()));
}
// Rebuilds the index with remapped document ids. A new index is populated under
// "<working_path>_temp", persisted, and swapped into working_path_; this
// instance then reloads its metadata buffer and mapper from the swapped-in
// files.
//
// Returns OK on success. An internal error is returned when the temp directory
// cannot be created, the swap fails, or the metadata file cannot be read;
// errors from Discard/Create/TransferIndex/PersistToDisk are propagated.
//
// NOTE(review): the mapper is reset before the swap, so every error return
// after that point leaves document_to_qualified_id_mapper_ null -- confirm that
// callers discard/rebuild the index when Optimize fails.
+libtextclassifier3::Status QualifiedIdTypeJoinableIndex::Optimize(
+ const std::vector<DocumentId>& document_id_old_to_new,
+ DocumentId new_last_added_document_id) {
+ std::string temp_working_path = working_path_ + "_temp";
// Discard whatever currently sits at the temp path before building into it
// (presumably leftovers from an earlier, interrupted run).
+ ICING_RETURN_IF_ERROR(Discard(filesystem_, temp_working_path));
+
// NOTE(review): DestructibleDirectory presumably removes the temp directory
// when it goes out of scope -- confirm against its header.
+ DestructibleDirectory temp_working_path_ddir(&filesystem_,
+ std::move(temp_working_path));
+ if (!temp_working_path_ddir.is_valid()) {
+ return absl_ports::InternalError(
+ "Unable to create temp directory to build new qualified id type "
+ "joinable index");
+ }
+
+ {
+ // Transfer all data from the current to new qualified id type joinable
+ // index. Also PersistToDisk and destruct the instance after finishing, so
+ // we can safely swap directories later.
+ ICING_ASSIGN_OR_RETURN(
+ std::unique_ptr<QualifiedIdTypeJoinableIndex> new_index,
+ Create(filesystem_, temp_working_path_ddir.dir()));
+ ICING_RETURN_IF_ERROR(
+ TransferIndex(document_id_old_to_new, new_index.get()));
+ new_index->set_last_added_document_id(new_last_added_document_id);
+ ICING_RETURN_IF_ERROR(new_index->PersistToDisk());
+ }
+
+ // Destruct current index's storage instances to safely swap directories.
+ // TODO(b/268521214): handle delete propagation storage
+ document_to_qualified_id_mapper_.reset();
+
+ if (!filesystem_.SwapFiles(temp_working_path_ddir.dir().c_str(),
+ working_path_.c_str())) {
+ return absl_ports::InternalError(
+ "Unable to apply new qualified id type joinable index due to failed "
+ "swap");
+ }
+
+ // Reinitialize qualified id type joinable index.
// The in-memory metadata buffer still holds the pre-optimize contents, so it
// is overwritten with the metadata file that was just swapped in.
+ if (!filesystem_.PRead(GetMetadataFilePath(working_path_).c_str(),
+ metadata_buffer_.get(), kMetadataFileSize,
+ /*offset=*/0)) {
+ return absl_ports::InternalError("Fail to read metadata file");
+ }
+ ICING_ASSIGN_OR_RETURN(
+ document_to_qualified_id_mapper_,
+ PersistentHashMapKeyMapper<DocumentId>::Create(
+ filesystem_, GetDocumentToQualifiedIdMapperPath(working_path_)));
+
+ return libtextclassifier3::Status::OK;
+}
+
// Empties the index: the document-to-qualified-id mapper storage is deleted and
// recreated at the same path, and last_added_document_id is reset to
// kInvalidDocumentId.
//
// NOTE(review): the mapper is reset first, so if Delete or Create fails the
// function returns with document_to_qualified_id_mapper_ null -- confirm that
// callers treat a failed Clear as requiring a discard/rebuild.
+libtextclassifier3::Status QualifiedIdTypeJoinableIndex::Clear() {
+ document_to_qualified_id_mapper_.reset();
+ // Discard and reinitialize document to qualified id mapper.
+ std::string document_to_qualified_id_mapper_path =
+ GetDocumentToQualifiedIdMapperPath(working_path_);
+ ICING_RETURN_IF_ERROR(PersistentHashMapKeyMapper<DocumentId>::Delete(
+ filesystem_, document_to_qualified_id_mapper_path));
+ ICING_ASSIGN_OR_RETURN(
+ document_to_qualified_id_mapper_,
+ PersistentHashMapKeyMapper<DocumentId>::Create(
+ filesystem_, std::move(document_to_qualified_id_mapper_path)));
+
+ // TODO(b/268521214): clear delete propagation storage
+
+ info().last_added_document_id = kInvalidDocumentId;
+ return libtextclassifier3::Status::OK;
+}
+
/* static */ libtextclassifier3::StatusOr<
- std::unique_ptr<QualifiedIdTypeJoinableCache>>
-QualifiedIdTypeJoinableCache::InitializeNewFiles(const Filesystem& filesystem,
+ std::unique_ptr<QualifiedIdTypeJoinableIndex>>
+QualifiedIdTypeJoinableIndex::InitializeNewFiles(const Filesystem& filesystem,
std::string&& working_path) {
// Create working directory.
if (!filesystem.CreateDirectoryRecursively(working_path.c_str())) {
@@ -116,25 +196,25 @@ QualifiedIdTypeJoinableCache::InitializeNewFiles(const Filesystem& filesystem,
filesystem, GetDocumentToQualifiedIdMapperPath(working_path)));
// Create instance.
- auto new_type_joinable_cache = std::unique_ptr<QualifiedIdTypeJoinableCache>(
- new QualifiedIdTypeJoinableCache(
+ auto new_index = std::unique_ptr<QualifiedIdTypeJoinableIndex>(
+ new QualifiedIdTypeJoinableIndex(
filesystem, std::move(working_path),
/*metadata_buffer=*/std::make_unique<uint8_t[]>(kMetadataFileSize),
std::move(document_to_qualified_id_mapper)));
// Initialize info content.
- new_type_joinable_cache->info().magic = Info::kMagic;
- new_type_joinable_cache->info().last_added_document_id = kInvalidDocumentId;
+ new_index->info().magic = Info::kMagic;
+ new_index->info().last_added_document_id = kInvalidDocumentId;
// Initialize new PersistentStorage. The initial checksums will be computed
// and set via InitializeNewStorage. Also write them into disk as well.
- ICING_RETURN_IF_ERROR(new_type_joinable_cache->InitializeNewStorage());
- ICING_RETURN_IF_ERROR(new_type_joinable_cache->PersistMetadataToDisk());
+ ICING_RETURN_IF_ERROR(new_index->InitializeNewStorage());
+ ICING_RETURN_IF_ERROR(new_index->PersistMetadataToDisk());
- return new_type_joinable_cache;
+ return new_index;
}
/* static */ libtextclassifier3::StatusOr<
- std::unique_ptr<QualifiedIdTypeJoinableCache>>
-QualifiedIdTypeJoinableCache::InitializeExistingFiles(
+ std::unique_ptr<QualifiedIdTypeJoinableIndex>>
+QualifiedIdTypeJoinableIndex::InitializeExistingFiles(
const Filesystem& filesystem, std::string&& working_path) {
// PRead metadata file.
auto metadata_buffer = std::make_unique<uint8_t[]>(kMetadataFileSize);
@@ -151,23 +231,53 @@ QualifiedIdTypeJoinableCache::InitializeExistingFiles(
filesystem, GetDocumentToQualifiedIdMapperPath(working_path)));
// Create instance.
- auto type_joinable_cache = std::unique_ptr<QualifiedIdTypeJoinableCache>(
- new QualifiedIdTypeJoinableCache(
+ auto type_joinable_index = std::unique_ptr<QualifiedIdTypeJoinableIndex>(
+ new QualifiedIdTypeJoinableIndex(
filesystem, std::move(working_path), std::move(metadata_buffer),
std::move(document_to_qualified_id_mapper)));
// Initialize existing PersistentStorage. Checksums will be validated.
- ICING_RETURN_IF_ERROR(type_joinable_cache->InitializeExistingStorage());
+ ICING_RETURN_IF_ERROR(type_joinable_index->InitializeExistingStorage());
// Validate magic.
- if (type_joinable_cache->info().magic != Info::kMagic) {
+ if (type_joinable_index->info().magic != Info::kMagic) {
return absl_ports::FailedPreconditionError("Incorrect magic value");
}
- return type_joinable_cache;
+ return type_joinable_index;
+}
+
+libtextclassifier3::Status QualifiedIdTypeJoinableIndex::TransferIndex(
+ const std::vector<DocumentId>& document_id_old_to_new,
+ QualifiedIdTypeJoinableIndex* new_index) const {
+ std::unique_ptr<KeyMapper<DocumentId>::Iterator> iter =
+ document_to_qualified_id_mapper_->GetIterator();
+ while (iter->Advance()) {  // walk every entry currently stored in this index's mapper
+ DocJoinInfo old_doc_join_info(
+ encode_util::DecodeIntFromCString(iter->GetKey()));  // the key string decodes to the integer used to build DocJoinInfo
+ DocumentId old_ref_document_id = iter->GetValue();  // the value is the referenced document id
+
+ // Translate both the referencing and the referenced document id to new doc ids.
+ DocumentId new_document_id = GetNewDocumentId(
+ document_id_old_to_new, old_doc_join_info.document_id());
+ DocumentId new_ref_document_id =
+ GetNewDocumentId(document_id_old_to_new, old_ref_document_id);
+
+ if (new_document_id != kInvalidDocumentId &&
+ new_ref_document_id != kInvalidDocumentId) {  // entries with either side mapped to kInvalidDocumentId are not copied
+ ICING_RETURN_IF_ERROR(
+ new_index->Put(DocJoinInfo(new_document_id,
+ old_doc_join_info.joinable_property_id()),  // joinable property id is carried over unchanged
+ new_ref_document_id));
+ }
+ }
+
+ // TODO(b/268521214): transfer delete propagation storage
+
+ return libtextclassifier3::Status::OK;
}
libtextclassifier3::Status
-QualifiedIdTypeJoinableCache::PersistMetadataToDisk() {
+QualifiedIdTypeJoinableIndex::PersistMetadataToDisk() {
std::string metadata_file_path = GetMetadataFilePath(working_path_);
ScopedFd sfd(filesystem_.OpenForWrite(metadata_file_path.c_str()));
@@ -188,17 +298,17 @@ QualifiedIdTypeJoinableCache::PersistMetadataToDisk() {
}
libtextclassifier3::Status
-QualifiedIdTypeJoinableCache::PersistStoragesToDisk() {
+QualifiedIdTypeJoinableIndex::PersistStoragesToDisk() {
return document_to_qualified_id_mapper_->PersistToDisk();
}
libtextclassifier3::StatusOr<Crc32>
-QualifiedIdTypeJoinableCache::ComputeInfoChecksum() {
+QualifiedIdTypeJoinableIndex::ComputeInfoChecksum() {
return info().ComputeChecksum();
}
libtextclassifier3::StatusOr<Crc32>
-QualifiedIdTypeJoinableCache::ComputeStoragesChecksum() {
+QualifiedIdTypeJoinableIndex::ComputeStoragesChecksum() {
return document_to_qualified_id_mapper_->ComputeChecksum();
}
diff --git a/icing/join/qualified-id-type-joinable-cache.h b/icing/join/qualified-id-type-joinable-index.h
index 08f6455..794f33f 100644
--- a/icing/join/qualified-id-type-joinable-cache.h
+++ b/icing/join/qualified-id-type-joinable-index.h
@@ -12,13 +12,14 @@
// See the License for the specific language governing permissions and
// limitations under the License.
-#ifndef ICING_JOIN_QUALIFIED_ID_TYPE_JOINABLE_CACHE_H_
-#define ICING_JOIN_QUALIFIED_ID_TYPE_JOINABLE_CACHE_H_
+#ifndef ICING_JOIN_QUALIFIED_ID_TYPE_JOINABLE_INDEX_H_
+#define ICING_JOIN_QUALIFIED_ID_TYPE_JOINABLE_INDEX_H_
#include <cstdint>
#include <memory>
#include <string>
#include <string_view>
+#include <vector>
#include "icing/text_classifier/lib3/utils/base/status.h"
#include "icing/text_classifier/lib3/utils/base/statusor.h"
@@ -32,9 +33,9 @@
namespace icing {
namespace lib {
-// QualifiedIdTypeJoinableCache: a class to maintain cache data mapping
-// DocJoinInfo to joinable qualified ids and delete propagation info.
-class QualifiedIdTypeJoinableCache : public PersistentStorage {
+// QualifiedIdTypeJoinableIndex: a class to maintain data mapping DocJoinInfo to
+// joinable qualified ids and delete propagation info.
+class QualifiedIdTypeJoinableIndex : public PersistentStorage {
public:
struct Info {
static constexpr int32_t kMagic = 0x48cabdc6;
@@ -58,16 +59,17 @@ class QualifiedIdTypeJoinableCache : public PersistentStorage {
static constexpr WorkingPathType kWorkingPathType =
WorkingPathType::kDirectory;
- static constexpr std::string_view kFilePrefix = "qualified_id_joinable_cache";
+ static constexpr std::string_view kFilePrefix =
+ "qualified_id_type_joinable_index";
- // Creates a QualifiedIdTypeJoinableCache instance to store qualified ids for
+ // Creates a QualifiedIdTypeJoinableIndex instance to store qualified ids for
// future joining search. If any of the underlying file is missing, then
// delete the whole working_path and (re)initialize with new ones. Otherwise
// initialize and create the instance by existing files.
//
// filesystem: Object to make system level calls
// working_path: Specifies the working path for PersistentStorage.
- // QualifiedIdTypeJoinableCache uses working path as working
+ // QualifiedIdTypeJoinableIndex uses working path as working
// directory and all related files will be stored under this
// directory. It takes full ownership and of working_path_,
// including creation/deletion. It is the caller's
@@ -84,21 +86,32 @@ class QualifiedIdTypeJoinableCache : public PersistentStorage {
// - INTERNAL_ERROR on I/O errors
// - Any KeyMapper errors
static libtextclassifier3::StatusOr<
- std::unique_ptr<QualifiedIdTypeJoinableCache>>
+ std::unique_ptr<QualifiedIdTypeJoinableIndex>>
Create(const Filesystem& filesystem, std::string working_path);
+ // Deletes QualifiedIdTypeJoinableIndex under working_path.
+ // Forwards to PersistentStorage::Discard using this class's kWorkingPathType.
+ // Returns:
+ // - OK on success
+ // - INTERNAL_ERROR on I/O error
+ static libtextclassifier3::Status Discard(const Filesystem& filesystem,
+ const std::string& working_path) {
+ return PersistentStorage::Discard(filesystem, working_path,
+ kWorkingPathType);  // kWorkingPathType is WorkingPathType::kDirectory for this class
+ }
+
// Delete copy and move constructor/assignment operator.
- QualifiedIdTypeJoinableCache(const QualifiedIdTypeJoinableCache&) = delete;
- QualifiedIdTypeJoinableCache& operator=(const QualifiedIdTypeJoinableCache&) =
+ QualifiedIdTypeJoinableIndex(const QualifiedIdTypeJoinableIndex&) = delete;
+ QualifiedIdTypeJoinableIndex& operator=(const QualifiedIdTypeJoinableIndex&) =
delete;
- QualifiedIdTypeJoinableCache(QualifiedIdTypeJoinableCache&&) = delete;
- QualifiedIdTypeJoinableCache& operator=(QualifiedIdTypeJoinableCache&&) =
+ QualifiedIdTypeJoinableIndex(QualifiedIdTypeJoinableIndex&&) = delete;
+ QualifiedIdTypeJoinableIndex& operator=(QualifiedIdTypeJoinableIndex&&) =
delete;
- ~QualifiedIdTypeJoinableCache() override;
+ ~QualifiedIdTypeJoinableIndex() override;
- // Puts a new data into cache: DocJoinInfo (DocumentId, JoinablePropertyId)
+ // Puts new data into the index: DocJoinInfo (DocumentId, JoinablePropertyId)
// references to ref_document_id.
//
// Returns:
@@ -119,8 +132,50 @@ class QualifiedIdTypeJoinableCache : public PersistentStorage {
libtextclassifier3::StatusOr<DocumentId> Get(
const DocJoinInfo& doc_join_info) const;
+ // Reduces internal file sizes by reclaiming space and ids of deleted
+ // documents. Qualified id type joinable index will convert all entries to the
+ // new document ids.
+ //
+ // - document_id_old_to_new: a map for converting old document id to new
+ // document id.
+ // - new_last_added_document_id: will be used to update the last added
+ // document id in the qualified id type joinable
+ // index.
+ //
+ // Returns:
+ // - OK on success
+ // - INTERNAL_ERROR on I/O error. This could potentially leave the index in
+ // an invalid state and the caller should handle it properly (e.g. discard
+ // and rebuild)
+ libtextclassifier3::Status Optimize(
+ const std::vector<DocumentId>& document_id_old_to_new,
+ DocumentId new_last_added_document_id);
+
+ // Clears all data and sets last_added_document_id to kInvalidDocumentId.
+ //
+ // Returns:
+ // - OK on success
+ // - INTERNAL_ERROR on I/O error
+ libtextclassifier3::Status Clear();
+
+ int32_t size() const { return document_to_qualified_id_mapper_->num_keys(); }  // number of keys held by the document to qualified id mapper
+
+ bool empty() const { return size() == 0; }  // true when the mapper holds no keys
+
+ DocumentId last_added_document_id() const {
+ return info().last_added_document_id;  // read from the Info section; kInvalidDocumentId until a value is set
+ }
+
+ void set_last_added_document_id(DocumentId document_id) {
+ Info& info_ref = info();
+ if (info_ref.last_added_document_id == kInvalidDocumentId ||
+ document_id > info_ref.last_added_document_id) {  // only ever moves forward: ids not greater than the current value are ignored
+ info_ref.last_added_document_id = document_id;
+ }
+ }
+
private:
- explicit QualifiedIdTypeJoinableCache(
+ explicit QualifiedIdTypeJoinableIndex(
const Filesystem& filesystem, std::string&& working_path,
std::unique_ptr<uint8_t[]> metadata_buffer,
std::unique_ptr<KeyMapper<DocumentId>> key_mapper)
@@ -130,14 +185,25 @@ class QualifiedIdTypeJoinableCache : public PersistentStorage {
document_to_qualified_id_mapper_(std::move(key_mapper)) {}
static libtextclassifier3::StatusOr<
- std::unique_ptr<QualifiedIdTypeJoinableCache>>
+ std::unique_ptr<QualifiedIdTypeJoinableIndex>>
InitializeNewFiles(const Filesystem& filesystem, std::string&& working_path);
static libtextclassifier3::StatusOr<
- std::unique_ptr<QualifiedIdTypeJoinableCache>>
+ std::unique_ptr<QualifiedIdTypeJoinableIndex>>
InitializeExistingFiles(const Filesystem& filesystem,
std::string&& working_path);
+ // Transfers qualified id type joinable index data from the current index to
+ // new_index and converts document ids according to document_id_old_to_new.
+ // It is a helper function for Optimize.
+ //
+ // Returns:
+ // - OK on success
+ // - INTERNAL_ERROR on I/O error
+ libtextclassifier3::Status TransferIndex(
+ const std::vector<DocumentId>& document_id_old_to_new,
+ QualifiedIdTypeJoinableIndex* new_index) const;
+
// Flushes contents of metadata file.
//
// Returns:
@@ -193,10 +259,10 @@ class QualifiedIdTypeJoinableCache : public PersistentStorage {
// qualified id string).
std::unique_ptr<KeyMapper<DocumentId>> document_to_qualified_id_mapper_;
- // TODO(b/263890397): add delete propagation storage
+ // TODO(b/268521214): add delete propagation storage
};
} // namespace lib
} // namespace icing
-#endif // ICING_JOIN_QUALIFIED_ID_TYPE_JOINABLE_CACHE_H_
+#endif // ICING_JOIN_QUALIFIED_ID_TYPE_JOINABLE_INDEX_H_
diff --git a/icing/join/qualified-id-type-joinable-index_test.cc b/icing/join/qualified-id-type-joinable-index_test.cc
new file mode 100644
index 0000000..6cbc9e4
--- /dev/null
+++ b/icing/join/qualified-id-type-joinable-index_test.cc
@@ -0,0 +1,739 @@
+// Copyright (C) 2023 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "icing/join/qualified-id-type-joinable-index.h"
+
+#include <memory>
+#include <string>
+
+#include "icing/text_classifier/lib3/utils/base/status.h"
+#include "gmock/gmock.h"
+#include "gtest/gtest.h"
+#include "icing/file/filesystem.h"
+#include "icing/file/persistent-storage.h"
+#include "icing/join/doc-join-info.h"
+#include "icing/store/document-id.h"
+#include "icing/store/persistent-hash-map-key-mapper.h"
+#include "icing/testing/common-matchers.h"
+#include "icing/testing/tmp-directory.h"
+#include "icing/util/crc32.h"
+
+namespace icing {
+namespace lib {
+
+namespace {
+
+using ::testing::Eq;
+using ::testing::HasSubstr;
+using ::testing::IsEmpty;
+using ::testing::IsTrue;
+using ::testing::Lt;
+using ::testing::Ne;
+using ::testing::Not;
+using ::testing::Pointee;
+using ::testing::SizeIs;
+
+using Crcs = PersistentStorage::Crcs;
+using Info = QualifiedIdTypeJoinableIndex::Info;
+
+static constexpr int32_t kCorruptedValueOffset = 3;
+
+class QualifiedIdTypeJoinableIndexTest : public ::testing::Test {
+ protected:
+ void SetUp() override {
+ base_dir_ = GetTestTempDir() + "/icing";
+ ASSERT_THAT(filesystem_.CreateDirectoryRecursively(base_dir_.c_str()),
+ IsTrue());
+
+ working_path_ = base_dir_ + "/qualified_id_type_joinable_index_test";  // only the path is computed here; the directory itself is not created in SetUp
+ }
+
+ void TearDown() override {
+ filesystem_.DeleteDirectoryRecursively(base_dir_.c_str());  // removes everything a test created under base_dir_
+ }
+
+ Filesystem filesystem_;
+ std::string base_dir_;  // per-test scratch directory
+ std::string working_path_;  // working path handed to QualifiedIdTypeJoinableIndex::Create
+};
+
+TEST_F(QualifiedIdTypeJoinableIndexTest, InvalidWorkingPath) {
+ EXPECT_THAT(
+ QualifiedIdTypeJoinableIndex::Create(
+ filesystem_, "/dev/null/qualified_id_type_joinable_index_test"),  // a path nested under /dev/null, which is not a directory
+ StatusIs(libtextclassifier3::StatusCode::INTERNAL));
+}
+
+TEST_F(QualifiedIdTypeJoinableIndexTest, InitializeNewFiles) {
+ {
+ // Create new qualified id type joinable index
+ ASSERT_FALSE(filesystem_.DirectoryExists(working_path_.c_str()));  // precondition: nothing exists at the working path yet
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<QualifiedIdTypeJoinableIndex> index,
+ QualifiedIdTypeJoinableIndex::Create(filesystem_, working_path_));
+ EXPECT_THAT(index, Pointee(IsEmpty()));
+
+ ICING_ASSERT_OK(index->PersistToDisk());
+ }
+
+ // Metadata file should be initialized correctly for both info and crcs
+ // sections.
+ const std::string metadata_file_path = absl_ports::StrCat(
+ working_path_, "/", QualifiedIdTypeJoinableIndex::kFilePrefix, ".m");
+ auto metadata_buffer = std::make_unique<uint8_t[]>(
+ QualifiedIdTypeJoinableIndex::kMetadataFileSize);
+ ASSERT_THAT(
+ filesystem_.PRead(metadata_file_path.c_str(), metadata_buffer.get(),
+ QualifiedIdTypeJoinableIndex::kMetadataFileSize,
+ /*offset=*/0),
+ IsTrue());
+
+ // Check info section
+ const Info* info = reinterpret_cast<const Info*>(
+ metadata_buffer.get() +
+ QualifiedIdTypeJoinableIndex::kInfoMetadataBufferOffset);
+ EXPECT_THAT(info->magic, Eq(Info::kMagic));
+ EXPECT_THAT(info->last_added_document_id, Eq(kInvalidDocumentId));
+
+ // Check crcs section
+ const Crcs* crcs = reinterpret_cast<const Crcs*>(
+ metadata_buffer.get() +
+ QualifiedIdTypeJoinableIndex::kCrcsMetadataBufferOffset);
+ // The KeyMapper holds some initial info even when empty, so storages_crc
+ // should be non-zero.
+ EXPECT_THAT(crcs->component_crcs.storages_crc, Ne(0));
+ EXPECT_THAT(crcs->component_crcs.info_crc,
+ Eq(Crc32(std::string_view(reinterpret_cast<const char*>(info),
+ sizeof(Info)))
+ .Get()));  // info_crc must equal the CRC of the raw Info bytes read back from disk
+ EXPECT_THAT(crcs->all_crc,
+ Eq(Crc32(std::string_view(
+ reinterpret_cast<const char*>(&crcs->component_crcs),
+ sizeof(Crcs::ComponentCrcs)))
+ .Get()));  // all_crc must equal the CRC of the raw ComponentCrcs bytes
+}
+
+TEST_F(QualifiedIdTypeJoinableIndexTest,
+ InitializationShouldFailWithoutPersistToDiskOrDestruction) {
+ // Create new qualified id type joinable index
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<QualifiedIdTypeJoinableIndex> index,
+ QualifiedIdTypeJoinableIndex::Create(filesystem_, working_path_));
+
+ // Insert some data.
+ ICING_ASSERT_OK(
+ index->Put(DocJoinInfo(/*document_id=*/1, /*joinable_property_id=*/20),
+ /*ref_document_id=*/0));
+ ICING_ASSERT_OK(
+ index->Put(DocJoinInfo(/*document_id=*/3, /*joinable_property_id=*/20),
+ /*ref_document_id=*/2));
+ ICING_ASSERT_OK(
+ index->Put(DocJoinInfo(/*document_id=*/5, /*joinable_property_id=*/20),
+ /*ref_document_id=*/4));
+
+ // Without calling PersistToDisk, checksums will not be recomputed or synced
+ // to disk, so initializing another instance on the same files should fail.
+ EXPECT_THAT(QualifiedIdTypeJoinableIndex::Create(filesystem_, working_path_),  // `index` is still alive here, so its destructor has not run
+ StatusIs(libtextclassifier3::StatusCode::FAILED_PRECONDITION));
+}
+
+TEST_F(QualifiedIdTypeJoinableIndexTest,
+ InitializationShouldSucceedWithPersistToDisk) {
+ // Create new qualified id type joinable index
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<QualifiedIdTypeJoinableIndex> index1,
+ QualifiedIdTypeJoinableIndex::Create(filesystem_, working_path_));
+
+ // Insert some data.
+ ICING_ASSERT_OK(
+ index1->Put(DocJoinInfo(/*document_id=*/1, /*joinable_property_id=*/20),
+ /*ref_document_id=*/0));
+ ICING_ASSERT_OK(
+ index1->Put(DocJoinInfo(/*document_id=*/3, /*joinable_property_id=*/20),
+ /*ref_document_id=*/2));
+ ICING_ASSERT_OK(
+ index1->Put(DocJoinInfo(/*document_id=*/5, /*joinable_property_id=*/20),
+ /*ref_document_id=*/4));
+ ASSERT_THAT(index1, Pointee(SizeIs(3)));
+
+ // After calling PersistToDisk, all checksums should be recomputed and synced
+ // correctly to disk, so initializing another instance on the same files
+ // should succeed, and we should be able to get the same contents.
+ ICING_EXPECT_OK(index1->PersistToDisk());
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<QualifiedIdTypeJoinableIndex> index2,
+ QualifiedIdTypeJoinableIndex::Create(filesystem_, working_path_));  // second instance opened on the same working path while index1 is still alive
+ EXPECT_THAT(index2, Pointee(SizeIs(3)));
+ EXPECT_THAT(
+ index2->Get(DocJoinInfo(/*document_id=*/1, /*joinable_property_id=*/20)),
+ IsOkAndHolds(0));
+ EXPECT_THAT(
+ index2->Get(DocJoinInfo(/*document_id=*/3, /*joinable_property_id=*/20)),
+ IsOkAndHolds(2));
+ EXPECT_THAT(
+ index2->Get(DocJoinInfo(/*document_id=*/5, /*joinable_property_id=*/20)),
+ IsOkAndHolds(4));
+}
+
+TEST_F(QualifiedIdTypeJoinableIndexTest,
+ InitializationShouldSucceedAfterDestruction) {
+ {
+ // Create new qualified id type joinable index
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<QualifiedIdTypeJoinableIndex> index,
+ QualifiedIdTypeJoinableIndex::Create(filesystem_, working_path_));
+
+ // Insert some data.
+ ICING_ASSERT_OK(
+ index->Put(DocJoinInfo(/*document_id=*/1, /*joinable_property_id=*/20),
+ /*ref_document_id=*/0));
+ ICING_ASSERT_OK(
+ index->Put(DocJoinInfo(/*document_id=*/3, /*joinable_property_id=*/20),
+ /*ref_document_id=*/2));
+ ICING_ASSERT_OK(
+ index->Put(DocJoinInfo(/*document_id=*/5, /*joinable_property_id=*/20),
+ /*ref_document_id=*/4));
+ ASSERT_THAT(index, Pointee(SizeIs(3)));
+ }  // `index` is destroyed here without an explicit PersistToDisk call
+
+ {
+ // The previous instance went out of scope and was destructed. Although we
+ // didn't call PersistToDisk explicitly, the destructor should invoke it and
+ // thus initializing another instance on the same files should succeed, and
+ // we should be able to get the same contents.
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<QualifiedIdTypeJoinableIndex> index,
+ QualifiedIdTypeJoinableIndex::Create(filesystem_, working_path_));
+ EXPECT_THAT(index, Pointee(SizeIs(3)));
+ EXPECT_THAT(index->Get(DocJoinInfo(/*document_id=*/1,
+ /*joinable_property_id=*/20)),
+ IsOkAndHolds(0));
+ EXPECT_THAT(index->Get(DocJoinInfo(/*document_id=*/3,
+ /*joinable_property_id=*/20)),
+ IsOkAndHolds(2));
+ EXPECT_THAT(index->Get(DocJoinInfo(/*document_id=*/5,
+ /*joinable_property_id=*/20)),
+ IsOkAndHolds(4));
+ }
+}
+
+TEST_F(QualifiedIdTypeJoinableIndexTest,
+ InitializeExistingFilesWithDifferentMagicShouldFail) {
+ {
+ // Create new qualified id type joinable index
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<QualifiedIdTypeJoinableIndex> index,
+ QualifiedIdTypeJoinableIndex::Create(filesystem_, working_path_));
+ ICING_ASSERT_OK(
+ index->Put(DocJoinInfo(/*document_id=*/1, /*joinable_property_id=*/20),
+ /*ref_document_id=*/0));
+
+ ICING_ASSERT_OK(index->PersistToDisk());
+ }
+
+ {
+ // Open the metadata file and read its current contents into a buffer.
+ const std::string metadata_file_path = absl_ports::StrCat(
+ working_path_, "/", QualifiedIdTypeJoinableIndex::kFilePrefix, ".m");
+ ScopedFd metadata_sfd(filesystem_.OpenForWrite(metadata_file_path.c_str()));
+ ASSERT_THAT(metadata_sfd.is_valid(), IsTrue());
+
+ auto metadata_buffer = std::make_unique<uint8_t[]>(
+ QualifiedIdTypeJoinableIndex::kMetadataFileSize);
+ ASSERT_THAT(
+ filesystem_.PRead(metadata_sfd.get(), metadata_buffer.get(),
+ QualifiedIdTypeJoinableIndex::kMetadataFileSize,
+ /*offset=*/0),
+ IsTrue());
+
+ // Manually change magic and update checksums so that only the magic check can fail.
+ Crcs* crcs = reinterpret_cast<Crcs*>(
+ metadata_buffer.get() +
+ QualifiedIdTypeJoinableIndex::kCrcsMetadataBufferOffset);
+ Info* info = reinterpret_cast<Info*>(
+ metadata_buffer.get() +
+ QualifiedIdTypeJoinableIndex::kInfoMetadataBufferOffset);
+ info->magic += kCorruptedValueOffset;
+ crcs->component_crcs.info_crc = info->ComputeChecksum().Get();  // keep info_crc consistent with the modified info
+ crcs->all_crc = crcs->component_crcs.ComputeChecksum().Get();  // keep all_crc consistent with the new info_crc
+ ASSERT_THAT(filesystem_.PWrite(
+ metadata_sfd.get(), /*offset=*/0, metadata_buffer.get(),
+ QualifiedIdTypeJoinableIndex::kMetadataFileSize),
+ IsTrue());
+ }
+
+ // Attempt to create the qualified id type joinable index with different
+ // magic. This should fail.
+ EXPECT_THAT(QualifiedIdTypeJoinableIndex::Create(filesystem_, working_path_),
+ StatusIs(libtextclassifier3::StatusCode::FAILED_PRECONDITION,
+ HasSubstr("Incorrect magic value")));
+}
+
+TEST_F(QualifiedIdTypeJoinableIndexTest,
+ InitializeExistingFilesWithWrongAllCrcShouldFail) {
+ {
+ // Create new qualified id type joinable index
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<QualifiedIdTypeJoinableIndex> index,
+ QualifiedIdTypeJoinableIndex::Create(filesystem_, working_path_));
+ ICING_ASSERT_OK(
+ index->Put(DocJoinInfo(/*document_id=*/1, /*joinable_property_id=*/20),
+ /*ref_document_id=*/0));
+
+ ICING_ASSERT_OK(index->PersistToDisk());
+ }
+
+ {
+ const std::string metadata_file_path = absl_ports::StrCat(
+ working_path_, "/", QualifiedIdTypeJoinableIndex::kFilePrefix, ".m");
+ ScopedFd metadata_sfd(filesystem_.OpenForWrite(metadata_file_path.c_str()));
+ ASSERT_THAT(metadata_sfd.is_valid(), IsTrue());
+
+ auto metadata_buffer = std::make_unique<uint8_t[]>(
+ QualifiedIdTypeJoinableIndex::kMetadataFileSize);
+ ASSERT_THAT(
+ filesystem_.PRead(metadata_sfd.get(), metadata_buffer.get(),
+ QualifiedIdTypeJoinableIndex::kMetadataFileSize,
+ /*offset=*/0),
+ IsTrue());
+
+ // Manually corrupt all_crc; the component crcs and info are left untouched.
+ Crcs* crcs = reinterpret_cast<Crcs*>(
+ metadata_buffer.get() +
+ QualifiedIdTypeJoinableIndex::kCrcsMetadataBufferOffset);
+ crcs->all_crc += kCorruptedValueOffset;
+
+ ASSERT_THAT(filesystem_.PWrite(
+ metadata_sfd.get(), /*offset=*/0, metadata_buffer.get(),
+ QualifiedIdTypeJoinableIndex::kMetadataFileSize),
+ IsTrue());  // write the modified buffer back over the whole metadata file
+ }
+
+ // Attempt to create the qualified id type joinable index with metadata
+ // containing corrupted all_crc. This should fail.
+ EXPECT_THAT(QualifiedIdTypeJoinableIndex::Create(filesystem_, working_path_),
+ StatusIs(libtextclassifier3::StatusCode::FAILED_PRECONDITION,
+ HasSubstr("Invalid all crc")));
+}
+
+TEST_F(QualifiedIdTypeJoinableIndexTest,
+ InitializeExistingFilesWithCorruptedInfoShouldFail) {
+ {
+ // Create new qualified id type joinable index
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<QualifiedIdTypeJoinableIndex> index,
+ QualifiedIdTypeJoinableIndex::Create(filesystem_, working_path_));
+ ICING_ASSERT_OK(
+ index->Put(DocJoinInfo(/*document_id=*/1, /*joinable_property_id=*/20),
+ /*ref_document_id=*/0));
+
+ ICING_ASSERT_OK(index->PersistToDisk());
+ }
+
+ {
+ const std::string metadata_file_path = absl_ports::StrCat(
+ working_path_, "/", QualifiedIdTypeJoinableIndex::kFilePrefix, ".m");
+ ScopedFd metadata_sfd(filesystem_.OpenForWrite(metadata_file_path.c_str()));
+ ASSERT_THAT(metadata_sfd.is_valid(), IsTrue());
+
+ auto metadata_buffer = std::make_unique<uint8_t[]>(
+ QualifiedIdTypeJoinableIndex::kMetadataFileSize);
+ ASSERT_THAT(
+ filesystem_.PRead(metadata_sfd.get(), metadata_buffer.get(),
+ QualifiedIdTypeJoinableIndex::kMetadataFileSize,
+ /*offset=*/0),
+ IsTrue());
+
+ // Modify info, but don't update the checksum. This would be similar to
+ // corruption of info.
+ Info* info = reinterpret_cast<Info*>(
+ metadata_buffer.get() +
+ QualifiedIdTypeJoinableIndex::kInfoMetadataBufferOffset);
+ info->last_added_document_id += kCorruptedValueOffset;  // stored info_crc and all_crc are deliberately left stale
+
+ ASSERT_THAT(filesystem_.PWrite(
+ metadata_sfd.get(), /*offset=*/0, metadata_buffer.get(),
+ QualifiedIdTypeJoinableIndex::kMetadataFileSize),
+ IsTrue());
+ }
+
+ // Attempt to create the qualified id type joinable index with info that
+ // doesn't match its checksum. This should fail.
+ EXPECT_THAT(QualifiedIdTypeJoinableIndex::Create(filesystem_, working_path_),
+ StatusIs(libtextclassifier3::StatusCode::FAILED_PRECONDITION,
+ HasSubstr("Invalid info crc")));
+}
+
+TEST_F(
+ QualifiedIdTypeJoinableIndexTest,
+ InitializeExistingFilesWithCorruptedDocumentToQualifiedIdMapperShouldFail) {
+ {
+ // Create new qualified id type joinable index
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<QualifiedIdTypeJoinableIndex> index,
+ QualifiedIdTypeJoinableIndex::Create(filesystem_, working_path_));
+ ICING_ASSERT_OK(
+ index->Put(DocJoinInfo(/*document_id=*/1, /*joinable_property_id=*/20),
+ /*ref_document_id=*/0));
+
+ ICING_ASSERT_OK(index->PersistToDisk());
+ }
+
+ {
+ // Corrupt document_to_qualified_id_mapper manually by opening it directly and adding a key behind the index's back.
+ std::string mapper_working_path = absl_ports::StrCat(
+ working_path_, "/", QualifiedIdTypeJoinableIndex::kFilePrefix,
+ "_mapper");
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<PersistentHashMapKeyMapper<DocumentId>> mapper,
+ PersistentHashMapKeyMapper<DocumentId>::Create(
+ filesystem_, std::move(mapper_working_path)));
+ ICING_ASSERT_OK_AND_ASSIGN(Crc32 old_crc, mapper->ComputeChecksum());
+ ICING_ASSERT_OK(mapper->Put("foo", 12345));
+ ICING_ASSERT_OK(mapper->PersistToDisk());
+ ICING_ASSERT_OK_AND_ASSIGN(Crc32 new_crc, mapper->ComputeChecksum());
+ ASSERT_THAT(old_crc, Not(Eq(new_crc)));  // sanity check: the extra key really changed the mapper checksum
+ }
+
+ // Attempt to create the qualified id type joinable index with corrupted
+ // document_to_qualified_id_mapper. This should fail.
+ EXPECT_THAT(QualifiedIdTypeJoinableIndex::Create(filesystem_, working_path_),
+ StatusIs(libtextclassifier3::StatusCode::FAILED_PRECONDITION,
+ HasSubstr("Invalid storages crc")));
+}
+
+TEST_F(QualifiedIdTypeJoinableIndexTest, InvalidPut) {
+ // Create new qualified id type joinable index
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<QualifiedIdTypeJoinableIndex> index,
+ QualifiedIdTypeJoinableIndex::Create(filesystem_, working_path_));
+
+ DocJoinInfo default_invalid;  // default-constructed; the test treats this as an invalid DocJoinInfo
+ EXPECT_THAT(index->Put(default_invalid, /*ref_document_id=*/0),
+ StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
+}
+
+TEST_F(QualifiedIdTypeJoinableIndexTest, InvalidGet) {
+ // Create new qualified id type joinable index
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<QualifiedIdTypeJoinableIndex> index,
+ QualifiedIdTypeJoinableIndex::Create(filesystem_, working_path_));
+
+ DocJoinInfo default_invalid;  // default-constructed; the test treats this as an invalid DocJoinInfo
+ EXPECT_THAT(index->Get(default_invalid),
+ StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
+}
+
+TEST_F(QualifiedIdTypeJoinableIndexTest, PutAndGet) {
+ DocJoinInfo target_info1(/*document_id=*/1, /*joinable_property_id=*/20);
+ DocumentId ref_document1 = 0;
+
+ DocJoinInfo target_info2(/*document_id=*/3, /*joinable_property_id=*/13);
+ DocumentId ref_document2 = 2;
+
+ DocJoinInfo target_info3(/*document_id=*/4, /*joinable_property_id=*/4);
+ DocumentId ref_document3 = ref_document1;  // two different DocJoinInfos referencing the same document
+
+ {
+ // Create new qualified id type joinable index
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<QualifiedIdTypeJoinableIndex> index,
+ QualifiedIdTypeJoinableIndex::Create(filesystem_, working_path_));
+
+ EXPECT_THAT(index->Put(target_info1, /*ref_document_id=*/ref_document1),
+ IsOk());
+ EXPECT_THAT(index->Put(target_info2, /*ref_document_id=*/ref_document2),
+ IsOk());
+ EXPECT_THAT(index->Put(target_info3, /*ref_document_id=*/ref_document3),
+ IsOk());
+ EXPECT_THAT(index, Pointee(SizeIs(3)));
+
+ EXPECT_THAT(index->Get(target_info1), IsOkAndHolds(ref_document1));
+ EXPECT_THAT(index->Get(target_info2), IsOkAndHolds(ref_document2));
+ EXPECT_THAT(index->Get(target_info3), IsOkAndHolds(ref_document3));
+
+ ICING_ASSERT_OK(index->PersistToDisk());
+ }
+
+ // Verify we can get all of them after destructing and re-initializing.
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<QualifiedIdTypeJoinableIndex> index,
+ QualifiedIdTypeJoinableIndex::Create(filesystem_, working_path_));
+ EXPECT_THAT(index, Pointee(SizeIs(3)));
+ EXPECT_THAT(index->Get(target_info1), IsOkAndHolds(ref_document1));
+ EXPECT_THAT(index->Get(target_info2), IsOkAndHolds(ref_document2));
+ EXPECT_THAT(index->Get(target_info3), IsOkAndHolds(ref_document3));
+}
+
+TEST_F(QualifiedIdTypeJoinableIndexTest,
+ GetShouldReturnNotFoundErrorIfNotExist) {
+ DocJoinInfo target_info(/*document_id=*/1, /*joinable_property_id=*/20);
+ DocumentId ref_document = 0;
+
+ // Create new qualified id type joinable index
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<QualifiedIdTypeJoinableIndex> index,
+ QualifiedIdTypeJoinableIndex::Create(filesystem_, working_path_));
+
+ // Verify entry is not found in the beginning.
+ EXPECT_THAT(index->Get(target_info),
+ StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
+
+ ICING_ASSERT_OK(index->Put(target_info, /*ref_document_id=*/ref_document));
+ ASSERT_THAT(index->Get(target_info), IsOkAndHolds(ref_document));
+
+ // Get another non-existing entry. This should return a NOT_FOUND status.
+ DocJoinInfo another_target_info(/*document_id=*/2,
+ /*joinable_property_id=*/20);  // same joinable property id, different document id
+ EXPECT_THAT(index->Get(another_target_info),
+ StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
+}
+
+TEST_F(QualifiedIdTypeJoinableIndexTest, SetLastAddedDocumentId) {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<QualifiedIdTypeJoinableIndex> index,
+ QualifiedIdTypeJoinableIndex::Create(filesystem_, working_path_));
+
+ EXPECT_THAT(index->last_added_document_id(), Eq(kInvalidDocumentId));  // a freshly created index has no last added document
+
+ constexpr DocumentId kDocumentId = 100;
+ index->set_last_added_document_id(kDocumentId);
+ EXPECT_THAT(index->last_added_document_id(), Eq(kDocumentId));
+
+ constexpr DocumentId kNextDocumentId = 123;  // greater than the current value, so the setter should accept it
+ index->set_last_added_document_id(kNextDocumentId);
+ EXPECT_THAT(index->last_added_document_id(), Eq(kNextDocumentId));
+}
+
+TEST_F(
+ QualifiedIdTypeJoinableIndexTest,
+ SetLastAddedDocumentIdShouldIgnoreNewDocumentIdNotGreaterThanTheCurrent) {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<QualifiedIdTypeJoinableIndex> index,
+ QualifiedIdTypeJoinableIndex::Create(filesystem_, working_path_));
+
+ constexpr DocumentId kDocumentId = 123;
+ index->set_last_added_document_id(kDocumentId);
+ ASSERT_THAT(index->last_added_document_id(), Eq(kDocumentId));
+
+ constexpr DocumentId kNextDocumentId = 100;
+ ASSERT_THAT(kNextDocumentId, Lt(kDocumentId));  // guard the test's own premise: the second id is smaller
+ index->set_last_added_document_id(kNextDocumentId);
+ // last_added_document_id() should remain unchanged because the setter ignores ids that are not greater.
+ EXPECT_THAT(index->last_added_document_id(), Eq(kDocumentId));
+}
+
+TEST_F(QualifiedIdTypeJoinableIndexTest, Optimize) {
+  ICING_ASSERT_OK_AND_ASSIGN(
+      std::unique_ptr<QualifiedIdTypeJoinableIndex> index,
+      QualifiedIdTypeJoinableIndex::Create(filesystem_, working_path_));
+
+  // Seed five entries, each mapping (document id, joinable property id) to a
+  // referenced document id.
+  DocJoinInfo info_doc3(/*document_id=*/3, /*joinable_property_id=*/10);
+  DocJoinInfo info_doc5(/*document_id=*/5, /*joinable_property_id=*/3);
+  DocJoinInfo info_doc8(/*document_id=*/8, /*joinable_property_id=*/9);
+  DocJoinInfo info_doc13(/*document_id=*/13, /*joinable_property_id=*/4);
+  DocJoinInfo info_doc21(/*document_id=*/21, /*joinable_property_id=*/12);
+  ICING_ASSERT_OK(index->Put(info_doc3, /*ref_document_id=*/0));
+  ICING_ASSERT_OK(index->Put(info_doc5, /*ref_document_id=*/0));
+  ICING_ASSERT_OK(index->Put(info_doc8, /*ref_document_id=*/2));
+  ICING_ASSERT_OK(index->Put(info_doc13, /*ref_document_id=*/12));
+  ICING_ASSERT_OK(index->Put(info_doc21, /*ref_document_id=*/12));
+  index->set_last_added_document_id(21);
+
+  ASSERT_THAT(index, Pointee(SizeIs(5)));
+
+  // Old document ids in use: 0, 2, 3, 5, 8, 12, 13, 21.
+  // Documents 2 and 5 are deleted; the survivors are renumbered compactly in
+  // ascending order (0->0, 3->1, 8->2, 12->3, 13->4, 21->5).
+  std::vector<DocumentId> document_id_old_to_new(22, kInvalidDocumentId);
+  DocumentId next_new_document_id = 0;
+  for (DocumentId surviving_old_id : {0, 3, 8, 12, 13, 21}) {
+    document_id_old_to_new[surviving_old_id] = next_new_document_id++;
+  }
+
+  constexpr DocumentId kNewLastAddedDocumentId = 5;
+  EXPECT_THAT(index->Optimize(document_id_old_to_new, kNewLastAddedDocumentId),
+              IsOk());
+  EXPECT_THAT(index, Pointee(SizeIs(3)));
+  EXPECT_THAT(index->last_added_document_id(), Eq(kNewLastAddedDocumentId));
+
+  // Check every seeded entry through Get() using the remapped ids.
+  //
+  // Old (doc 3, property 10) -> ref 0 becomes (doc 1, property 10) -> ref 0.
+  EXPECT_THAT(
+      index->Get(DocJoinInfo(/*document_id=*/1, /*joinable_property_id=*/10)),
+      IsOkAndHolds(0));
+
+  // Old (doc 5, property 3) -> ref 0 is gone because doc 5 was deleted. There
+  // is no new id for old doc 5, so no valid DocJoinInfo can be built to probe
+  // it through Get().
+
+  // Old (doc 8, property 9) -> ref 2 would become (doc 2, property 9), but the
+  // referenced old doc 2 was deleted, so the entry must not survive.
+  EXPECT_THAT(
+      index->Get(DocJoinInfo(/*document_id=*/2, /*joinable_property_id=*/9)),
+      StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
+
+  // Old (doc 13, property 4) -> ref 12 becomes (doc 4, property 4) -> ref 3.
+  EXPECT_THAT(
+      index->Get(DocJoinInfo(/*document_id=*/4, /*joinable_property_id=*/4)),
+      IsOkAndHolds(3));
+
+  // Old (doc 21, property 12) -> ref 12 becomes (doc 5, property 12) -> ref 3.
+  EXPECT_THAT(
+      index->Get(DocJoinInfo(/*document_id=*/5, /*joinable_property_id=*/12)),
+      IsOkAndHolds(3));
+
+  // The index must keep accepting new data after Optimize().
+  DocJoinInfo info_after_optimize(/*document_id=*/99,
+                                  /*joinable_property_id=*/2);
+  ICING_ASSERT_OK(index->Put(info_after_optimize, /*ref_document_id=*/90));
+  index->set_last_added_document_id(99);
+
+  EXPECT_THAT(index, Pointee(SizeIs(4)));
+  EXPECT_THAT(index->last_added_document_id(), Eq(99));
+  EXPECT_THAT(index->Get(DocJoinInfo(/*document_id=*/99,
+                                     /*joinable_property_id=*/2)),
+              IsOkAndHolds(90));
+}
+
+TEST_F(QualifiedIdTypeJoinableIndexTest, OptimizeOutOfRangeDocumentId) {
+  ICING_ASSERT_OK_AND_ASSIGN(
+      std::unique_ptr<QualifiedIdTypeJoinableIndex> index,
+      QualifiedIdTypeJoinableIndex::Create(filesystem_, working_path_));
+
+  DocJoinInfo info_doc99(/*document_id=*/99, /*joinable_property_id=*/10);
+  ICING_ASSERT_OK(index->Put(info_doc99, /*ref_document_id=*/91));
+  index->set_last_added_document_id(99);
+
+  // The mapping has a single slot, so the document ids stored above (99 and
+  // 91) are far beyond its end. Optimize must cope with such out of range ids
+  // instead of indexing past the vector.
+  std::vector<DocumentId> document_id_old_to_new(1, kInvalidDocumentId);
+
+  EXPECT_THAT(
+      index->Optimize(document_id_old_to_new,
+                      /*new_last_added_document_id=*/kInvalidDocumentId),
+      IsOk());
+  EXPECT_THAT(index->last_added_document_id(), Eq(kInvalidDocumentId));
+
+  // Every entry must have been dropped by Optimize().
+  EXPECT_THAT(index, Pointee(IsEmpty()));
+}
+
+TEST_F(QualifiedIdTypeJoinableIndexTest, OptimizeDeleteAll) {
+  ICING_ASSERT_OK_AND_ASSIGN(
+      std::unique_ptr<QualifiedIdTypeJoinableIndex> index,
+      QualifiedIdTypeJoinableIndex::Create(filesystem_, working_path_));
+
+  // Seed the index with five entries.
+  DocJoinInfo info_doc3(/*document_id=*/3, /*joinable_property_id=*/10);
+  DocJoinInfo info_doc5(/*document_id=*/5, /*joinable_property_id=*/3);
+  DocJoinInfo info_doc8(/*document_id=*/8, /*joinable_property_id=*/9);
+  DocJoinInfo info_doc13(/*document_id=*/13, /*joinable_property_id=*/4);
+  DocJoinInfo info_doc21(/*document_id=*/21, /*joinable_property_id=*/12);
+  ICING_ASSERT_OK(index->Put(info_doc3, /*ref_document_id=*/0));
+  ICING_ASSERT_OK(index->Put(info_doc5, /*ref_document_id=*/0));
+  ICING_ASSERT_OK(index->Put(info_doc8, /*ref_document_id=*/2));
+  ICING_ASSERT_OK(index->Put(info_doc13, /*ref_document_id=*/12));
+  ICING_ASSERT_OK(index->Put(info_doc21, /*ref_document_id=*/12));
+  index->set_last_added_document_id(21);
+
+  // Map every old document id to kInvalidDocumentId, i.e. delete everything.
+  std::vector<DocumentId> document_id_old_to_new(22, kInvalidDocumentId);
+
+  EXPECT_THAT(
+      index->Optimize(document_id_old_to_new,
+                      /*new_last_added_document_id=*/kInvalidDocumentId),
+      IsOk());
+  EXPECT_THAT(index->last_added_document_id(), Eq(kInvalidDocumentId));
+
+  // Every entry must have been dropped by Optimize().
+  EXPECT_THAT(index, Pointee(IsEmpty()));
+}
+
+TEST_F(QualifiedIdTypeJoinableIndexTest, Clear) {
+  // Keys that are stored before Clear() and must be gone afterwards.
+  DocJoinInfo cleared_info_a(/*document_id=*/1, /*joinable_property_id=*/20);
+  DocJoinInfo cleared_info_b(/*document_id=*/3, /*joinable_property_id=*/5);
+  DocJoinInfo cleared_info_c(/*document_id=*/6, /*joinable_property_id=*/13);
+  // Key that is stored only after Clear().
+  DocJoinInfo info_after_clear(/*document_id=*/2, /*joinable_property_id=*/19);
+
+  // Create a new qualified id type joinable index and populate it.
+  ICING_ASSERT_OK_AND_ASSIGN(
+      std::unique_ptr<QualifiedIdTypeJoinableIndex> index,
+      QualifiedIdTypeJoinableIndex::Create(filesystem_, working_path_));
+  ICING_ASSERT_OK(index->Put(cleared_info_a, /*ref_document_id=*/0));
+  ICING_ASSERT_OK(index->Put(cleared_info_b, /*ref_document_id=*/2));
+  ICING_ASSERT_OK(index->Put(cleared_info_c, /*ref_document_id=*/4));
+  ASSERT_THAT(index, Pointee(SizeIs(3)));
+  index->set_last_added_document_id(6);
+  ASSERT_THAT(index->last_added_document_id(), Eq(6));
+
+  // Clear() must drop all previously added data and put
+  // last_added_document_id back to kInvalidDocumentId.
+  EXPECT_THAT(index->Clear(), IsOk());
+  EXPECT_THAT(index, Pointee(IsEmpty()));
+  EXPECT_THAT(index->last_added_document_id(), Eq(kInvalidDocumentId));
+  EXPECT_THAT(index->Get(cleared_info_a),
+              StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
+  EXPECT_THAT(index->Get(cleared_info_b),
+              StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
+  EXPECT_THAT(index->Get(cleared_info_c),
+              StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
+
+  // The index must keep accepting new data after Clear().
+  ICING_ASSERT_OK(index->Put(info_after_clear, /*ref_document_id=*/0));
+  index->set_last_added_document_id(2);
+
+  EXPECT_THAT(index->last_added_document_id(), Eq(2));
+  EXPECT_THAT(index->Get(info_after_clear), IsOkAndHolds(0));
+
+  // Persist, destroy and recreate the index from the same working path.
+  ICING_ASSERT_OK(index->PersistToDisk());
+  index.reset();
+
+  ICING_ASSERT_OK_AND_ASSIGN(
+      index, QualifiedIdTypeJoinableIndex::Create(filesystem_, working_path_));
+
+  // Only the data added after Clear() may be visible in the reloaded index.
+  EXPECT_THAT(index->last_added_document_id(), Eq(2));
+  EXPECT_THAT(index->Get(cleared_info_a),
+              StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
+  EXPECT_THAT(index->Get(cleared_info_b),
+              StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
+  EXPECT_THAT(index->Get(cleared_info_c),
+              StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
+  EXPECT_THAT(index->Get(info_after_clear), IsOkAndHolds(0));
+}
+
+} // namespace
+
+} // namespace lib
+} // namespace icing
diff --git a/icing/query/advanced_query_parser/lexer_test.cc b/icing/query/advanced_query_parser/lexer_test.cc
index c6d215c..ec0e663 100644
--- a/icing/query/advanced_query_parser/lexer_test.cc
+++ b/icing/query/advanced_query_parser/lexer_test.cc
@@ -661,6 +661,19 @@ TEST(LexerTest, WhiteSpacesDoNotAffectColonTokenization) {
EqualsLexerToken("h", Lexer::TokenType::TEXT)));
}
+// In a query such as foo:bar:baz, "bar:baz" is only lexed as a single TEXT
+// token when the second colon is explicitly escaped, i.e. foo:bar\:baz.
+TEST(LexerTest, ColonInTextRequiresExplicitEscaping) {
+  auto lexer = std::make_unique<Lexer>("foo:bar\\:baz", Lexer::Language::QUERY);
+  ICING_ASSERT_OK_AND_ASSIGN(std::vector<Lexer::LexerToken> lexed_tokens,
+                             lexer->ExtractTokens());
+  // Only the unescaped colon acts as a comparator.
+  EXPECT_THAT(
+      lexed_tokens,
+      ElementsAre(EqualsLexerToken("foo", Lexer::TokenType::TEXT),
+                  EqualsLexerToken(":", Lexer::TokenType::COMPARATOR),
+                  EqualsLexerToken("bar:baz", Lexer::TokenType::TEXT)));
+}
+
TEST(LexerTest, QueryShouldRejectTokensBeyondLimit) {
std::string query;
for (int i = 0; i < Lexer::kMaxNumTokens + 1; ++i) {
diff --git a/icing/query/advanced_query_parser/query-visitor.cc b/icing/query/advanced_query_parser/query-visitor.cc
index 9df1264..a1a9c38 100644
--- a/icing/query/advanced_query_parser/query-visitor.cc
+++ b/icing/query/advanced_query_parser/query-visitor.cc
@@ -344,8 +344,10 @@ QueryVisitor::PopPendingIterator() {
return CreateTermIterator(std::move(string_value));
} else {
ICING_ASSIGN_OR_RETURN(QueryTerm text_value, PopPendingTextValue());
- ICING_ASSIGN_OR_RETURN(std::unique_ptr<Tokenizer::Iterator> token_itr,
- tokenizer_.Tokenize(text_value.term));
+ ICING_ASSIGN_OR_RETURN(
+ std::unique_ptr<Tokenizer::Iterator> token_itr,
+ tokenizer_.Tokenize(text_value.term,
+ LanguageSegmenter::AccessType::kForwardIterator));
std::string normalized_term;
std::vector<std::unique_ptr<DocHitInfoIterator>> iterators;
// The tokenizer will produce 1+ tokens out of the text. The prefix operator
diff --git a/icing/result/snippet-retriever.cc b/icing/result/snippet-retriever.cc
index 1176eaf..a94775d 100644
--- a/icing/result/snippet-retriever.cc
+++ b/icing/result/snippet-retriever.cc
@@ -488,7 +488,10 @@ void GetEntriesFromProperty(const PropertyProto* current_property,
current_property->string_values_size(), /*index=*/i, property_path));
std::string_view value = current_property->string_values(i);
std::unique_ptr<Tokenizer::Iterator> iterator =
- tokenizer->Tokenize(value).ValueOrDie();
+ tokenizer
+ ->Tokenize(value,
+ LanguageSegmenter::AccessType::kBidirectionalIterator)
+ .ValueOrDie();
// All iterators are moved through positions sequentially. Constructing them
// each time resets them to the beginning of the string. This means that,
// for t tokens and in a string of n chars, each MoveToUtf8 call from the
diff --git a/icing/schema/joinable-property-manager.cc b/icing/schema/joinable-property-manager.cc
index 5f8f7b8..3977b6b 100644
--- a/icing/schema/joinable-property-manager.cc
+++ b/icing/schema/joinable-property-manager.cc
@@ -16,6 +16,8 @@
#include <memory>
#include <string>
+#include <string_view>
+#include <utility>
#include <vector>
#include "icing/text_classifier/lib3/utils/base/status.h"
@@ -36,15 +38,16 @@ namespace {
// Helper function to append a new joinable property metadata
libtextclassifier3::Status AppendNewJoinablePropertyMetadata(
- std::vector<JoinablePropertyMetadata>* metadata_list,
+ JoinablePropertyManager::JoinablePropertyMetadataListWrapper*
+ metadata_list_wrapper,
std::string&& concatenated_path,
PropertyConfigProto::DataType::Code data_type,
JoinableConfig::ValueType::Code value_type) {
// Validates next joinable property id, makes sure that joinable property id
// is the same as the list index so that we could find any joinable property
// metadata by id in O(1) later.
- JoinablePropertyId new_id =
- static_cast<JoinablePropertyId>(metadata_list->size());
+ JoinablePropertyId new_id = static_cast<JoinablePropertyId>(
+ metadata_list_wrapper->metadata_list.size());
if (!IsJoinablePropertyIdValid(new_id)) {
// Max number of joinable properties reached
return absl_ports::OutOfRangeError(
@@ -54,8 +57,10 @@ libtextclassifier3::Status AppendNewJoinablePropertyMetadata(
}
// Creates joinable property metadata
- metadata_list->push_back(JoinablePropertyMetadata(
+ metadata_list_wrapper->metadata_list.push_back(JoinablePropertyMetadata(
new_id, data_type, value_type, std::move(concatenated_path)));
+ metadata_list_wrapper->property_path_to_id_map.insert(
+ {metadata_list_wrapper->metadata_list.back().path, new_id});
return libtextclassifier3::Status::OK;
}
@@ -84,7 +89,8 @@ JoinablePropertyManager::Builder::ProcessSchemaTypePropertyConfig(
SchemaTypeId schema_type_id, const PropertyConfigProto& property_config,
std::string&& property_path) {
if (schema_type_id < 0 ||
- schema_type_id >= joinable_property_metadata_cache_.size()) {
+ schema_type_id >=
+ static_cast<int64_t>(joinable_property_metadata_cache_.size())) {
return absl_ports::InvalidArgumentError("Invalid schema type id");
}
@@ -139,10 +145,33 @@ JoinablePropertyManager::ExtractJoinableProperties(
libtextclassifier3::StatusOr<const JoinablePropertyMetadata*>
JoinablePropertyManager::GetJoinablePropertyMetadata(
+    SchemaTypeId schema_type_id, const std::string& property_path) const {
+  // schema_type_id is used as an index into the per-type cache, so it must
+  // lie within [0, cache size).
+  const int64_t num_schema_types =
+      static_cast<int64_t>(joinable_property_metadata_cache_.size());
+  if (schema_type_id < 0 || schema_type_id >= num_schema_types) {
+    return absl_ports::InvalidArgumentError("Invalid schema type id");
+  }
+
+  // Reverse lookup: property path -> JoinablePropertyId.
+  const auto& wrapper = joinable_property_metadata_cache_[schema_type_id];
+  const auto path_itr = wrapper.property_path_to_id_map.find(property_path);
+  if (path_itr == wrapper.property_path_to_id_map.end()) {
+    return absl_ports::NotFoundError(
+        "Property path is not joinable or doesn't exist");
+  }
+
+  // The id found in the map is used directly as the index into metadata_list.
+  const JoinablePropertyId joinable_property_id = path_itr->second;
+  return &wrapper.metadata_list[joinable_property_id];
+}
+
+libtextclassifier3::StatusOr<const JoinablePropertyMetadata*>
+JoinablePropertyManager::GetJoinablePropertyMetadata(
SchemaTypeId schema_type_id,
JoinablePropertyId joinable_property_id) const {
if (schema_type_id < 0 ||
- schema_type_id >= joinable_property_metadata_cache_.size()) {
+ schema_type_id >=
+ static_cast<int64_t>(joinable_property_metadata_cache_.size())) {
return absl_ports::InvalidArgumentError("Invalid schema type id");
}
if (!IsJoinablePropertyIdValid(joinable_property_id)) {
@@ -150,9 +179,9 @@ JoinablePropertyManager::GetJoinablePropertyMetadata(
"Invalid joinable property id %d", joinable_property_id));
}
- const std::vector<JoinablePropertyMetadata>& joinable_property_metadatas =
- joinable_property_metadata_cache_[schema_type_id];
- if (joinable_property_id >= joinable_property_metadatas.size()) {
+ const std::vector<JoinablePropertyMetadata>& metadata_list =
+ joinable_property_metadata_cache_[schema_type_id].metadata_list;
+ if (joinable_property_id >= metadata_list.size()) {
return absl_ports::InvalidArgumentError(IcingStringUtil::StringPrintf(
"Joinable property with id %d doesn't exist in type config id %d",
joinable_property_id, schema_type_id));
@@ -160,7 +189,7 @@ JoinablePropertyManager::GetJoinablePropertyMetadata(
// The index of metadata list is the same as the joinable property id, so we
// can use joinable property id as the index.
- return &joinable_property_metadatas[joinable_property_id];
+ return &metadata_list[joinable_property_id];
}
libtextclassifier3::StatusOr<const std::vector<JoinablePropertyMetadata>*>
@@ -168,7 +197,7 @@ JoinablePropertyManager::GetMetadataList(
const std::string& type_config_name) const {
ICING_ASSIGN_OR_RETURN(SchemaTypeId schema_type_id,
schema_type_mapper_.Get(type_config_name));
- return &joinable_property_metadata_cache_.at(schema_type_id);
+ return &joinable_property_metadata_cache_.at(schema_type_id).metadata_list;
}
} // namespace lib
diff --git a/icing/schema/joinable-property-manager.h b/icing/schema/joinable-property-manager.h
index a175ae4..c7038ce 100644
--- a/icing/schema/joinable-property-manager.h
+++ b/icing/schema/joinable-property-manager.h
@@ -17,6 +17,7 @@
#include <memory>
#include <string>
+#include <unordered_map>
+#include <utility>
#include <vector>
#include "icing/text_classifier/lib3/utils/base/status.h"
@@ -34,6 +35,13 @@ namespace lib {
// from documents.
class JoinablePropertyManager {
public:
+ // Per schema type bundle: the vector of joinable property metadata plus a
+ // reverse lookup map from property path to JoinablePropertyId.
+ struct JoinablePropertyMetadataListWrapper {
+ std::vector<JoinablePropertyMetadata> metadata_list;
+ std::unordered_map<std::string, JoinablePropertyId> property_path_to_id_map;
+ };
+
// Builder class to create a JoinablePropertyManager which does not take
// ownership of any input components, and all pointers must refer to valid
// objects that outlive the created JoinablePropertyManager instance.
@@ -66,7 +74,7 @@ class JoinablePropertyManager {
private:
const KeyMapper<SchemaTypeId>& schema_type_mapper_; // Does not own.
- std::vector<std::vector<JoinablePropertyMetadata>>
+ std::vector<JoinablePropertyMetadataListWrapper>
joinable_property_metadata_cache_;
};
@@ -87,11 +95,23 @@ class JoinablePropertyManager {
libtextclassifier3::StatusOr<JoinablePropertyGroup> ExtractJoinableProperties(
const DocumentProto& document) const;
+ // Looks up the JoinablePropertyMetadata of the joinable property located at
+ // property_path within the schema type identified by schema_type_id.
+ //
+ // Returns:
+ // - Valid pointer to JoinablePropertyMetadata on success
+ // - INVALID_ARGUMENT_ERROR if schema type id is invalid
+ // - NOT_FOUND_ERROR if property_path doesn't exist (or is not joinable) in
+ // the joinable metadata list of the schema
+ libtextclassifier3::StatusOr<const JoinablePropertyMetadata*>
+ GetJoinablePropertyMetadata(SchemaTypeId schema_type_id,
+ const std::string& property_path) const;
+
// Returns the JoinablePropertyMetadata associated with the JoinablePropertyId
// that's in the SchemaTypeId.
//
// Returns:
- // - Pointer to JoinablePropertyMetadata on success
+ // - Valid pointer to JoinablePropertyMetadata on success
// - INVALID_ARGUMENT_ERROR if schema type id or JoinablePropertyId is
// invalid
libtextclassifier3::StatusOr<const JoinablePropertyMetadata*>
@@ -108,7 +128,7 @@ class JoinablePropertyManager {
private:
explicit JoinablePropertyManager(
const KeyMapper<SchemaTypeId>& schema_type_mapper,
- std::vector<std::vector<JoinablePropertyMetadata>>&&
+ std::vector<JoinablePropertyMetadataListWrapper>&&
joinable_property_metadata_cache)
: schema_type_mapper_(schema_type_mapper),
joinable_property_metadata_cache_(joinable_property_metadata_cache) {}
@@ -117,16 +137,20 @@ class JoinablePropertyManager {
const KeyMapper<SchemaTypeId>& schema_type_mapper_; // Does not own
// The index of joinable_property_metadata_cache_ corresponds to a schema
- // type's SchemaTypeId. At that SchemaTypeId index, we store an inner vector.
- // The inner vector's index corresponds to a joinable property's
- // JoinablePropertyId. At the JoinablePropertyId index, we store the
- // JoinablePropertyMetadata of that joinable property.
+ // type's SchemaTypeId. At that SchemaTypeId index, we store a
+ // JoinablePropertyMetadataListWrapper instance. The metadata list's index
+ // corresponds to a joinable property's JoinablePropertyId. At the
+ // JoinablePropertyId index, we store the JoinablePropertyMetadata of that
+ // joinable property.
//
// For example, suppose "email" has a SchemaTypeId of 0 and it has a joinable
// property called "senderQualifiedId" with a JoinablePropertyId of 1. Then
// the "senderQualifiedId" property's JoinablePropertyMetadata will be at
- // joinable_property_metadata_cache_[0][1].
- const std::vector<std::vector<JoinablePropertyMetadata>>
+ // joinable_property_metadata_cache_[0].metadata_list[1], and
+ // joinable_property_metadata_cache_[0]
+ // .property_path_to_id_map["senderQualifiedId"]
+ // will be 1.
+ const std::vector<JoinablePropertyMetadataListWrapper>
joinable_property_metadata_cache_;
};
diff --git a/icing/schema/joinable-property-manager_test.cc b/icing/schema/joinable-property-manager_test.cc
index 495c254..d9a3841 100644
--- a/icing/schema/joinable-property-manager_test.cc
+++ b/icing/schema/joinable-property-manager_test.cc
@@ -410,6 +410,94 @@ TEST_F(JoinablePropertyManagerTest,
StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
}
+TEST_F(JoinablePropertyManagerTest, GetJoinablePropertyMetadataByPath) {
+  // The JoinablePropertyManager under test is obtained through the
+  // SchemaTypeManager factory method.
+  ICING_ASSERT_OK_AND_ASSIGN(
+      std::unique_ptr<SchemaTypeManager> schema_type_manager,
+      SchemaTypeManager::Create(type_config_map_, schema_type_mapper_.get()));
+  const auto& manager = schema_type_manager->joinable_property_manager();
+
+  // Email (joinable property id -> joinable property path):
+  //   0 -> receiverQualifiedId
+  //   1 -> senderQualifiedId
+  EXPECT_THAT(
+      manager.GetJoinablePropertyMetadata(/*schema_type_id=*/0,
+                                          "receiverQualifiedId"),
+      IsOkAndHolds(Pointee(EqualsJoinablePropertyMetadata(
+          /*expected_id=*/0, /*expected_property_path=*/"receiverQualifiedId",
+          CreateReceiverQualifiedIdPropertyConfig()))));
+  EXPECT_THAT(
+      manager.GetJoinablePropertyMetadata(/*schema_type_id=*/0,
+                                          "senderQualifiedId"),
+      IsOkAndHolds(Pointee(EqualsJoinablePropertyMetadata(
+          /*expected_id=*/1, /*expected_property_path=*/"senderQualifiedId",
+          CreateSenderQualifiedIdPropertyConfig()))));
+
+  // Conversation (joinable property id -> joinable property path):
+  //   0 -> emails.receiverQualifiedId
+  //   1 -> emails.senderQualifiedId
+  //   2 -> groupQualifiedId
+  EXPECT_THAT(manager.GetJoinablePropertyMetadata(
+                  /*schema_type_id=*/1, "emails.receiverQualifiedId"),
+              IsOkAndHolds(Pointee(EqualsJoinablePropertyMetadata(
+                  /*expected_id=*/0,
+                  /*expected_property_path=*/"emails.receiverQualifiedId",
+                  CreateReceiverQualifiedIdPropertyConfig()))));
+  EXPECT_THAT(manager.GetJoinablePropertyMetadata(/*schema_type_id=*/1,
+                                                  "emails.senderQualifiedId"),
+              IsOkAndHolds(Pointee(EqualsJoinablePropertyMetadata(
+                  /*expected_id=*/1,
+                  /*expected_property_path=*/"emails.senderQualifiedId",
+                  CreateSenderQualifiedIdPropertyConfig()))));
+  EXPECT_THAT(
+      manager.GetJoinablePropertyMetadata(/*schema_type_id=*/1,
+                                          "groupQualifiedId"),
+      IsOkAndHolds(Pointee(EqualsJoinablePropertyMetadata(
+          /*expected_id=*/2, /*expected_property_path=*/"groupQualifiedId",
+          CreateGroupQualifiedIdPropertyConfig()))));
+}
+
+TEST_F(JoinablePropertyManagerTest,
+       GetJoinablePropertyMetadataByPathInvalidSchemaTypeId) {
+  // The JoinablePropertyManager under test is obtained through the
+  // SchemaTypeManager factory method.
+  ICING_ASSERT_OK_AND_ASSIGN(
+      std::unique_ptr<SchemaTypeManager> schema_type_manager,
+      SchemaTypeManager::Create(type_config_map_, schema_type_mapper_.get()));
+  // Two schema types are configured, which is why id 2 is probed as the
+  // too-large case below.
+  ASSERT_THAT(type_config_map_, SizeIs(2));
+  const auto& manager = schema_type_manager->joinable_property_manager();
+
+  // Negative schema type id.
+  EXPECT_THAT(manager.GetJoinablePropertyMetadata(/*schema_type_id=*/-1,
+                                                  "receiverQualifiedId"),
+              StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
+  // Schema type id equal to the number of configured types.
+  EXPECT_THAT(manager.GetJoinablePropertyMetadata(/*schema_type_id=*/2,
+                                                  "receiverQualifiedId"),
+              StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
+}
+
+TEST_F(JoinablePropertyManagerTest, GetJoinablePropertyMetadataByPathNotExist) {
+  // The JoinablePropertyManager under test is obtained through the
+  // SchemaTypeManager factory method.
+  ICING_ASSERT_OK_AND_ASSIGN(
+      std::unique_ptr<SchemaTypeManager> schema_type_manager,
+      SchemaTypeManager::Create(type_config_map_, schema_type_mapper_.get()));
+  const auto& manager = schema_type_manager->joinable_property_manager();
+
+  // Valid schema type ids combined with unknown property paths.
+  EXPECT_THAT(manager.GetJoinablePropertyMetadata(/*schema_type_id=*/0,
+                                                  "nonExistingPath"),
+              StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
+  EXPECT_THAT(manager.GetJoinablePropertyMetadata(/*schema_type_id=*/1,
+                                                  "emails.nonExistingPath"),
+              StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
+}
+
// Note: valid GetMetadataList has been tested in
// JoinablePropertyManagerBuildTest.
TEST_F(JoinablePropertyManagerTest, GetMetadataListInvalidSchemaTypeName) {
diff --git a/icing/schema/schema-store.cc b/icing/schema/schema-store.cc
index 79ec49a..0e0c917 100644
--- a/icing/schema/schema-store.cc
+++ b/icing/schema/schema-store.cc
@@ -35,6 +35,7 @@
#include "icing/proto/logging.pb.h"
#include "icing/proto/schema.pb.h"
#include "icing/proto/storage.pb.h"
+#include "icing/schema/joinable-property.h"
#include "icing/schema/schema-type-manager.h"
#include "icing/schema/schema-util.h"
#include "icing/schema/section.h"
@@ -533,6 +534,21 @@ libtextclassifier3::StatusOr<SectionGroup> SchemaStore::ExtractSections(
return schema_type_manager_->section_manager().ExtractSections(document);
}
+libtextclassifier3::StatusOr<const JoinablePropertyMetadata*>
+SchemaStore::GetJoinablePropertyMetadata(
+    SchemaTypeId schema_type_id, const std::string& property_path) const {
+  // Propagate any error from CheckSchemaSet() before touching the manager.
+  ICING_RETURN_IF_ERROR(CheckSchemaSet());
+  const auto& joinable_property_manager =
+      schema_type_manager_->joinable_property_manager();
+  return joinable_property_manager.GetJoinablePropertyMetadata(schema_type_id,
+                                                               property_path);
+}
+
+libtextclassifier3::StatusOr<JoinablePropertyGroup>
+SchemaStore::ExtractJoinableProperties(const DocumentProto& document) const {
+  // Propagate any error from CheckSchemaSet() before touching the manager.
+  ICING_RETURN_IF_ERROR(CheckSchemaSet());
+  const auto& joinable_property_manager =
+      schema_type_manager_->joinable_property_manager();
+  return joinable_property_manager.ExtractJoinableProperties(document);
+}
+
libtextclassifier3::Status SchemaStore::PersistToDisk() {
if (!has_schema_successfully_set_) {
return libtextclassifier3::Status::OK;
diff --git a/icing/schema/schema-store.h b/icing/schema/schema-store.h
index 8b85fc8..601d22a 100644
--- a/icing/schema/schema-store.h
+++ b/icing/schema/schema-store.h
@@ -31,6 +31,7 @@
#include "icing/proto/logging.pb.h"
#include "icing/proto/schema.pb.h"
#include "icing/proto/storage.pb.h"
+#include "icing/schema/joinable-property.h"
#include "icing/schema/schema-type-manager.h"
#include "icing/schema/schema-util.h"
#include "icing/schema/section.h"
@@ -196,16 +197,16 @@ class SchemaStore {
// SchemaTypeId.
//
// Returns:
- // pointer to SectionMetadata on success
+ // Valid pointer to SectionMetadata on success
// FAILED_PRECONDITION if schema hasn't been set yet
- // INVALID_ARGUMENT if schema type id or section is invalid
+ // INVALID_ARGUMENT if schema type id or section id is invalid
libtextclassifier3::StatusOr<const SectionMetadata*> GetSectionMetadata(
SchemaTypeId schema_type_id, SectionId section_id) const;
// Extracts all sections of different types from the given document and group
// them by type.
// - Each Section vector is sorted by section Id in ascending order. The
- // sorted section Ids may not be continuous, since not all section Ids are
+ // sorted section ids may not be continuous, since not all sections are
// present in the document.
// - Sections with empty content won't be returned.
// - For example, we may extract:
@@ -219,6 +220,34 @@ class SchemaStore {
libtextclassifier3::StatusOr<SectionGroup> ExtractSections(
const DocumentProto& document) const;
+ // Looks up the JoinablePropertyMetadata of the joinable property located at
+ // property_path within the schema type identified by schema_type_id.
+ //
+ // Returns:
+ // Valid pointer to JoinablePropertyMetadata on success
+ // FAILED_PRECONDITION if schema hasn't been set yet
+ // INVALID_ARGUMENT if schema type id is invalid
+ // NOT_FOUND if property_path doesn't exist (or is not joinable) in the
+ // joinable metadata list of the schema
+ libtextclassifier3::StatusOr<const JoinablePropertyMetadata*>
+ GetJoinablePropertyMetadata(SchemaTypeId schema_type_id,
+ const std::string& property_path) const;
+
+ // Extracts all joinable property contents of different types from the given
+ // document and groups them by joinable value type.
+ // - Joinable properties are sorted by joinable property id in ascending
+ // order. The sorted joinable property ids may not be continuous, since not
+ // all joinable properties are present in the document.
+ // - Joinable property ids start from 0.
+ // - Joinable properties with empty content won't be returned.
+ //
+ // Returns:
+ // A JoinablePropertyGroup instance on success
+ // FAILED_PRECONDITION if schema hasn't been set yet
+ // NOT_FOUND if the type config name of the document is not found
+ libtextclassifier3::StatusOr<JoinablePropertyGroup> ExtractJoinableProperties(
+ const DocumentProto& document) const;
+
// Syncs all the data changes to disk.
//
// Returns:
diff --git a/icing/schema/schema-store_test.cc b/icing/schema/schema-store_test.cc
index 749fcaa..4e2724f 100644
--- a/icing/schema/schema-store_test.cc
+++ b/icing/schema/schema-store_test.cc
@@ -320,6 +320,9 @@ TEST_F(SchemaStoreTest, CreateNoPreviousSchemaOk) {
StatusIs(libtextclassifier3::StatusCode::FAILED_PRECONDITION));
EXPECT_THAT(store->GetSectionMetadata(/*schema_type_id=*/0, /*section_id=*/0),
StatusIs(libtextclassifier3::StatusCode::FAILED_PRECONDITION));
+ EXPECT_THAT(store->GetJoinablePropertyMetadata(/*schema_type_id=*/0,
+ /*property_path=*/"A"),
+ StatusIs(libtextclassifier3::StatusCode::FAILED_PRECONDITION));
// The apis to extract content from a document should fail gracefully.
DocumentProto doc;
@@ -329,6 +332,8 @@ TEST_F(SchemaStoreTest, CreateNoPreviousSchemaOk) {
EXPECT_THAT(store->ExtractSections(doc),
StatusIs(libtextclassifier3::StatusCode::FAILED_PRECONDITION));
+ EXPECT_THAT(store->ExtractJoinableProperties(doc),
+ StatusIs(libtextclassifier3::StatusCode::FAILED_PRECONDITION));
// The apis to persist and checksum data should succeed.
EXPECT_THAT(store->ComputeChecksum(), IsOkAndHolds(Crc32()));
diff --git a/icing/tokenization/icu/icu-language-segmenter.cc b/icing/tokenization/icu/icu-language-segmenter.cc
index dc7b0a4..fd790cf 100644
--- a/icing/tokenization/icu/icu-language-segmenter.cc
+++ b/icing/tokenization/icu/icu-language-segmenter.cc
@@ -325,14 +325,15 @@ IcuLanguageSegmenter::IcuLanguageSegmenter(std::string locale)
: locale_(std::move(locale)) {}
libtextclassifier3::StatusOr<std::unique_ptr<LanguageSegmenter::Iterator>>
-IcuLanguageSegmenter::Segment(const std::string_view text) const {
+IcuLanguageSegmenter::Segment(const std::string_view text,
+ LanguageSegmenter::AccessType) const {
return IcuLanguageSegmenterIterator::Create(text, locale_);
}
libtextclassifier3::StatusOr<std::vector<std::string_view>>
IcuLanguageSegmenter::GetAllTerms(const std::string_view text) const {
ICING_ASSIGN_OR_RETURN(std::unique_ptr<LanguageSegmenter::Iterator> iterator,
- Segment(text));
+ IcuLanguageSegmenterIterator::Create(text, locale_));
std::vector<std::string_view> terms;
while (iterator->Advance()) {
terms.push_back(iterator->GetTerm());
diff --git a/icing/tokenization/icu/icu-language-segmenter.h b/icing/tokenization/icu/icu-language-segmenter.h
index 4115461..f9cfbcb 100644
--- a/icing/tokenization/icu/icu-language-segmenter.h
+++ b/icing/tokenization/icu/icu-language-segmenter.h
@@ -55,7 +55,7 @@ class IcuLanguageSegmenter : public LanguageSegmenter {
// An iterator of terms on success
// INTERNAL_ERROR if any error occurs
libtextclassifier3::StatusOr<std::unique_ptr<LanguageSegmenter::Iterator>>
- Segment(std::string_view text) const override;
+ Segment(std::string_view text, LanguageSegmenter::AccessType) const override;
// The segmentation depends on the language detected in the input text.
//
diff --git a/icing/tokenization/icu/icu-language-segmenter_test.cc b/icing/tokenization/icu/icu-language-segmenter_test.cc
index 6771050..c88b992 100644
--- a/icing/tokenization/icu/icu-language-segmenter_test.cc
+++ b/icing/tokenization/icu/icu-language-segmenter_test.cc
@@ -419,8 +419,10 @@ TEST_P(IcuLanguageSegmenterAllLocalesTest, ContinuousWhitespaces) {
// iterator is done.
text_with_spaces = absl_ports::StrCat(std::string(kNumSeparators, ' '),
"Hello", " ", "World");
- ICING_ASSERT_OK_AND_ASSIGN(auto itr,
- language_segmenter->Segment(text_with_spaces));
+ ICING_ASSERT_OK_AND_ASSIGN(
+ auto itr,
+ language_segmenter->Segment(
+ text_with_spaces, LanguageSegmenter::AccessType::kForwardIterator));
std::vector<std::string_view> terms;
while (itr->Advance()) {
terms.push_back(itr->GetTerm());
@@ -516,8 +518,10 @@ TEST_P(IcuLanguageSegmenterAllLocalesTest, ResetToStartUtf32WordConnector) {
auto segmenter, language_segmenter_factory::Create(
GetSegmenterOptions(GetLocale(), jni_cache_.get())));
constexpr std::string_view kText = "com.google.android is package";
- ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<LanguageSegmenter::Iterator> itr,
- segmenter->Segment(kText));
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<LanguageSegmenter::Iterator> itr,
+ segmenter->Segment(kText,
+ LanguageSegmenter::AccessType::kForwardIterator));
// String: "com.google.android is package"
// ^ ^^ ^^
@@ -533,8 +537,10 @@ TEST_P(IcuLanguageSegmenterAllLocalesTest, NewIteratorResetToStartUtf32) {
auto segmenter, language_segmenter_factory::Create(
GetSegmenterOptions(GetLocale(), jni_cache_.get())));
constexpr std::string_view kText = "How are you你好吗お元気ですか";
- ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<LanguageSegmenter::Iterator> itr,
- segmenter->Segment(kText));
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<LanguageSegmenter::Iterator> itr,
+ segmenter->Segment(kText,
+ LanguageSegmenter::AccessType::kForwardIterator));
// String: "How are you你好吗お元気ですか"
// ^ ^^ ^^ ^ ^ ^ ^ ^ ^
@@ -550,8 +556,10 @@ TEST_P(IcuLanguageSegmenterAllLocalesTest,
auto segmenter, language_segmenter_factory::Create(
GetSegmenterOptions(GetLocale(), jni_cache_.get())));
constexpr std::string_view kText = "How are you你好吗お元気ですか";
- ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<LanguageSegmenter::Iterator> itr,
- segmenter->Segment(kText));
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<LanguageSegmenter::Iterator> itr,
+ segmenter->Segment(kText,
+ LanguageSegmenter::AccessType::kForwardIterator));
// String: "How are you你好吗お元気ですか"
// ^ ^^ ^^ ^ ^ ^ ^ ^ ^
@@ -568,8 +576,10 @@ TEST_P(IcuLanguageSegmenterAllLocalesTest,
auto segmenter, language_segmenter_factory::Create(
GetSegmenterOptions(GetLocale(), jni_cache_.get())));
constexpr std::string_view kText = "How are you你好吗お元気ですか";
- ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<LanguageSegmenter::Iterator> itr,
- segmenter->Segment(kText));
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<LanguageSegmenter::Iterator> itr,
+ segmenter->Segment(kText,
+ LanguageSegmenter::AccessType::kForwardIterator));
// String: "How are you你好吗お元気ですか"
// ^ ^^ ^^ ^ ^ ^ ^ ^ ^
@@ -588,8 +598,10 @@ TEST_P(IcuLanguageSegmenterAllLocalesTest, IteratorDoneResetToStartUtf32) {
auto segmenter, language_segmenter_factory::Create(
GetSegmenterOptions(GetLocale(), jni_cache_.get())));
constexpr std::string_view kText = "How are you你好吗お元気ですか";
- ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<LanguageSegmenter::Iterator> itr,
- segmenter->Segment(kText));
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<LanguageSegmenter::Iterator> itr,
+ segmenter->Segment(kText,
+ LanguageSegmenter::AccessType::kForwardIterator));
// String: "How are you你好吗お元気ですか"
// ^ ^^ ^^ ^ ^ ^ ^ ^ ^
@@ -607,8 +619,10 @@ TEST_P(IcuLanguageSegmenterAllLocalesTest, ResetToTermAfterUtf32WordConnector) {
auto segmenter, language_segmenter_factory::Create(
GetSegmenterOptions(GetLocale(), jni_cache_.get())));
constexpr std::string_view kText = "package com.google.android name";
- ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<LanguageSegmenter::Iterator> itr,
- segmenter->Segment(kText));
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<LanguageSegmenter::Iterator> itr,
+ segmenter->Segment(kText,
+ LanguageSegmenter::AccessType::kForwardIterator));
// String: "package com.google.android name"
// ^ ^^ ^^
@@ -630,8 +644,10 @@ TEST_P(IcuLanguageSegmenterAllLocalesTest, ResetToTermAfterUtf32OutOfBounds) {
auto segmenter, language_segmenter_factory::Create(
GetSegmenterOptions(GetLocale(), jni_cache_.get())));
constexpr std::string_view kText = "How are you你好吗お元気ですか";
- ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<LanguageSegmenter::Iterator> itr,
- segmenter->Segment(kText));
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<LanguageSegmenter::Iterator> itr,
+ segmenter->Segment(kText,
+ LanguageSegmenter::AccessType::kForwardIterator));
// String: "How are you你好吗お元気ですか"
// ^ ^^ ^^ ^ ^ ^ ^ ^ ^
@@ -661,13 +677,15 @@ TEST_P(IcuLanguageSegmenterAllLocalesTest,
constexpr std::string_view kText = "How are𡔖 you你好吗お元気ですか";
ICING_ASSERT_OK_AND_ASSIGN(
std::unique_ptr<LanguageSegmenter::Iterator> advance_itr,
- segmenter->Segment(kText));
+ segmenter->Segment(kText,
+ LanguageSegmenter::AccessType::kForwardIterator));
std::vector<std::string_view> advance_terms =
GetAllTermsAdvance(advance_itr.get());
ICING_ASSERT_OK_AND_ASSIGN(
std::unique_ptr<LanguageSegmenter::Iterator> reset_to_term_itr,
- segmenter->Segment(kText));
+ segmenter->Segment(kText,
+ LanguageSegmenter::AccessType::kForwardIterator));
std::vector<std::string_view> reset_terms =
GetAllTermsResetAfterUtf32(reset_to_term_itr.get());
@@ -683,13 +701,15 @@ TEST_P(IcuLanguageSegmenterAllLocalesTest,
constexpr std::string_view kThai = "ฉันเดินไปทำงานทุกวัน";
ICING_ASSERT_OK_AND_ASSIGN(
std::unique_ptr<LanguageSegmenter::Iterator> advance_itr,
- segmenter->Segment(kThai));
+ segmenter->Segment(kThai,
+ LanguageSegmenter::AccessType::kForwardIterator));
std::vector<std::string_view> advance_terms =
GetAllTermsAdvance(advance_itr.get());
ICING_ASSERT_OK_AND_ASSIGN(
std::unique_ptr<LanguageSegmenter::Iterator> reset_to_term_itr,
- segmenter->Segment(kThai));
+ segmenter->Segment(kThai,
+ LanguageSegmenter::AccessType::kForwardIterator));
std::vector<std::string_view> reset_terms =
GetAllTermsResetAfterUtf32(reset_to_term_itr.get());
@@ -705,13 +725,15 @@ TEST_P(IcuLanguageSegmenterAllLocalesTest,
constexpr std::string_view kKorean = "나는 매일 출근합니다.";
ICING_ASSERT_OK_AND_ASSIGN(
std::unique_ptr<LanguageSegmenter::Iterator> advance_itr,
- segmenter->Segment(kKorean));
+ segmenter->Segment(kKorean,
+ LanguageSegmenter::AccessType::kForwardIterator));
std::vector<std::string_view> advance_terms =
GetAllTermsAdvance(advance_itr.get());
ICING_ASSERT_OK_AND_ASSIGN(
std::unique_ptr<LanguageSegmenter::Iterator> reset_to_term_itr,
- segmenter->Segment(kKorean));
+ segmenter->Segment(kKorean,
+ LanguageSegmenter::AccessType::kForwardIterator));
std::vector<std::string_view> reset_terms =
GetAllTermsResetAfterUtf32(reset_to_term_itr.get());
@@ -731,13 +753,15 @@ TEST_P(IcuLanguageSegmenterAllLocalesTest,
constexpr std::string_view kText = "How are𡔖 you你好吗お元気ですか";
ICING_ASSERT_OK_AND_ASSIGN(
std::unique_ptr<LanguageSegmenter::Iterator> advance_itr,
- segmenter->Segment(kText));
+ segmenter->Segment(kText,
+ LanguageSegmenter::AccessType::kForwardIterator));
std::vector<std::string_view> advance_terms =
GetAllTermsAdvance(advance_itr.get());
ICING_ASSERT_OK_AND_ASSIGN(
std::unique_ptr<LanguageSegmenter::Iterator> advance_and_reset_itr,
- segmenter->Segment(kText));
+ segmenter->Segment(kText,
+ LanguageSegmenter::AccessType::kForwardIterator));
std::vector<std::string_view> advance_and_reset_terms =
GetAllTermsAdvanceAndResetAfterUtf32(advance_and_reset_itr.get());
@@ -754,13 +778,15 @@ TEST_P(IcuLanguageSegmenterAllLocalesTest,
constexpr std::string_view kThai = "ฉันเดินไปทำงานทุกวัน";
ICING_ASSERT_OK_AND_ASSIGN(
std::unique_ptr<LanguageSegmenter::Iterator> advance_itr,
- segmenter->Segment(kThai));
+ segmenter->Segment(kThai,
+ LanguageSegmenter::AccessType::kForwardIterator));
std::vector<std::string_view> advance_terms =
GetAllTermsAdvance(advance_itr.get());
ICING_ASSERT_OK_AND_ASSIGN(
std::unique_ptr<LanguageSegmenter::Iterator> advance_and_reset_itr,
- segmenter->Segment(kThai));
+ segmenter->Segment(kThai,
+ LanguageSegmenter::AccessType::kForwardIterator));
std::vector<std::string_view> advance_and_reset_terms =
GetAllTermsAdvanceAndResetAfterUtf32(advance_and_reset_itr.get());
@@ -777,13 +803,15 @@ TEST_P(IcuLanguageSegmenterAllLocalesTest,
constexpr std::string_view kKorean = "나는 매일 출근합니다.";
ICING_ASSERT_OK_AND_ASSIGN(
std::unique_ptr<LanguageSegmenter::Iterator> advance_itr,
- segmenter->Segment(kKorean));
+ segmenter->Segment(kKorean,
+ LanguageSegmenter::AccessType::kForwardIterator));
std::vector<std::string_view> advance_terms =
GetAllTermsAdvance(advance_itr.get());
ICING_ASSERT_OK_AND_ASSIGN(
std::unique_ptr<LanguageSegmenter::Iterator> advance_and_reset_itr,
- segmenter->Segment(kKorean));
+ segmenter->Segment(kKorean,
+ LanguageSegmenter::AccessType::kForwardIterator));
std::vector<std::string_view> advance_and_reset_terms =
GetAllTermsAdvanceAndResetAfterUtf32(advance_and_reset_itr.get());
@@ -800,7 +828,9 @@ TEST_P(IcuLanguageSegmenterAllLocalesTest,
GetSegmenterOptions(GetLocale(), jni_cache_.get())));
ICING_ASSERT_OK_AND_ASSIGN(
std::unique_ptr<LanguageSegmenter::Iterator> itr,
- language_segmenter->Segment("How are you你好吗お元気ですか"));
+ language_segmenter->Segment(
+ "How are you你好吗お元気ですか",
+ LanguageSegmenter::AccessType::kForwardIterator));
// String: "How are you你好吗お元気ですか"
// ^ ^^ ^^ ^ ^ ^ ^ ^ ^
@@ -837,8 +867,10 @@ TEST_P(IcuLanguageSegmenterAllLocalesTest,
GetSegmenterOptions(GetLocale(), jni_cache_.get())));
// Multiple continuous whitespaces are treated as one.
constexpr std::string_view kTextWithSpace = "Hello World";
- ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<LanguageSegmenter::Iterator> itr,
- language_segmenter->Segment(kTextWithSpace));
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<LanguageSegmenter::Iterator> itr,
+ language_segmenter->Segment(
+ kTextWithSpace, LanguageSegmenter::AccessType::kForwardIterator));
// String: "Hello World"
// ^ ^ ^
@@ -877,8 +909,10 @@ TEST_P(IcuLanguageSegmenterAllLocalesTest, ChineseResetToTermAfterUtf32) {
// CJKT (Chinese, Japanese, Khmer, Thai) are the 4 main languages that
// don't have whitespaces as word delimiter. Chinese
constexpr std::string_view kChinese = "我每天走路去上班。";
- ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<LanguageSegmenter::Iterator> itr,
- language_segmenter->Segment(kChinese));
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<LanguageSegmenter::Iterator> itr,
+ language_segmenter->Segment(
+ kChinese, LanguageSegmenter::AccessType::kForwardIterator));
// String: "我每天走路去上班。"
// ^ ^ ^ ^^ ^
// UTF-8 idx: 0 3 9 15 18 24
@@ -904,8 +938,10 @@ TEST_P(IcuLanguageSegmenterAllLocalesTest, JapaneseResetToTermAfterUtf32) {
GetSegmenterOptions(GetLocale(), jni_cache_.get())));
// Japanese
constexpr std::string_view kJapanese = "私は毎日仕事に歩いています。";
- ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<LanguageSegmenter::Iterator> itr,
- language_segmenter->Segment(kJapanese));
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<LanguageSegmenter::Iterator> itr,
+ language_segmenter->Segment(
+ kJapanese, LanguageSegmenter::AccessType::kForwardIterator));
// String: "私は毎日仕事に歩いています。"
// ^ ^ ^ ^ ^ ^ ^ ^ ^ ^
// UTF-8 idx: 0 3 6 12 18212427 33 39
@@ -930,8 +966,10 @@ TEST_P(IcuLanguageSegmenterAllLocalesTest, KhmerResetToTermAfterUtf32) {
language_segmenter_factory::Create(
GetSegmenterOptions(GetLocale(), jni_cache_.get())));
constexpr std::string_view kKhmer = "ញុំដើរទៅធ្វើការរាល់ថ្ងៃ។";
- ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<LanguageSegmenter::Iterator> itr,
- language_segmenter->Segment(kKhmer));
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<LanguageSegmenter::Iterator> itr,
+ language_segmenter->Segment(
+ kKhmer, LanguageSegmenter::AccessType::kForwardIterator));
// String: "ញុំដើរទៅធ្វើការរាល់ថ្ងៃ។"
// ^ ^ ^ ^ ^
// UTF-8 idx: 0 9 24 45 69
@@ -957,8 +995,10 @@ TEST_P(IcuLanguageSegmenterAllLocalesTest, ThaiResetToTermAfterUtf32) {
GetSegmenterOptions(GetLocale(), jni_cache_.get())));
// Thai
constexpr std::string_view kThai = "ฉันเดินไปทำงานทุกวัน";
- ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<LanguageSegmenter::Iterator> itr,
- language_segmenter->Segment(kThai));
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<LanguageSegmenter::Iterator> itr,
+ language_segmenter->Segment(
+ kThai, LanguageSegmenter::AccessType::kForwardIterator));
// String: "ฉันเดินไปทำงานทุกวัน"
// ^ ^ ^ ^ ^ ^
// UTF-8 idx: 0 9 21 27 42 51
@@ -983,8 +1023,10 @@ TEST_P(IcuLanguageSegmenterAllLocalesTest,
auto segmenter, language_segmenter_factory::Create(
GetSegmenterOptions(GetLocale(), jni_cache_.get())));
constexpr std::string_view kText = "package name com.google.android!";
- ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<LanguageSegmenter::Iterator> itr,
- segmenter->Segment(kText));
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<LanguageSegmenter::Iterator> itr,
+ segmenter->Segment(kText,
+ LanguageSegmenter::AccessType::kForwardIterator));
// String: "package name com.google.android!"
// ^ ^^ ^^ ^
@@ -1006,8 +1048,10 @@ TEST_P(IcuLanguageSegmenterAllLocalesTest, ResetToTermBeforeOutOfBoundsUtf32) {
auto segmenter, language_segmenter_factory::Create(
GetSegmenterOptions(GetLocale(), jni_cache_.get())));
constexpr std::string_view kText = "How are you你好吗お元気ですか";
- ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<LanguageSegmenter::Iterator> itr,
- segmenter->Segment(kText));
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<LanguageSegmenter::Iterator> itr,
+ segmenter->Segment(kText,
+ LanguageSegmenter::AccessType::kForwardIterator));
// String: "How are you你好吗お元気ですか"
// ^ ^^ ^^ ^ ^ ^ ^ ^ ^
@@ -1037,13 +1081,15 @@ TEST_P(IcuLanguageSegmenterAllLocalesTest,
constexpr std::string_view kText = "How are𡔖 you你好吗お元気ですか";
ICING_ASSERT_OK_AND_ASSIGN(
std::unique_ptr<LanguageSegmenter::Iterator> advance_itr,
- segmenter->Segment(kText));
+ segmenter->Segment(kText,
+ LanguageSegmenter::AccessType::kForwardIterator));
std::vector<std::string_view> advance_terms =
GetAllTermsAdvance(advance_itr.get());
ICING_ASSERT_OK_AND_ASSIGN(
std::unique_ptr<LanguageSegmenter::Iterator> reset_to_term_itr,
- segmenter->Segment(kText));
+ segmenter->Segment(kText,
+ LanguageSegmenter::AccessType::kForwardIterator));
std::vector<std::string_view> reset_terms =
GetAllTermsResetBeforeUtf32(reset_to_term_itr.get());
std::reverse(reset_terms.begin(), reset_terms.end());
@@ -1061,13 +1107,15 @@ TEST_P(IcuLanguageSegmenterAllLocalesTest,
constexpr std::string_view kThai = "ฉันเดินไปทำงานทุกวัน";
ICING_ASSERT_OK_AND_ASSIGN(
std::unique_ptr<LanguageSegmenter::Iterator> advance_itr,
- segmenter->Segment(kThai));
+ segmenter->Segment(kThai,
+ LanguageSegmenter::AccessType::kForwardIterator));
std::vector<std::string_view> advance_terms =
GetAllTermsAdvance(advance_itr.get());
ICING_ASSERT_OK_AND_ASSIGN(
std::unique_ptr<LanguageSegmenter::Iterator> reset_to_term_itr,
- segmenter->Segment(kThai));
+ segmenter->Segment(kThai,
+ LanguageSegmenter::AccessType::kForwardIterator));
std::vector<std::string_view> reset_terms =
GetAllTermsResetBeforeUtf32(reset_to_term_itr.get());
std::reverse(reset_terms.begin(), reset_terms.end());
@@ -1084,13 +1132,15 @@ TEST_P(IcuLanguageSegmenterAllLocalesTest,
constexpr std::string_view kKorean = "나는 매일 출근합니다.";
ICING_ASSERT_OK_AND_ASSIGN(
std::unique_ptr<LanguageSegmenter::Iterator> advance_itr,
- segmenter->Segment(kKorean));
+ segmenter->Segment(kKorean,
+ LanguageSegmenter::AccessType::kForwardIterator));
std::vector<std::string_view> advance_terms =
GetAllTermsAdvance(advance_itr.get());
ICING_ASSERT_OK_AND_ASSIGN(
std::unique_ptr<LanguageSegmenter::Iterator> reset_to_term_itr,
- segmenter->Segment(kKorean));
+ segmenter->Segment(kKorean,
+ LanguageSegmenter::AccessType::kForwardIterator));
std::vector<std::string_view> reset_terms =
GetAllTermsResetBeforeUtf32(reset_to_term_itr.get());
std::reverse(reset_terms.begin(), reset_terms.end());
@@ -1107,7 +1157,9 @@ TEST_P(IcuLanguageSegmenterAllLocalesTest,
GetSegmenterOptions(GetLocale(), jni_cache_.get())));
ICING_ASSERT_OK_AND_ASSIGN(
std::unique_ptr<LanguageSegmenter::Iterator> itr,
- language_segmenter->Segment("How are you你好吗お元気ですか"));
+ language_segmenter->Segment(
+ "How are you你好吗お元気ですか",
+ LanguageSegmenter::AccessType::kForwardIterator));
// String: "How are you你好吗お元気ですか"
// ^ ^^ ^^ ^ ^ ^ ^ ^ ^
@@ -1145,8 +1197,10 @@ TEST_P(IcuLanguageSegmenterAllLocalesTest,
GetSegmenterOptions(GetLocale(), jni_cache_.get())));
// Multiple continuous whitespaces are treated as one.
constexpr std::string_view kTextWithSpace = "Hello World";
- ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<LanguageSegmenter::Iterator> itr,
- language_segmenter->Segment(kTextWithSpace));
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<LanguageSegmenter::Iterator> itr,
+ language_segmenter->Segment(
+ kTextWithSpace, LanguageSegmenter::AccessType::kForwardIterator));
// String: "Hello World"
// ^ ^ ^
@@ -1184,8 +1238,10 @@ TEST_P(IcuLanguageSegmenterAllLocalesTest, ChineseResetToTermBeforeUtf32) {
// CJKT (Chinese, Japanese, Khmer, Thai) are the 4 main languages that
// don't have whitespaces as word delimiter. Chinese
constexpr std::string_view kChinese = "我每天走路去上班。";
- ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<LanguageSegmenter::Iterator> itr,
- language_segmenter->Segment(kChinese));
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<LanguageSegmenter::Iterator> itr,
+ language_segmenter->Segment(
+ kChinese, LanguageSegmenter::AccessType::kForwardIterator));
// String: "我每天走路去上班。"
// ^ ^ ^ ^^
// UTF-8 idx: 0 3 9 15 18
@@ -1208,8 +1264,10 @@ TEST_P(IcuLanguageSegmenterAllLocalesTest, JapaneseResetToTermBeforeUtf32) {
GetSegmenterOptions(GetLocale(), jni_cache_.get())));
// Japanese
constexpr std::string_view kJapanese = "私は毎日仕事に歩いています。";
- ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<LanguageSegmenter::Iterator> itr,
- language_segmenter->Segment(kJapanese));
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<LanguageSegmenter::Iterator> itr,
+ language_segmenter->Segment(
+ kJapanese, LanguageSegmenter::AccessType::kForwardIterator));
// String: "私は毎日仕事に歩いています。"
// ^ ^ ^ ^ ^ ^ ^ ^ ^
// UTF-8 idx: 0 3 6 12 18212427 33
@@ -1231,8 +1289,10 @@ TEST_P(IcuLanguageSegmenterAllLocalesTest, KhmerResetToTermBeforeUtf32) {
language_segmenter_factory::Create(
GetSegmenterOptions(GetLocale(), jni_cache_.get())));
constexpr std::string_view kKhmer = "ញុំដើរទៅធ្វើការរាល់ថ្ងៃ។";
- ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<LanguageSegmenter::Iterator> itr,
- language_segmenter->Segment(kKhmer));
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<LanguageSegmenter::Iterator> itr,
+ language_segmenter->Segment(
+ kKhmer, LanguageSegmenter::AccessType::kForwardIterator));
// String: "ញុំដើរទៅធ្វើការរាល់ថ្ងៃ។"
// ^ ^ ^ ^
// UTF-8 idx: 0 9 24 45
@@ -1255,8 +1315,10 @@ TEST_P(IcuLanguageSegmenterAllLocalesTest, ThaiResetToTermBeforeUtf32) {
GetSegmenterOptions(GetLocale(), jni_cache_.get())));
// Thai
constexpr std::string_view kThai = "ฉันเดินไปทำงานทุกวัน";
- ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<LanguageSegmenter::Iterator> itr,
- language_segmenter->Segment(kThai));
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<LanguageSegmenter::Iterator> itr,
+ language_segmenter->Segment(
+ kThai, LanguageSegmenter::AccessType::kForwardIterator));
// String: "ฉันเดินไปทำงานทุกวัน"
// ^ ^ ^ ^ ^ ^
// UTF-8 idx: 0 9 21 27 42 51
diff --git a/icing/tokenization/language-segmenter-iterator_test.cc b/icing/tokenization/language-segmenter-iterator_test.cc
index 3aff45c..b14ce19 100644
--- a/icing/tokenization/language-segmenter-iterator_test.cc
+++ b/icing/tokenization/language-segmenter-iterator_test.cc
@@ -54,8 +54,10 @@ TEST_F(LanguageSegmenterIteratorTest, AdvanceAndGetTerm) {
ICING_ASSERT_OK_AND_ASSIGN(
auto language_segmenter,
language_segmenter_factory::Create(std::move(options)));
- ICING_ASSERT_OK_AND_ASSIGN(auto iterator,
- language_segmenter->Segment("foo bar"));
+ ICING_ASSERT_OK_AND_ASSIGN(
+ auto iterator,
+ language_segmenter->Segment(
+ "foo bar", LanguageSegmenter::AccessType::kForwardIterator));
EXPECT_TRUE(iterator->Advance());
EXPECT_THAT(iterator->GetTerm(), Eq("foo"));
@@ -76,8 +78,10 @@ TEST_F(LanguageSegmenterIteratorTest,
ICING_ASSERT_OK_AND_ASSIGN(
auto language_segmenter,
language_segmenter_factory::Create(std::move(options)));
- ICING_ASSERT_OK_AND_ASSIGN(auto iterator,
- language_segmenter->Segment("foo bar"));
+ ICING_ASSERT_OK_AND_ASSIGN(
+ auto iterator,
+ language_segmenter->Segment(
+ "foo bar", LanguageSegmenter::AccessType::kBidirectionalIterator));
EXPECT_THAT(iterator->ResetToTermStartingAfterUtf32(/*offset=*/0),
IsOkAndHolds(3)); // The term " "
@@ -94,8 +98,10 @@ TEST_F(LanguageSegmenterIteratorTest,
ICING_ASSERT_OK_AND_ASSIGN(
auto language_segmenter,
language_segmenter_factory::Create(std::move(options)));
- ICING_ASSERT_OK_AND_ASSIGN(auto iterator,
- language_segmenter->Segment("foo bar"));
+ ICING_ASSERT_OK_AND_ASSIGN(
+ auto iterator,
+ language_segmenter->Segment(
+ "foo bar", LanguageSegmenter::AccessType::kBidirectionalIterator));
EXPECT_THAT(iterator->ResetToTermStartingAfterUtf32(/*offset=*/-1), IsOk());
@@ -113,7 +119,10 @@ TEST_F(LanguageSegmenterIteratorTest,
ICING_ASSERT_OK_AND_ASSIGN(
auto language_segmenter,
language_segmenter_factory::Create(std::move(options)));
- ICING_ASSERT_OK_AND_ASSIGN(auto iterator, language_segmenter->Segment(text));
+ ICING_ASSERT_OK_AND_ASSIGN(
+ auto iterator,
+ language_segmenter->Segment(
+ text, LanguageSegmenter::AccessType::kBidirectionalIterator));
EXPECT_THAT(iterator->ResetToTermStartingAfterUtf32(/*offset=*/text.length()),
StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
@@ -127,7 +136,10 @@ TEST_F(LanguageSegmenterIteratorTest,
ICING_ASSERT_OK_AND_ASSIGN(
auto language_segmenter,
language_segmenter_factory::Create(std::move(options)));
- ICING_ASSERT_OK_AND_ASSIGN(auto iterator, language_segmenter->Segment(text));
+ ICING_ASSERT_OK_AND_ASSIGN(
+ auto iterator,
+ language_segmenter->Segment(
+ text, LanguageSegmenter::AccessType::kBidirectionalIterator));
EXPECT_THAT(iterator->ResetToTermStartingAfterUtf32(/*offset=*/100),
StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
@@ -140,8 +152,10 @@ TEST_F(LanguageSegmenterIteratorTest,
ICING_ASSERT_OK_AND_ASSIGN(
auto language_segmenter,
language_segmenter_factory::Create(std::move(options)));
- ICING_ASSERT_OK_AND_ASSIGN(auto iterator,
- language_segmenter->Segment("foo bar"));
+ ICING_ASSERT_OK_AND_ASSIGN(
+ auto iterator,
+ language_segmenter->Segment(
+ "foo bar", LanguageSegmenter::AccessType::kBidirectionalIterator));
EXPECT_THAT(iterator->ResetToTermEndingBeforeUtf32(/*offset=*/6),
IsOkAndHolds(3)); // The term " "
@@ -158,8 +172,10 @@ TEST_F(LanguageSegmenterIteratorTest,
ICING_ASSERT_OK_AND_ASSIGN(
auto language_segmenter,
language_segmenter_factory::Create(std::move(options)));
- ICING_ASSERT_OK_AND_ASSIGN(auto iterator,
- language_segmenter->Segment("foo bar"));
+ ICING_ASSERT_OK_AND_ASSIGN(
+ auto iterator,
+ language_segmenter->Segment(
+ "foo bar", LanguageSegmenter::AccessType::kBidirectionalIterator));
// Zero is a valid argument, but there aren't any terms that end before it.
EXPECT_THAT(iterator->ResetToTermEndingBeforeUtf32(/*offset=*/0),
@@ -173,8 +189,10 @@ TEST_F(LanguageSegmenterIteratorTest,
ICING_ASSERT_OK_AND_ASSIGN(
auto language_segmenter,
language_segmenter_factory::Create(std::move(options)));
- ICING_ASSERT_OK_AND_ASSIGN(auto iterator,
- language_segmenter->Segment("foo bar"));
+ ICING_ASSERT_OK_AND_ASSIGN(
+ auto iterator,
+ language_segmenter->Segment(
+ "foo bar", LanguageSegmenter::AccessType::kBidirectionalIterator));
EXPECT_THAT(iterator->ResetToTermEndingBeforeUtf32(/*offset=*/-1),
StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
@@ -191,7 +209,10 @@ TEST_F(LanguageSegmenterIteratorTest,
ICING_ASSERT_OK_AND_ASSIGN(
auto language_segmenter,
language_segmenter_factory::Create(std::move(options)));
- ICING_ASSERT_OK_AND_ASSIGN(auto iterator, language_segmenter->Segment(text));
+ ICING_ASSERT_OK_AND_ASSIGN(
+ auto iterator,
+ language_segmenter->Segment(
+ text, LanguageSegmenter::AccessType::kBidirectionalIterator));
EXPECT_THAT(iterator->ResetToTermEndingBeforeUtf32(/*offset=*/text.length()),
IsOk());
diff --git a/icing/tokenization/language-segmenter.h b/icing/tokenization/language-segmenter.h
index 913386a..83a47d4 100644
--- a/icing/tokenization/language-segmenter.h
+++ b/icing/tokenization/language-segmenter.h
@@ -38,6 +38,11 @@ namespace lib {
// segmenter->GetAllTerms(text));
class LanguageSegmenter {
public:
+ enum class AccessType {
+ kForwardIterator,
+ kBidirectionalIterator,
+ };
+
virtual ~LanguageSegmenter() = default;
// An iterator helping to find terms in the input text.
@@ -165,7 +170,7 @@ class LanguageSegmenter {
// outlives the returned iterator.
virtual libtextclassifier3::StatusOr<
std::unique_ptr<LanguageSegmenter::Iterator>>
- Segment(std::string_view text) const = 0;
+ Segment(std::string_view text, AccessType access_type) const = 0;
// Segments and returns all terms in the input text.
//
diff --git a/icing/tokenization/language-segmenter_benchmark.cc b/icing/tokenization/language-segmenter_benchmark.cc
index 748a322..50c625e 100644
--- a/icing/tokenization/language-segmenter_benchmark.cc
+++ b/icing/tokenization/language-segmenter_benchmark.cc
@@ -68,7 +68,10 @@ void BM_SegmentNoSpace(benchmark::State& state) {
for (auto _ : state) {
std::unique_ptr<LanguageSegmenter::Iterator> iterator =
- language_segmenter->Segment(input_string).ValueOrDie();
+ language_segmenter
+ ->Segment(input_string,
+ LanguageSegmenter::AccessType::kForwardIterator)
+ .ValueOrDie();
while (iterator->Advance()) {
iterator->GetTerm();
}
@@ -108,7 +111,10 @@ void BM_SegmentWithSpaces(benchmark::State& state) {
for (auto _ : state) {
std::unique_ptr<LanguageSegmenter::Iterator> iterator =
- language_segmenter->Segment(input_string).ValueOrDie();
+ language_segmenter
+ ->Segment(input_string,
+ LanguageSegmenter::AccessType::kForwardIterator)
+ .ValueOrDie();
while (iterator->Advance()) {
iterator->GetTerm();
}
@@ -148,7 +154,10 @@ void BM_SegmentCJK(benchmark::State& state) {
for (auto _ : state) {
std::unique_ptr<LanguageSegmenter::Iterator> iterator =
- language_segmenter->Segment(input_string).ValueOrDie();
+ language_segmenter
+ ->Segment(input_string,
+ LanguageSegmenter::AccessType::kForwardIterator)
+ .ValueOrDie();
while (iterator->Advance()) {
iterator->GetTerm();
}
diff --git a/icing/tokenization/plain-tokenizer.cc b/icing/tokenization/plain-tokenizer.cc
index d40022b..9175f3a 100644
--- a/icing/tokenization/plain-tokenizer.cc
+++ b/icing/tokenization/plain-tokenizer.cc
@@ -130,17 +130,19 @@ class PlainTokenIterator : public Tokenizer::Iterator {
};
libtextclassifier3::StatusOr<std::unique_ptr<Tokenizer::Iterator>>
-PlainTokenizer::Tokenize(std::string_view text) const {
+PlainTokenizer::Tokenize(std::string_view text,
+ LanguageSegmenter::AccessType access_type) const {
ICING_ASSIGN_OR_RETURN(
std::unique_ptr<LanguageSegmenter::Iterator> base_iterator,
- language_segmenter_.Segment(text));
+ language_segmenter_.Segment(text, access_type));
return std::make_unique<PlainTokenIterator>(std::move(base_iterator));
}
libtextclassifier3::StatusOr<std::vector<Token>> PlainTokenizer::TokenizeAll(
std::string_view text) const {
- ICING_ASSIGN_OR_RETURN(std::unique_ptr<Tokenizer::Iterator> iterator,
- Tokenize(text));
+ ICING_ASSIGN_OR_RETURN(
+ std::unique_ptr<Tokenizer::Iterator> iterator,
+ Tokenize(text, LanguageSegmenter::AccessType::kForwardIterator));
std::vector<Token> tokens;
while (iterator->Advance()) {
std::vector<Token> batch_tokens = iterator->GetTokens();
diff --git a/icing/tokenization/plain-tokenizer.h b/icing/tokenization/plain-tokenizer.h
index 25b40fd..61a8b5a 100644
--- a/icing/tokenization/plain-tokenizer.h
+++ b/icing/tokenization/plain-tokenizer.h
@@ -33,7 +33,8 @@ class PlainTokenizer : public Tokenizer {
: language_segmenter_(*language_segmenter) {}
libtextclassifier3::StatusOr<std::unique_ptr<Tokenizer::Iterator>> Tokenize(
- std::string_view text) const override;
+ std::string_view text,
+ LanguageSegmenter::AccessType access_type) const override;
libtextclassifier3::StatusOr<std::vector<Token>> TokenizeAll(
std::string_view text) const override;
diff --git a/icing/tokenization/plain-tokenizer_test.cc b/icing/tokenization/plain-tokenizer_test.cc
index 6c426da..f94a558 100644
--- a/icing/tokenization/plain-tokenizer_test.cc
+++ b/icing/tokenization/plain-tokenizer_test.cc
@@ -25,6 +25,7 @@
#include "icing/testing/jni-test-helpers.h"
#include "icing/testing/test-data.h"
#include "icing/tokenization/language-segmenter-factory.h"
+#include "icing/tokenization/language-segmenter.h"
#include "icing/tokenization/tokenizer-factory.h"
#include "unicode/uloc.h"
@@ -67,8 +68,10 @@ TEST_F(PlainTokenizerTest, NoTokensBeforeAdvancing) {
language_segmenter.get()));
constexpr std::string_view kText = "Hello, world!";
- ICING_ASSERT_OK_AND_ASSIGN(auto token_iterator,
- plain_tokenizer->Tokenize(kText));
+ ICING_ASSERT_OK_AND_ASSIGN(
+ auto token_iterator,
+ plain_tokenizer->Tokenize(
+ kText, LanguageSegmenter::AccessType::kForwardIterator));
// We should get no tokens if we get the token before advancing.
EXPECT_THAT(token_iterator->GetTokens(), IsEmpty());
@@ -86,8 +89,10 @@ TEST_F(PlainTokenizerTest, LastTokenAfterFullyAdvanced) {
language_segmenter.get()));
constexpr std::string_view kText = "Hello, world!";
- ICING_ASSERT_OK_AND_ASSIGN(auto token_iterator,
- plain_tokenizer->Tokenize(kText));
+ ICING_ASSERT_OK_AND_ASSIGN(
+ auto token_iterator,
+ plain_tokenizer->Tokenize(
+ kText, LanguageSegmenter::AccessType::kForwardIterator));
while (token_iterator->Advance()) {}
@@ -344,7 +349,10 @@ TEST_F(PlainTokenizerTest, ResetToTokenStartingAfterSimple) {
language_segmenter.get()));
constexpr std::string_view kText = "f b";
- auto iterator = plain_tokenizer->Tokenize(kText).ValueOrDie();
+ auto iterator =
+ plain_tokenizer
+ ->Tokenize(kText, LanguageSegmenter::AccessType::kBidirectionalIterator)
+ .ValueOrDie();
EXPECT_TRUE(iterator->ResetToTokenStartingAfter(0));
EXPECT_THAT(iterator->GetTokens(),
@@ -365,7 +373,10 @@ TEST_F(PlainTokenizerTest, ResetToTokenEndingBeforeSimple) {
language_segmenter.get()));
constexpr std::string_view kText = "f b";
- auto iterator = plain_tokenizer->Tokenize(kText).ValueOrDie();
+ auto iterator =
+ plain_tokenizer
+ ->Tokenize(kText, LanguageSegmenter::AccessType::kBidirectionalIterator)
+ .ValueOrDie();
EXPECT_TRUE(iterator->ResetToTokenEndingBefore(2));
EXPECT_THAT(iterator->GetTokens(),
@@ -412,7 +423,10 @@ TEST_F(PlainTokenizerTest, ResetToTokenStartingAfter) {
"bat", // 16: " bat"
};
- auto iterator = plain_tokenizer->Tokenize(kText).ValueOrDie();
+ auto iterator =
+ plain_tokenizer
+ ->Tokenize(kText, LanguageSegmenter::AccessType::kBidirectionalIterator)
+ .ValueOrDie();
EXPECT_TRUE(iterator->Advance());
EXPECT_THAT(iterator->GetTokens(),
ElementsAre(EqualsToken(Token::Type::REGULAR, "foo")));
@@ -466,7 +480,10 @@ TEST_F(PlainTokenizerTest, ResetToTokenEndingBefore) {
"foo", // 4: "foo "
};
- auto iterator = plain_tokenizer->Tokenize(kText).ValueOrDie();
+ auto iterator =
+ plain_tokenizer
+ ->Tokenize(kText, LanguageSegmenter::AccessType::kBidirectionalIterator)
+ .ValueOrDie();
EXPECT_TRUE(iterator->Advance());
EXPECT_THAT(iterator->GetTokens(),
ElementsAre(EqualsToken(Token::Type::REGULAR, "foo")));
diff --git a/icing/tokenization/raw-query-tokenizer.cc b/icing/tokenization/raw-query-tokenizer.cc
index 1dcbf9b..aca317c 100644
--- a/icing/tokenization/raw-query-tokenizer.cc
+++ b/icing/tokenization/raw-query-tokenizer.cc
@@ -690,7 +690,8 @@ class RawQueryTokenIterator : public Tokenizer::Iterator {
} // namespace
libtextclassifier3::StatusOr<std::unique_ptr<Tokenizer::Iterator>>
-RawQueryTokenizer::Tokenize(std::string_view text) const {
+RawQueryTokenizer::Tokenize(std::string_view text,
+ LanguageSegmenter::AccessType) const {
ICING_ASSIGN_OR_RETURN(std::vector<Token> tokens, TokenizeAll(text));
return std::make_unique<RawQueryTokenIterator>(std::move(tokens));
}
diff --git a/icing/tokenization/raw-query-tokenizer.h b/icing/tokenization/raw-query-tokenizer.h
index 6316e45..1087b04 100644
--- a/icing/tokenization/raw-query-tokenizer.h
+++ b/icing/tokenization/raw-query-tokenizer.h
@@ -33,7 +33,7 @@ class RawQueryTokenizer : public Tokenizer {
: language_segmenter_(*language_segmenter) {}
libtextclassifier3::StatusOr<std::unique_ptr<Tokenizer::Iterator>> Tokenize(
- std::string_view text) const override;
+ std::string_view text, LanguageSegmenter::AccessType) const override;
libtextclassifier3::StatusOr<std::vector<Token>> TokenizeAll(
std::string_view text) const override;
diff --git a/icing/tokenization/raw-query-tokenizer_test.cc b/icing/tokenization/raw-query-tokenizer_test.cc
index a00f2f7..2044f95 100644
--- a/icing/tokenization/raw-query-tokenizer_test.cc
+++ b/icing/tokenization/raw-query-tokenizer_test.cc
@@ -21,6 +21,7 @@
#include "icing/testing/icu-data-file-helper.h"
#include "icing/testing/test-data.h"
#include "icing/tokenization/language-segmenter-factory.h"
+#include "icing/tokenization/language-segmenter.h"
#include "icing/tokenization/tokenizer-factory.h"
#include "icing/tokenization/tokenizer.h"
#include "unicode/uloc.h"
@@ -60,8 +61,10 @@ TEST_F(RawQueryTokenizerTest, NoTokensBeforeAdvancing) {
language_segmenter.get()));
constexpr std::string_view kText = "Hello, world!";
- ICING_ASSERT_OK_AND_ASSIGN(auto token_iterator,
- raw_query_tokenizer->Tokenize(kText));
+ ICING_ASSERT_OK_AND_ASSIGN(
+ auto token_iterator,
+ raw_query_tokenizer->Tokenize(
+ kText, LanguageSegmenter::AccessType::kForwardIterator));
// We should get no tokens if we get the token before advancing.
EXPECT_THAT(token_iterator->GetTokens(), IsEmpty());
diff --git a/icing/tokenization/reverse_jni/reverse-jni-break-iterator.cc b/icing/tokenization/reverse_jni/reverse-jni-break-iterator.cc
index dbd7f5a..4bb7991 100644
--- a/icing/tokenization/reverse_jni/reverse-jni-break-iterator.cc
+++ b/icing/tokenization/reverse_jni/reverse-jni-break-iterator.cc
@@ -31,18 +31,13 @@
namespace icing {
namespace lib {
-namespace {
-// Chosen based on results in go/reverse-jni-benchmarks
-static constexpr int kBatchSize = 100;
-} // namespace
-
// -----------------------------------------------------------------------------
// Implementations that call out to JVM. Behold the beauty.
// -----------------------------------------------------------------------------
libtextclassifier3::StatusOr<std::unique_ptr<ReverseJniBreakIterator>>
ReverseJniBreakIterator::Create(const JniCache* jni_cache,
- std::string_view text,
- std::string_view locale) {
+ std::string_view text, std::string_view locale,
+ int batch_size) {
if (jni_cache == nullptr) {
return absl_ports::InvalidArgumentError(
"Create must be called with a valid JniCache pointer!");
@@ -90,15 +85,17 @@ ReverseJniBreakIterator::Create(const JniCache* jni_cache,
ICING_RETURN_IF_ERROR(libtextclassifier3::JniHelper::CallVoidMethod(
jenv, iterator_batcher.get(), jni_cache->breakiterator_settext,
java_text.get()));
- return std::unique_ptr<ReverseJniBreakIterator>(
- new ReverseJniBreakIterator(jni_cache, std::move(iterator_batcher)));
+ return std::unique_ptr<ReverseJniBreakIterator>(new ReverseJniBreakIterator(
+ jni_cache, std::move(iterator_batcher), batch_size));
}
ReverseJniBreakIterator::ReverseJniBreakIterator(
const JniCache* jni_cache,
- libtextclassifier3::ScopedGlobalRef<jobject> iterator_batcher)
+ libtextclassifier3::ScopedGlobalRef<jobject> iterator_batcher,
+ int batch_size)
: jni_cache_(jni_cache),
iterator_batcher_(std::move(iterator_batcher)),
+ batch_size_(batch_size),
is_done_(false),
is_almost_done_(false) {}
@@ -113,7 +110,7 @@ int ReverseJniBreakIterator::Next() {
is_done_ = true;
return ReverseJniBreakIterator::kDone;
}
- is_almost_done_ = break_indices_cache_.size() < kBatchSize;
+ is_almost_done_ = break_indices_cache_.size() < batch_size_;
}
int break_index = break_indices_cache_.front();
break_indices_cache_.pop();
@@ -156,7 +153,7 @@ int ReverseJniBreakIterator::FetchNextBatch() {
libtextclassifier3::ScopedLocalRef<jintArray> break_indices,
libtextclassifier3::JniHelper::CallObjectMethod<jintArray>(
jni_cache_->GetEnv(), iterator_batcher_.get(),
- jni_cache_->breakiterator_next, kBatchSize),
+ jni_cache_->breakiterator_next, batch_size_),
ReverseJniBreakIterator::kDone);
if (break_indices == nullptr || jni_cache_->ExceptionCheckAndClear()) {
return ReverseJniBreakIterator::kDone;
diff --git a/icing/tokenization/reverse_jni/reverse-jni-break-iterator.h b/icing/tokenization/reverse_jni/reverse-jni-break-iterator.h
index 537666c..b1dcc87 100644
--- a/icing/tokenization/reverse_jni/reverse-jni-break-iterator.h
+++ b/icing/tokenization/reverse_jni/reverse-jni-break-iterator.h
@@ -44,6 +44,9 @@ namespace lib {
// EXPECT_THAT(nexts, ElementsAre(1, 3, 5, 6, 8));
class ReverseJniBreakIterator {
public:
+ // Chosen based on results in go/reverse-jni-benchmarks
+ static constexpr int kBatchSize = 100;
+
static constexpr int kDone = -1;
// Creates a ReverseJniBreakiterator with the given text and locale.
@@ -54,7 +57,7 @@ class ReverseJniBreakIterator {
// INTERNAL if unable to create any of the required Java objects
static libtextclassifier3::StatusOr<std::unique_ptr<ReverseJniBreakIterator>>
Create(const JniCache* jni_cache, std::string_view text,
- std::string_view locale);
+ std::string_view locale, int batch_size);
// Returns the UTF-16 boundary following the current boundary. If the current
// boundary is the last text boundary, it returns
@@ -88,9 +91,10 @@ class ReverseJniBreakIterator {
private:
ReverseJniBreakIterator(
const JniCache* jni_cache,
- libtextclassifier3::ScopedGlobalRef<jobject> iterator_batcher);
+ libtextclassifier3::ScopedGlobalRef<jobject> iterator_batcher,
+ int batch_size);
- // Fetches the results of up to kBatchSize next calls and stores them in
+ // Fetches the results of up to batch_size next calls and stores them in
// break_indices_cache_. Returns the number of results or kDone if no more
// results could be fetched.
int FetchNextBatch();
@@ -109,9 +113,11 @@ class ReverseJniBreakIterator {
// BreakIteratorBatcher#next.
std::queue<int> break_indices_cache_;
+ int batch_size_;
+
bool is_done_;
- // The last batch was incomplete (< kBatchSize results were returned). The
+ // The last batch was incomplete (< batch_size_ results were returned). The
// next call to BreakIteratorBatcher#next is guaranteed to return an
// empty array. Once the results from the last batch are evicted from
// break_indices_cache, ReverseJniBreakIterator will transition to is_done_.
diff --git a/icing/tokenization/reverse_jni/reverse-jni-language-segmenter.cc b/icing/tokenization/reverse_jni/reverse-jni-language-segmenter.cc
index bd80718..e6bcf4b 100644
--- a/icing/tokenization/reverse_jni/reverse-jni-language-segmenter.cc
+++ b/icing/tokenization/reverse_jni/reverse-jni-language-segmenter.cc
@@ -293,18 +293,28 @@ class ReverseJniLanguageSegmenterIterator : public LanguageSegmenter::Iterator {
};
libtextclassifier3::StatusOr<std::unique_ptr<LanguageSegmenter::Iterator>>
-ReverseJniLanguageSegmenter::Segment(const std::string_view text) const {
+ReverseJniLanguageSegmenter::Segment(
+ const std::string_view text,
+ LanguageSegmenter::AccessType access_type) const {
+ // Only batch if we're only doing forward iteration. Bidirectional iteration
+ // will result in us frequently discarding unconsumed batched word breaks.
+ // Therefore, we won't bother batching them.
+ int batch_size =
+ (access_type == LanguageSegmenter::AccessType::kForwardIterator)
+ ? ReverseJniBreakIterator::kBatchSize
+ : 1;
ICING_ASSIGN_OR_RETURN(
std::unique_ptr<ReverseJniBreakIterator> break_iterator,
- ReverseJniBreakIterator::Create(jni_cache_, text, locale_));
+ ReverseJniBreakIterator::Create(jni_cache_, text, locale_, batch_size));
return std::make_unique<ReverseJniLanguageSegmenterIterator>(
text, std::move(break_iterator));
}
libtextclassifier3::StatusOr<std::vector<std::string_view>>
ReverseJniLanguageSegmenter::GetAllTerms(const std::string_view text) const {
- ICING_ASSIGN_OR_RETURN(std::unique_ptr<LanguageSegmenter::Iterator> iterator,
- Segment(text));
+ ICING_ASSIGN_OR_RETURN(
+ std::unique_ptr<LanguageSegmenter::Iterator> iterator,
+ Segment(text, LanguageSegmenter::AccessType::kForwardIterator));
std::vector<std::string_view> terms;
while (iterator->Advance()) {
terms.push_back(iterator->GetTerm());
diff --git a/icing/tokenization/reverse_jni/reverse-jni-language-segmenter.h b/icing/tokenization/reverse_jni/reverse-jni-language-segmenter.h
index 29df4ee..e9f84ad 100644
--- a/icing/tokenization/reverse_jni/reverse-jni-language-segmenter.h
+++ b/icing/tokenization/reverse_jni/reverse-jni-language-segmenter.h
@@ -34,7 +34,8 @@ class ReverseJniLanguageSegmenter : public LanguageSegmenter {
: locale_(std::move(locale)), jni_cache_(jni_cache) {}
libtextclassifier3::StatusOr<std::unique_ptr<LanguageSegmenter::Iterator>>
- Segment(std::string_view text) const override;
+ Segment(std::string_view text,
+ LanguageSegmenter::AccessType access_type) const override;
libtextclassifier3::StatusOr<std::vector<std::string_view>> GetAllTerms(
std::string_view text) const override;
diff --git a/icing/tokenization/reverse_jni/reverse-jni-language-segmenter_test.cc b/icing/tokenization/reverse_jni/reverse-jni-language-segmenter_test.cc
index 47a01fe..be652ff 100644
--- a/icing/tokenization/reverse_jni/reverse-jni-language-segmenter_test.cc
+++ b/icing/tokenization/reverse_jni/reverse-jni-language-segmenter_test.cc
@@ -394,8 +394,10 @@ TEST_P(ReverseJniLanguageSegmenterTest, ContinuousWhitespaces) {
// iterator is done.
text_with_spaces = absl_ports::StrCat(std::string(kNumSeparators, ' '),
"Hello", " ", "World");
- ICING_ASSERT_OK_AND_ASSIGN(auto itr,
- language_segmenter->Segment(text_with_spaces));
+ ICING_ASSERT_OK_AND_ASSIGN(
+ auto itr,
+ language_segmenter->Segment(
+ text_with_spaces, LanguageSegmenter::AccessType::kForwardIterator));
std::vector<std::string_view> terms;
while (itr->Advance()) {
terms.push_back(itr->GetTerm());
@@ -491,8 +493,10 @@ TEST_P(ReverseJniLanguageSegmenterTest, ResetToStartUtf32WordConnector) {
auto segmenter, language_segmenter_factory::Create(
GetSegmenterOptions(GetLocale(), jni_cache_.get())));
constexpr std::string_view kText = "com:google:android is package";
- ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<LanguageSegmenter::Iterator> itr,
- segmenter->Segment(kText));
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<LanguageSegmenter::Iterator> itr,
+ segmenter->Segment(
+ kText, LanguageSegmenter::AccessType::kBidirectionalIterator));
// String: "com:google:android is package"
// ^ ^^ ^^
@@ -508,8 +512,10 @@ TEST_P(ReverseJniLanguageSegmenterTest, NewIteratorResetToStartUtf32) {
auto segmenter, language_segmenter_factory::Create(
GetSegmenterOptions(GetLocale(), jni_cache_.get())));
constexpr std::string_view kText = "How are you你好吗お元気ですか";
- ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<LanguageSegmenter::Iterator> itr,
- segmenter->Segment(kText));
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<LanguageSegmenter::Iterator> itr,
+ segmenter->Segment(
+ kText, LanguageSegmenter::AccessType::kBidirectionalIterator));
// String: "How are you你好吗お元気ですか"
// ^ ^^ ^^ ^ ^ ^ ^ ^ ^
@@ -524,8 +530,10 @@ TEST_P(ReverseJniLanguageSegmenterTest, IteratorOneAdvanceResetToStartUtf32) {
auto segmenter, language_segmenter_factory::Create(
GetSegmenterOptions(GetLocale(), jni_cache_.get())));
constexpr std::string_view kText = "How are you你好吗お元気ですか";
- ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<LanguageSegmenter::Iterator> itr,
- segmenter->Segment(kText));
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<LanguageSegmenter::Iterator> itr,
+ segmenter->Segment(
+ kText, LanguageSegmenter::AccessType::kBidirectionalIterator));
// String: "How are you你好吗お元気ですか"
// ^ ^^ ^^ ^ ^ ^ ^ ^ ^
@@ -542,8 +550,10 @@ TEST_P(ReverseJniLanguageSegmenterTest,
auto segmenter, language_segmenter_factory::Create(
GetSegmenterOptions(GetLocale(), jni_cache_.get())));
constexpr std::string_view kText = "How are you你好吗お元気ですか";
- ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<LanguageSegmenter::Iterator> itr,
- segmenter->Segment(kText));
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<LanguageSegmenter::Iterator> itr,
+ segmenter->Segment(
+ kText, LanguageSegmenter::AccessType::kBidirectionalIterator));
// String: "How are you你好吗お元気ですか"
// ^ ^^ ^^ ^ ^ ^ ^ ^ ^
@@ -562,8 +572,10 @@ TEST_P(ReverseJniLanguageSegmenterTest, IteratorDoneResetToStartUtf32) {
auto segmenter, language_segmenter_factory::Create(
GetSegmenterOptions(GetLocale(), jni_cache_.get())));
constexpr std::string_view kText = "How are you你好吗お元気ですか";
- ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<LanguageSegmenter::Iterator> itr,
- segmenter->Segment(kText));
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<LanguageSegmenter::Iterator> itr,
+ segmenter->Segment(
+ kText, LanguageSegmenter::AccessType::kBidirectionalIterator));
// String: "How are you你好吗お元気ですか"
// ^ ^^ ^^ ^ ^ ^ ^ ^ ^
@@ -581,8 +593,10 @@ TEST_P(ReverseJniLanguageSegmenterTest, ResetToTermAfterUtf32WordConnector) {
auto segmenter, language_segmenter_factory::Create(
GetSegmenterOptions(GetLocale(), jni_cache_.get())));
constexpr std::string_view kText = "package com:google:android name";
- ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<LanguageSegmenter::Iterator> itr,
- segmenter->Segment(kText));
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<LanguageSegmenter::Iterator> itr,
+ segmenter->Segment(
+ kText, LanguageSegmenter::AccessType::kBidirectionalIterator));
// String: "package com:google:android name"
// ^ ^^ ^^
@@ -604,8 +618,10 @@ TEST_P(ReverseJniLanguageSegmenterTest, ResetToTermAfterUtf32OutOfBounds) {
auto segmenter, language_segmenter_factory::Create(
GetSegmenterOptions(GetLocale(), jni_cache_.get())));
constexpr std::string_view kText = "How are you你好吗お元気ですか";
- ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<LanguageSegmenter::Iterator> itr,
- segmenter->Segment(kText));
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<LanguageSegmenter::Iterator> itr,
+ segmenter->Segment(
+ kText, LanguageSegmenter::AccessType::kBidirectionalIterator));
// String: "How are you你好吗お元気ですか"
// ^ ^^ ^^ ^ ^ ^ ^ ^ ^
@@ -635,13 +651,15 @@ TEST_P(ReverseJniLanguageSegmenterTest,
constexpr std::string_view kText = "How are𡔖 you你好吗お元気ですか";
ICING_ASSERT_OK_AND_ASSIGN(
std::unique_ptr<LanguageSegmenter::Iterator> advance_itr,
- segmenter->Segment(kText));
+ segmenter->Segment(kText,
+ LanguageSegmenter::AccessType::kForwardIterator));
std::vector<std::string_view> advance_terms =
GetAllTermsAdvance(advance_itr.get());
ICING_ASSERT_OK_AND_ASSIGN(
std::unique_ptr<LanguageSegmenter::Iterator> reset_to_term_itr,
- segmenter->Segment(kText));
+ segmenter->Segment(
+ kText, LanguageSegmenter::AccessType::kBidirectionalIterator));
std::vector<std::string_view> reset_terms =
GetAllTermsResetAfterUtf32(reset_to_term_itr.get());
@@ -657,13 +675,15 @@ TEST_P(ReverseJniLanguageSegmenterTest,
constexpr std::string_view kThai = "ฉันเดินไปทำงานทุกวัน";
ICING_ASSERT_OK_AND_ASSIGN(
std::unique_ptr<LanguageSegmenter::Iterator> advance_itr,
- segmenter->Segment(kThai));
+ segmenter->Segment(kThai,
+ LanguageSegmenter::AccessType::kForwardIterator));
std::vector<std::string_view> advance_terms =
GetAllTermsAdvance(advance_itr.get());
ICING_ASSERT_OK_AND_ASSIGN(
std::unique_ptr<LanguageSegmenter::Iterator> reset_to_term_itr,
- segmenter->Segment(kThai));
+ segmenter->Segment(
+ kThai, LanguageSegmenter::AccessType::kBidirectionalIterator));
std::vector<std::string_view> reset_terms =
GetAllTermsResetAfterUtf32(reset_to_term_itr.get());
@@ -679,13 +699,15 @@ TEST_P(ReverseJniLanguageSegmenterTest,
constexpr std::string_view kKorean = "나는 매일 출근합니다.";
ICING_ASSERT_OK_AND_ASSIGN(
std::unique_ptr<LanguageSegmenter::Iterator> advance_itr,
- segmenter->Segment(kKorean));
+ segmenter->Segment(kKorean,
+ LanguageSegmenter::AccessType::kForwardIterator));
std::vector<std::string_view> advance_terms =
GetAllTermsAdvance(advance_itr.get());
ICING_ASSERT_OK_AND_ASSIGN(
std::unique_ptr<LanguageSegmenter::Iterator> reset_to_term_itr,
- segmenter->Segment(kKorean));
+ segmenter->Segment(
+ kKorean, LanguageSegmenter::AccessType::kBidirectionalIterator));
std::vector<std::string_view> reset_terms =
GetAllTermsResetAfterUtf32(reset_to_term_itr.get());
@@ -705,13 +727,15 @@ TEST_P(ReverseJniLanguageSegmenterTest,
constexpr std::string_view kText = "How are𡔖 you你好吗お元気ですか";
ICING_ASSERT_OK_AND_ASSIGN(
std::unique_ptr<LanguageSegmenter::Iterator> advance_itr,
- segmenter->Segment(kText));
+ segmenter->Segment(kText,
+ LanguageSegmenter::AccessType::kForwardIterator));
std::vector<std::string_view> advance_terms =
GetAllTermsAdvance(advance_itr.get());
ICING_ASSERT_OK_AND_ASSIGN(
std::unique_ptr<LanguageSegmenter::Iterator> advance_and_reset_itr,
- segmenter->Segment(kText));
+ segmenter->Segment(
+ kText, LanguageSegmenter::AccessType::kBidirectionalIterator));
std::vector<std::string_view> advance_and_reset_terms =
GetAllTermsAdvanceAndResetAfterUtf32(advance_and_reset_itr.get());
@@ -728,13 +752,15 @@ TEST_P(ReverseJniLanguageSegmenterTest,
constexpr std::string_view kThai = "ฉันเดินไปทำงานทุกวัน";
ICING_ASSERT_OK_AND_ASSIGN(
std::unique_ptr<LanguageSegmenter::Iterator> advance_itr,
- segmenter->Segment(kThai));
+ segmenter->Segment(kThai,
+ LanguageSegmenter::AccessType::kForwardIterator));
std::vector<std::string_view> advance_terms =
GetAllTermsAdvance(advance_itr.get());
ICING_ASSERT_OK_AND_ASSIGN(
std::unique_ptr<LanguageSegmenter::Iterator> advance_and_reset_itr,
- segmenter->Segment(kThai));
+ segmenter->Segment(
+ kThai, LanguageSegmenter::AccessType::kBidirectionalIterator));
std::vector<std::string_view> advance_and_reset_terms =
GetAllTermsAdvanceAndResetAfterUtf32(advance_and_reset_itr.get());
@@ -751,13 +777,15 @@ TEST_P(ReverseJniLanguageSegmenterTest,
constexpr std::string_view kKorean = "나는 매일 출근합니다.";
ICING_ASSERT_OK_AND_ASSIGN(
std::unique_ptr<LanguageSegmenter::Iterator> advance_itr,
- segmenter->Segment(kKorean));
+ segmenter->Segment(kKorean,
+ LanguageSegmenter::AccessType::kForwardIterator));
std::vector<std::string_view> advance_terms =
GetAllTermsAdvance(advance_itr.get());
ICING_ASSERT_OK_AND_ASSIGN(
std::unique_ptr<LanguageSegmenter::Iterator> advance_and_reset_itr,
- segmenter->Segment(kKorean));
+ segmenter->Segment(
+ kKorean, LanguageSegmenter::AccessType::kBidirectionalIterator));
std::vector<std::string_view> advance_and_reset_terms =
GetAllTermsAdvanceAndResetAfterUtf32(advance_and_reset_itr.get());
@@ -773,7 +801,9 @@ TEST_P(ReverseJniLanguageSegmenterTest, MixedLanguagesResetToTermAfterUtf32) {
GetSegmenterOptions(GetLocale(), jni_cache_.get())));
ICING_ASSERT_OK_AND_ASSIGN(
std::unique_ptr<LanguageSegmenter::Iterator> itr,
- language_segmenter->Segment("How are you你好吗お元気ですか"));
+ language_segmenter->Segment(
+ "How are you你好吗お元気ですか",
+ LanguageSegmenter::AccessType::kBidirectionalIterator));
// String: "How are you你好吗お元気ですか"
// ^ ^^ ^^ ^ ^ ^ ^ ^ ^
@@ -810,8 +840,11 @@ TEST_P(ReverseJniLanguageSegmenterTest,
GetSegmenterOptions(GetLocale(), jni_cache_.get())));
// Multiple continuous whitespaces are treated as one.
constexpr std::string_view kTextWithSpace = "Hello World";
- ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<LanguageSegmenter::Iterator> itr,
- language_segmenter->Segment(kTextWithSpace));
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<LanguageSegmenter::Iterator> itr,
+ language_segmenter->Segment(
+ kTextWithSpace,
+ LanguageSegmenter::AccessType::kBidirectionalIterator));
// String: "Hello World"
// ^ ^ ^
@@ -850,8 +883,10 @@ TEST_P(ReverseJniLanguageSegmenterTest, ChineseResetToTermAfterUtf32) {
// CJKT (Chinese, Japanese, Khmer, Thai) are the 4 main languages that
// don't have whitespaces as word delimiter. Chinese
constexpr std::string_view kChinese = "我每天走路去上班。";
- ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<LanguageSegmenter::Iterator> itr,
- language_segmenter->Segment(kChinese));
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<LanguageSegmenter::Iterator> itr,
+ language_segmenter->Segment(
+ kChinese, LanguageSegmenter::AccessType::kBidirectionalIterator));
// String: "我每天走路去上班。"
// ^ ^ ^ ^^ ^
// UTF-8 idx: 0 3 9 15 18 24
@@ -877,8 +912,10 @@ TEST_P(ReverseJniLanguageSegmenterTest, JapaneseResetToTermAfterUtf32) {
GetSegmenterOptions(GetLocale(), jni_cache_.get())));
// Japanese
constexpr std::string_view kJapanese = "私は毎日仕事に歩いています。";
- ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<LanguageSegmenter::Iterator> itr,
- language_segmenter->Segment(kJapanese));
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<LanguageSegmenter::Iterator> itr,
+ language_segmenter->Segment(
+ kJapanese, LanguageSegmenter::AccessType::kBidirectionalIterator));
// String: "私は毎日仕事に歩いています。"
// ^ ^ ^ ^ ^ ^ ^ ^ ^ ^
// UTF-8 idx: 0 3 6 12 18212427 33 39
@@ -903,8 +940,10 @@ TEST_P(ReverseJniLanguageSegmenterTest, KhmerResetToTermAfterUtf32) {
language_segmenter_factory::Create(
GetSegmenterOptions(GetLocale(), jni_cache_.get())));
constexpr std::string_view kKhmer = "ញុំដើរទៅធ្វើការរាល់ថ្ងៃ។";
- ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<LanguageSegmenter::Iterator> itr,
- language_segmenter->Segment(kKhmer));
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<LanguageSegmenter::Iterator> itr,
+ language_segmenter->Segment(
+ kKhmer, LanguageSegmenter::AccessType::kBidirectionalIterator));
// String: "ញុំដើរទៅធ្វើការរាល់ថ្ងៃ។"
// ^ ^ ^ ^ ^
// UTF-8 idx: 0 9 24 45 69
@@ -930,8 +969,10 @@ TEST_P(ReverseJniLanguageSegmenterTest, ThaiResetToTermAfterUtf32) {
GetSegmenterOptions(GetLocale(), jni_cache_.get())));
// Thai
constexpr std::string_view kThai = "ฉันเดินไปทำงานทุกวัน";
- ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<LanguageSegmenter::Iterator> itr,
- language_segmenter->Segment(kThai));
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<LanguageSegmenter::Iterator> itr,
+ language_segmenter->Segment(
+ kThai, LanguageSegmenter::AccessType::kBidirectionalIterator));
// String: "ฉันเดินไปทำงานทุกวัน"
// ^ ^ ^ ^ ^ ^
// UTF-8 idx: 0 9 21 27 42 51
@@ -955,8 +996,10 @@ TEST_P(ReverseJniLanguageSegmenterTest, ResetToTermBeforeWordConnectorUtf32) {
auto segmenter, language_segmenter_factory::Create(
GetSegmenterOptions(GetLocale(), jni_cache_.get())));
constexpr std::string_view kText = "package name com:google:android!";
- ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<LanguageSegmenter::Iterator> itr,
- segmenter->Segment(kText));
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<LanguageSegmenter::Iterator> itr,
+ segmenter->Segment(
+ kText, LanguageSegmenter::AccessType::kBidirectionalIterator));
// String: "package name com:google:android!"
// ^ ^^ ^^ ^
@@ -978,8 +1021,10 @@ TEST_P(ReverseJniLanguageSegmenterTest, ResetToTermBeforeOutOfBoundsUtf32) {
auto segmenter, language_segmenter_factory::Create(
GetSegmenterOptions(GetLocale(), jni_cache_.get())));
constexpr std::string_view kText = "How are you你好吗お元気ですか";
- ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<LanguageSegmenter::Iterator> itr,
- segmenter->Segment(kText));
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<LanguageSegmenter::Iterator> itr,
+ segmenter->Segment(
+ kText, LanguageSegmenter::AccessType::kBidirectionalIterator));
// String: "How are you你好吗お元気ですか"
// ^ ^^ ^^ ^ ^ ^ ^ ^ ^
@@ -1009,13 +1054,15 @@ TEST_P(ReverseJniLanguageSegmenterTest,
constexpr std::string_view kText = "How are𡔖 you你好吗お元気ですか";
ICING_ASSERT_OK_AND_ASSIGN(
std::unique_ptr<LanguageSegmenter::Iterator> advance_itr,
- segmenter->Segment(kText));
+ segmenter->Segment(kText,
+ LanguageSegmenter::AccessType::kForwardIterator));
std::vector<std::string_view> advance_terms =
GetAllTermsAdvance(advance_itr.get());
ICING_ASSERT_OK_AND_ASSIGN(
std::unique_ptr<LanguageSegmenter::Iterator> reset_to_term_itr,
- segmenter->Segment(kText));
+ segmenter->Segment(
+ kText, LanguageSegmenter::AccessType::kBidirectionalIterator));
std::vector<std::string_view> reset_terms =
GetAllTermsResetBeforeUtf32(reset_to_term_itr.get());
std::reverse(reset_terms.begin(), reset_terms.end());
@@ -1033,13 +1080,15 @@ TEST_P(ReverseJniLanguageSegmenterTest,
constexpr std::string_view kThai = "ฉันเดินไปทำงานทุกวัน";
ICING_ASSERT_OK_AND_ASSIGN(
std::unique_ptr<LanguageSegmenter::Iterator> advance_itr,
- segmenter->Segment(kThai));
+ segmenter->Segment(kThai,
+ LanguageSegmenter::AccessType::kForwardIterator));
std::vector<std::string_view> advance_terms =
GetAllTermsAdvance(advance_itr.get());
ICING_ASSERT_OK_AND_ASSIGN(
std::unique_ptr<LanguageSegmenter::Iterator> reset_to_term_itr,
- segmenter->Segment(kThai));
+ segmenter->Segment(
+ kThai, LanguageSegmenter::AccessType::kBidirectionalIterator));
std::vector<std::string_view> reset_terms =
GetAllTermsResetBeforeUtf32(reset_to_term_itr.get());
std::reverse(reset_terms.begin(), reset_terms.end());
@@ -1056,13 +1105,15 @@ TEST_P(ReverseJniLanguageSegmenterTest,
constexpr std::string_view kKorean = "나는 매일 출근합니다.";
ICING_ASSERT_OK_AND_ASSIGN(
std::unique_ptr<LanguageSegmenter::Iterator> advance_itr,
- segmenter->Segment(kKorean));
+ segmenter->Segment(kKorean,
+ LanguageSegmenter::AccessType::kForwardIterator));
std::vector<std::string_view> advance_terms =
GetAllTermsAdvance(advance_itr.get());
ICING_ASSERT_OK_AND_ASSIGN(
std::unique_ptr<LanguageSegmenter::Iterator> reset_to_term_itr,
- segmenter->Segment(kKorean));
+ segmenter->Segment(
+ kKorean, LanguageSegmenter::AccessType::kBidirectionalIterator));
std::vector<std::string_view> reset_terms =
GetAllTermsResetBeforeUtf32(reset_to_term_itr.get());
std::reverse(reset_terms.begin(), reset_terms.end());
@@ -1078,7 +1129,9 @@ TEST_P(ReverseJniLanguageSegmenterTest, MixedLanguagesResetToTermBeforeUtf32) {
GetSegmenterOptions(GetLocale(), jni_cache_.get())));
ICING_ASSERT_OK_AND_ASSIGN(
std::unique_ptr<LanguageSegmenter::Iterator> itr,
- language_segmenter->Segment("How are you你好吗お元気ですか"));
+ language_segmenter->Segment(
+ "How are you你好吗お元気ですか",
+ LanguageSegmenter::AccessType::kBidirectionalIterator));
// String: "How are you你好吗お元気ですか"
// ^ ^^ ^^ ^ ^ ^ ^ ^ ^
@@ -1116,8 +1169,11 @@ TEST_P(ReverseJniLanguageSegmenterTest,
GetSegmenterOptions(GetLocale(), jni_cache_.get())));
// Multiple continuous whitespaces are treated as one.
constexpr std::string_view kTextWithSpace = "Hello World";
- ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<LanguageSegmenter::Iterator> itr,
- language_segmenter->Segment(kTextWithSpace));
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<LanguageSegmenter::Iterator> itr,
+ language_segmenter->Segment(
+ kTextWithSpace,
+ LanguageSegmenter::AccessType::kBidirectionalIterator));
// String: "Hello World"
// ^ ^ ^
@@ -1155,8 +1211,10 @@ TEST_P(ReverseJniLanguageSegmenterTest, ChineseResetToTermBeforeUtf32) {
// CJKT (Chinese, Japanese, Khmer, Thai) are the 4 main languages that
// don't have whitespaces as word delimiter. Chinese
constexpr std::string_view kChinese = "我每天走路去上班。";
- ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<LanguageSegmenter::Iterator> itr,
- language_segmenter->Segment(kChinese));
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<LanguageSegmenter::Iterator> itr,
+ language_segmenter->Segment(
+ kChinese, LanguageSegmenter::AccessType::kBidirectionalIterator));
// String: "我每天走路去上班。"
// ^ ^ ^ ^^
// UTF-8 idx: 0 3 9 15 18
@@ -1179,8 +1237,10 @@ TEST_P(ReverseJniLanguageSegmenterTest, JapaneseResetToTermBeforeUtf32) {
GetSegmenterOptions(GetLocale(), jni_cache_.get())));
// Japanese
constexpr std::string_view kJapanese = "私は毎日仕事に歩いています。";
- ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<LanguageSegmenter::Iterator> itr,
- language_segmenter->Segment(kJapanese));
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<LanguageSegmenter::Iterator> itr,
+ language_segmenter->Segment(
+ kJapanese, LanguageSegmenter::AccessType::kBidirectionalIterator));
// String: "私は毎日仕事に歩いています。"
// ^ ^ ^ ^ ^ ^ ^ ^ ^
// UTF-8 idx: 0 3 6 12 18212427 33
@@ -1202,8 +1262,10 @@ TEST_P(ReverseJniLanguageSegmenterTest, KhmerResetToTermBeforeUtf32) {
language_segmenter_factory::Create(
GetSegmenterOptions(GetLocale(), jni_cache_.get())));
constexpr std::string_view kKhmer = "ញុំដើរទៅធ្វើការរាល់ថ្ងៃ។";
- ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<LanguageSegmenter::Iterator> itr,
- language_segmenter->Segment(kKhmer));
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<LanguageSegmenter::Iterator> itr,
+ language_segmenter->Segment(
+ kKhmer, LanguageSegmenter::AccessType::kBidirectionalIterator));
// String: "ញុំដើរទៅធ្វើការរាល់ថ្ងៃ។"
// ^ ^ ^ ^
// UTF-8 idx: 0 9 24 45
@@ -1226,8 +1288,10 @@ TEST_P(ReverseJniLanguageSegmenterTest, ThaiResetToTermBeforeUtf32) {
GetSegmenterOptions(GetLocale(), jni_cache_.get())));
// Thai
constexpr std::string_view kThai = "ฉันเดินไปทำงานทุกวัน";
- ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<LanguageSegmenter::Iterator> itr,
- language_segmenter->Segment(kThai));
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<LanguageSegmenter::Iterator> itr,
+ language_segmenter->Segment(
+ kThai, LanguageSegmenter::AccessType::kBidirectionalIterator));
// String: "ฉันเดินไปทำงานทุกวัน"
// ^ ^ ^ ^ ^ ^
// UTF-8 idx: 0 9 21 27 42 51
diff --git a/icing/tokenization/rfc822-tokenizer.cc b/icing/tokenization/rfc822-tokenizer.cc
index 13c58c5..35b82ca 100644
--- a/icing/tokenization/rfc822-tokenizer.cc
+++ b/icing/tokenization/rfc822-tokenizer.cc
@@ -778,14 +778,15 @@ class Rfc822TokenIterator : public Tokenizer::Iterator {
};
libtextclassifier3::StatusOr<std::unique_ptr<Tokenizer::Iterator>>
-Rfc822Tokenizer::Tokenize(std::string_view text) const {
+Rfc822Tokenizer::Tokenize(std::string_view text,
+ LanguageSegmenter::AccessType) const {
return std::make_unique<Rfc822TokenIterator>(text);
}
libtextclassifier3::StatusOr<std::vector<Token>> Rfc822Tokenizer::TokenizeAll(
std::string_view text) const {
- ICING_ASSIGN_OR_RETURN(std::unique_ptr<Tokenizer::Iterator> iterator,
- Tokenize(text));
+ std::unique_ptr<Tokenizer::Iterator> iterator =
+ std::make_unique<Rfc822TokenIterator>(text);
std::vector<Token> tokens;
while (iterator->Advance()) {
std::vector<Token> batch_tokens = iterator->GetTokens();
diff --git a/icing/tokenization/rfc822-tokenizer.h b/icing/tokenization/rfc822-tokenizer.h
index 09e4624..094f1cf 100644
--- a/icing/tokenization/rfc822-tokenizer.h
+++ b/icing/tokenization/rfc822-tokenizer.h
@@ -17,6 +17,7 @@
#include <vector>
+#include "icing/tokenization/language-segmenter.h"
#include "icing/tokenization/tokenizer.h"
namespace icing {
@@ -25,7 +26,7 @@ namespace lib {
class Rfc822Tokenizer : public Tokenizer {
public:
libtextclassifier3::StatusOr<std::unique_ptr<Tokenizer::Iterator>> Tokenize(
- std::string_view text) const override;
+ std::string_view text, LanguageSegmenter::AccessType) const override;
libtextclassifier3::StatusOr<std::vector<Token>> TokenizeAll(
std::string_view text) const override;
diff --git a/icing/tokenization/rfc822-tokenizer_test.cc b/icing/tokenization/rfc822-tokenizer_test.cc
index f114943..6b95a07 100644
--- a/icing/tokenization/rfc822-tokenizer_test.cc
+++ b/icing/tokenization/rfc822-tokenizer_test.cc
@@ -23,6 +23,7 @@
#include "icing/testing/common-matchers.h"
#include "icing/testing/jni-test-helpers.h"
#include "icing/tokenization/language-segmenter-factory.h"
+#include "icing/tokenization/language-segmenter.h"
#include "unicode/uloc.h"
namespace icing {
@@ -48,7 +49,10 @@ class Rfc822TokenizerTest : public testing::Test {
TEST_F(Rfc822TokenizerTest, StartingState) {
Rfc822Tokenizer rfc822_tokenizer = Rfc822Tokenizer();
std::string text = "a@g.c";
- auto token_iterator = rfc822_tokenizer.Tokenize(text).ValueOrDie();
+ auto token_iterator =
+ rfc822_tokenizer
+ .Tokenize(text, LanguageSegmenter::AccessType::kForwardIterator)
+ .ValueOrDie();
ASSERT_THAT(token_iterator->GetTokens(), IsEmpty());
ASSERT_TRUE(token_iterator->Advance());
@@ -979,7 +983,10 @@ TEST_F(Rfc822TokenizerTest, Commas) {
TEST_F(Rfc822TokenizerTest, ResetToTokenStartingAfter) {
Rfc822Tokenizer rfc822_tokenizer = Rfc822Tokenizer();
std::string text = "a@g.c,b@g.c";
- auto token_iterator = rfc822_tokenizer.Tokenize(text).ValueOrDie();
+ auto token_iterator =
+ rfc822_tokenizer
+ .Tokenize(text, LanguageSegmenter::AccessType::kBidirectionalIterator)
+ .ValueOrDie();
ASSERT_TRUE(token_iterator->Advance());
ASSERT_TRUE(token_iterator->Advance());
@@ -995,7 +1002,10 @@ TEST_F(Rfc822TokenizerTest, ResetToTokenStartingAfter) {
TEST_F(Rfc822TokenizerTest, ResetToTokenEndingBefore) {
Rfc822Tokenizer rfc822_tokenizer = Rfc822Tokenizer();
std::string text = "a@g.c,b@g.c";
- auto token_iterator = rfc822_tokenizer.Tokenize(text).ValueOrDie();
+ auto token_iterator =
+ rfc822_tokenizer
+ .Tokenize(text, LanguageSegmenter::AccessType::kBidirectionalIterator)
+ .ValueOrDie();
token_iterator->Advance();
ASSERT_TRUE(token_iterator->ResetToTokenEndingBefore(5));
diff --git a/icing/tokenization/tokenizer.h b/icing/tokenization/tokenizer.h
index fb7613f..3336266 100644
--- a/icing/tokenization/tokenizer.h
+++ b/icing/tokenization/tokenizer.h
@@ -22,6 +22,7 @@
#include "icing/text_classifier/lib3/utils/base/statusor.h"
#include "icing/absl_ports/canonical_errors.h"
+#include "icing/tokenization/language-segmenter.h"
#include "icing/tokenization/token.h"
#include "icing/util/character-iterator.h"
@@ -33,8 +34,10 @@ namespace lib {
// iterator or a list of tokens. Example usage:
//
// std::unique_ptr<Tokenizer> tokenizer = GetTokenizer();
-// ICING_ASSIGN_OR_RETURN(std::unique_ptr<Tokenizer::Iterator> iter,
-// tokenizer->Tokenize(text));
+// ICING_ASSIGN_OR_RETURN(
+// std::unique_ptr<Tokenizer::Iterator> iter,
+// tokenizer->Tokenize(text,
+// LanguageSegmenter::AccessType::kForwardIterator));
// ICING_ASSIGN_OR_RETURN(std::vector<Token> tokens,
// tokenizer->TokenizeAll(text));
class Tokenizer {
@@ -76,7 +79,10 @@ class Tokenizer {
// offset. Returns false if there are no valid tokens starting after
// offset.
// Ex.
- // auto iterator = tokenizer.Tokenize("foo bar baz").ValueOrDie();
+ // auto iterator =
+ // tokenizer.Tokenize("foo bar baz",
+ // LanguageSegmenter::AccessType::kForwardIterator)
+ // .ValueOrDie();
// iterator.ResetToTokenStartingAfter(4);
// // The first full token starting after position 4 (the 'b' in "bar") is
// // "baz".
@@ -89,8 +95,10 @@ class Tokenizer {
// offset. Returns false if there are no valid tokens ending
// before offset.
// Ex.
- // auto iterator = tokenizer.Tokenize("foo bar baz").ValueOrDie();
- // iterator.ResetToTokenEndingBefore(4);
+ // auto iterator = tokenizer.Tokenize("foo bar baz",
+ // LanguageSegmenter::AccessType::kForwardIterator)
+ // .ValueOrDie();
+ // iterator.ResetToTokenEndingBefore(4);
// // The first full token ending before position 4 (the 'b' in "bar") is
// // "foo".
// PrintToken(iterator.GetToken()); // prints "foo"
@@ -111,7 +119,8 @@ class Tokenizer {
// types.
// INTERNAL_ERROR if any other errors occur
virtual libtextclassifier3::StatusOr<std::unique_ptr<Iterator>> Tokenize(
- std::string_view text) const = 0;
+ std::string_view text,
+ LanguageSegmenter::AccessType access_type) const = 0;
// Tokenizes and returns all tokens in the input text. The input text should
// outlive the returned vector.
diff --git a/icing/tokenization/verbatim-tokenizer.cc b/icing/tokenization/verbatim-tokenizer.cc
index 9ca611d..cf6d5e3 100644
--- a/icing/tokenization/verbatim-tokenizer.cc
+++ b/icing/tokenization/verbatim-tokenizer.cc
@@ -124,14 +124,15 @@ class VerbatimTokenIterator : public Tokenizer::Iterator {
};
libtextclassifier3::StatusOr<std::unique_ptr<Tokenizer::Iterator>>
-VerbatimTokenizer::Tokenize(std::string_view text) const {
+VerbatimTokenizer::Tokenize(std::string_view text,
+ LanguageSegmenter::AccessType) const {
return std::make_unique<VerbatimTokenIterator>(text);
}
libtextclassifier3::StatusOr<std::vector<Token>> VerbatimTokenizer::TokenizeAll(
std::string_view text) const {
- ICING_ASSIGN_OR_RETURN(std::unique_ptr<Tokenizer::Iterator> iterator,
- Tokenize(text));
+ std::unique_ptr<Tokenizer::Iterator> iterator =
+ std::make_unique<VerbatimTokenIterator>(text);
std::vector<Token> tokens;
while (iterator->Advance()) {
std::vector<Token> batch = iterator->GetTokens();
diff --git a/icing/tokenization/verbatim-tokenizer.h b/icing/tokenization/verbatim-tokenizer.h
index 8404cf1..c3746af 100644
--- a/icing/tokenization/verbatim-tokenizer.h
+++ b/icing/tokenization/verbatim-tokenizer.h
@@ -20,6 +20,7 @@
#include <vector>
#include "icing/text_classifier/lib3/utils/base/statusor.h"
+#include "icing/tokenization/language-segmenter.h"
#include "icing/tokenization/tokenizer.h"
namespace icing {
@@ -29,7 +30,7 @@ namespace lib {
class VerbatimTokenizer : public Tokenizer {
public:
libtextclassifier3::StatusOr<std::unique_ptr<Tokenizer::Iterator>> Tokenize(
- std::string_view text) const override;
+ std::string_view text, LanguageSegmenter::AccessType) const override;
libtextclassifier3::StatusOr<std::vector<Token>> TokenizeAll(
std::string_view text) const override;
diff --git a/icing/tokenization/verbatim-tokenizer_test.cc b/icing/tokenization/verbatim-tokenizer_test.cc
index bae69ff..5aeb343 100644
--- a/icing/tokenization/verbatim-tokenizer_test.cc
+++ b/icing/tokenization/verbatim-tokenizer_test.cc
@@ -22,6 +22,7 @@
#include "icing/testing/jni-test-helpers.h"
#include "icing/testing/test-data.h"
#include "icing/tokenization/language-segmenter-factory.h"
+#include "icing/tokenization/language-segmenter.h"
#include "icing/tokenization/token.h"
#include "icing/tokenization/tokenizer-factory.h"
#include "icing/util/character-iterator.h"
@@ -94,7 +95,10 @@ TEST_F(VerbatimTokenizerTest, NoTokensBeforeAdvancing) {
language_segmenter_.get()));
constexpr std::string_view kText = "Hello, world!";
- auto token_iterator = verbatim_tokenizer->Tokenize(kText).ValueOrDie();
+ auto token_iterator =
+ verbatim_tokenizer
+ ->Tokenize(kText, LanguageSegmenter::AccessType::kForwardIterator)
+ .ValueOrDie();
// We should get no tokens if we get the token before advancing.
EXPECT_THAT(token_iterator->GetTokens(), IsEmpty());
@@ -107,7 +111,10 @@ TEST_F(VerbatimTokenizerTest, ResetToTokenEndingBefore) {
language_segmenter_.get()));
constexpr std::string_view kText = "Hello, world!";
- auto token_iterator = verbatim_tokenizer->Tokenize(kText).ValueOrDie();
+ auto token_iterator =
+ verbatim_tokenizer
+ ->Tokenize(kText, LanguageSegmenter::AccessType::kBidirectionalIterator)
+ .ValueOrDie();
// Reset to beginning of verbatim of token. We provide an offset of 13 as it
// is larger than the final index (12) of the verbatim token.
@@ -134,7 +141,10 @@ TEST_F(VerbatimTokenizerTest, ResetToTokenStartingAfter) {
language_segmenter_.get()));
constexpr std::string_view kText = "Hello, world!";
- auto token_iterator = verbatim_tokenizer->Tokenize(kText).ValueOrDie();
+ auto token_iterator =
+ verbatim_tokenizer
+ ->Tokenize(kText, LanguageSegmenter::AccessType::kBidirectionalIterator)
+ .ValueOrDie();
// Get token without resetting
EXPECT_TRUE(token_iterator->Advance());
@@ -159,7 +169,10 @@ TEST_F(VerbatimTokenizerTest, ResetToStart) {
language_segmenter_.get()));
constexpr std::string_view kText = "Hello, world!";
- auto token_iterator = verbatim_tokenizer->Tokenize(kText).ValueOrDie();
+ auto token_iterator =
+ verbatim_tokenizer
+ ->Tokenize(kText, LanguageSegmenter::AccessType::kBidirectionalIterator)
+ .ValueOrDie();
// Get token without resetting
EXPECT_TRUE(token_iterator->Advance());
@@ -179,7 +192,10 @@ TEST_F(VerbatimTokenizerTest, CalculateTokenStart) {
language_segmenter_.get()));
constexpr std::string_view kText = "Hello, world!";
- auto token_iterator = verbatim_tokenizer->Tokenize(kText).ValueOrDie();
+ auto token_iterator =
+ verbatim_tokenizer
+ ->Tokenize(kText, LanguageSegmenter::AccessType::kForwardIterator)
+ .ValueOrDie();
ICING_ASSERT_OK_AND_ASSIGN(CharacterIterator start_character_iterator,
token_iterator->CalculateTokenStart());
@@ -195,7 +211,10 @@ TEST_F(VerbatimTokenizerTest, CalculateTokenEnd) {
language_segmenter_.get()));
constexpr std::string_view kText = "Hello, world!";
- auto token_iterator = verbatim_tokenizer->Tokenize(kText).ValueOrDie();
+ auto token_iterator =
+ verbatim_tokenizer
+ ->Tokenize(kText, LanguageSegmenter::AccessType::kForwardIterator)
+ .ValueOrDie();
ICING_ASSERT_OK_AND_ASSIGN(CharacterIterator end_character_iterator,
token_iterator->CalculateTokenEndExclusive());
diff --git a/icing/util/tokenized-document.cc b/icing/util/tokenized-document.cc
index facb267..1c11c3c 100644
--- a/icing/util/tokenized-document.cc
+++ b/icing/util/tokenized-document.cc
@@ -44,8 +44,10 @@ libtextclassifier3::StatusOr<std::vector<TokenizedSection>> Tokenize(
section.metadata.tokenizer, language_segmenter));
std::vector<std::string_view> token_sequence;
for (std::string_view subcontent : section.content) {
- ICING_ASSIGN_OR_RETURN(std::unique_ptr<Tokenizer::Iterator> itr,
- tokenizer->Tokenize(subcontent));
+ ICING_ASSIGN_OR_RETURN(
+ std::unique_ptr<Tokenizer::Iterator> itr,
+ tokenizer->Tokenize(subcontent,
+ LanguageSegmenter::AccessType::kForwardIterator));
while (itr->Advance()) {
std::vector<Token> batch_tokens = itr->GetTokens();
for (const Token& token : batch_tokens) {
diff --git a/synced_AOSP_CL_number.txt b/synced_AOSP_CL_number.txt
index 5838a7b..232fbe0 100644
--- a/synced_AOSP_CL_number.txt
+++ b/synced_AOSP_CL_number.txt
@@ -1 +1 @@
-set(synced_AOSP_CL_number=513864120)
+set(synced_AOSP_CL_number=514555603)