diff options
51 files changed, 1918 insertions, 934 deletions
diff --git a/icing/icing-search-engine.cc b/icing/icing-search-engine.cc index 5321d42..1b193af 100644 --- a/icing/icing-search-engine.cc +++ b/icing/icing-search-engine.cc @@ -32,11 +32,14 @@ #include "icing/file/destructible-file.h" #include "icing/file/file-backed-proto.h" #include "icing/file/filesystem.h" +#include "icing/index/data-indexing-handler.h" #include "icing/index/hit/doc-hit-info.h" #include "icing/index/index-processor.h" #include "icing/index/index.h" +#include "icing/index/integer-section-indexing-handler.h" #include "icing/index/iterator/doc-hit-info-iterator.h" #include "icing/index/numeric/integer-index.h" +#include "icing/index/string-section-indexing-handler.h" #include "icing/join/join-processor.h" #include "icing/legacy/index/icing-filesystem.h" #include "icing/portable/endian.h" @@ -982,16 +985,15 @@ PutResultProto IcingSearchEngine::Put(DocumentProto&& document) { } DocumentId document_id = document_id_or.ValueOrDie(); - auto index_processor_or = IndexProcessor::Create( - normalizer_.get(), index_.get(), integer_index_.get(), clock_.get()); - if (!index_processor_or.ok()) { - TransformStatus(index_processor_or.status(), result_status); + auto data_indexing_handlers_or = CreateDataIndexingHandlers(); + if (!data_indexing_handlers_or.ok()) { + TransformStatus(data_indexing_handlers_or.status(), result_status); return result_proto; } - std::unique_ptr<IndexProcessor> index_processor = - std::move(index_processor_or).ValueOrDie(); + IndexProcessor index_processor( + std::move(data_indexing_handlers_or).ValueOrDie(), clock_.get()); - auto index_status = index_processor->IndexDocument( + auto index_status = index_processor.IndexDocument( tokenized_document, document_id, put_document_stats); // Getting an internal error from the index could possibly mean that the index // is broken. Try to rebuild the index to recover. 
@@ -2119,19 +2121,18 @@ IcingSearchEngine::RestoreIndexIfNeeded() { return {libtextclassifier3::Status::OK, false, false}; } + auto data_indexing_handlers_or = CreateDataIndexingHandlers(); + if (!data_indexing_handlers_or.ok()) { + return {data_indexing_handlers_or.status(), + truncate_result.index_needed_restoration, + truncate_result.integer_index_needed_restoration}; + } // By using recovery_mode for IndexProcessor, we're able to replay documents // from smaller document id and it will skip documents that are already been // indexed. - auto index_processor_or = IndexProcessor::Create( - normalizer_.get(), index_.get(), integer_index_.get(), clock_.get(), + IndexProcessor index_processor( + std::move(data_indexing_handlers_or).ValueOrDie(), clock_.get(), /*recovery_mode=*/true); - if (!index_processor_or.ok()) { - return {index_processor_or.status(), - truncate_result.index_needed_restoration, - truncate_result.integer_index_needed_restoration}; - } - std::unique_ptr<IndexProcessor> index_processor = - std::move(index_processor_or).ValueOrDie(); ICING_VLOG(1) << "Restoring index by replaying documents from document id " << truncate_result.first_document_to_reindex @@ -2168,7 +2169,7 @@ IcingSearchEngine::RestoreIndexIfNeeded() { std::move(tokenized_document_or).ValueOrDie()); libtextclassifier3::Status status = - index_processor->IndexDocument(tokenized_document, document_id); + index_processor.IndexDocument(tokenized_document, document_id); if (!status.ok()) { if (!absl_ports::IsDataLoss(status)) { // Real error. Stop recovering and pass it up. 
@@ -2209,6 +2210,29 @@ libtextclassifier3::StatusOr<bool> IcingSearchEngine::LostPreviousSchema() { return document_store_->last_added_document_id() != kInvalidDocumentId; } +libtextclassifier3::StatusOr<std::vector<std::unique_ptr<DataIndexingHandler>>> +IcingSearchEngine::CreateDataIndexingHandlers() { + std::vector<std::unique_ptr<DataIndexingHandler>> handlers; + + // Term index handler + ICING_ASSIGN_OR_RETURN(std::unique_ptr<StringSectionIndexingHandler> + string_section_indexing_handler, + StringSectionIndexingHandler::Create( + clock_.get(), normalizer_.get(), index_.get())); + handlers.push_back(std::move(string_section_indexing_handler)); + + // Integer index handler + ICING_ASSIGN_OR_RETURN(std::unique_ptr<IntegerSectionIndexingHandler> + integer_section_indexing_handler, + IntegerSectionIndexingHandler::Create( + clock_.get(), integer_index_.get())); + handlers.push_back(std::move(integer_section_indexing_handler)); + + // TODO(b/263890397): add QualifiedIdJoinablePropertyIndexingHandler + + return handlers; +} + libtextclassifier3::StatusOr<IcingSearchEngine::TruncateIndexResult> IcingSearchEngine::TruncateIndicesTo(DocumentId last_stored_document_id) { // Attempt to truncate term index. diff --git a/icing/icing-search-engine.h b/icing/icing-search-engine.h index da447d5..678fc77 100644 --- a/icing/icing-search-engine.h +++ b/icing/icing-search-engine.h @@ -26,6 +26,7 @@ #include "icing/absl_ports/mutex.h" #include "icing/absl_ports/thread_annotations.h" #include "icing/file/filesystem.h" +#include "icing/index/data-indexing-handler.h" #include "icing/index/index.h" #include "icing/index/numeric/numeric-index.h" #include "icing/jni/jni-cache.h" @@ -672,6 +673,12 @@ class IcingSearchEngine { libtextclassifier3::StatusOr<bool> LostPreviousSchema() ICING_EXCLUSIVE_LOCKS_REQUIRED(mutex_); + // Helper method to create all types of data indexing handlers to index term, + // integer, and joinable qualified ids. 
+ libtextclassifier3::StatusOr< + std::vector<std::unique_ptr<DataIndexingHandler>>> + CreateDataIndexingHandlers() ICING_EXCLUSIVE_LOCKS_REQUIRED(mutex_); + // Helper method to discard parts of (term, integer) indices if they contain // data for document ids greater than last_stored_document_id. // diff --git a/icing/index/section-indexing-handler.h b/icing/index/data-indexing-handler.h index 98efc8f..0061b79 100644 --- a/icing/index/section-indexing-handler.h +++ b/icing/index/data-indexing-handler.h @@ -12,8 +12,8 @@ // See the License for the specific language governing permissions and // limitations under the License. -#ifndef ICING_INDEX_SECTION_INDEXING_HANDLER_H_ -#define ICING_INDEX_SECTION_INDEXING_HANDLER_H_ +#ifndef ICING_INDEX_DATA_INDEXING_HANDLER_H_ +#define ICING_INDEX_DATA_INDEXING_HANDLER_H_ #include "icing/text_classifier/lib3/utils/base/status.h" #include "icing/proto/logging.pb.h" @@ -24,24 +24,23 @@ namespace icing { namespace lib { -// Parent class for indexing different types of sections in TokenizedDocument. -class SectionIndexingHandler { +// Parent class for indexing different types of data in TokenizedDocument. +class DataIndexingHandler { public: - explicit SectionIndexingHandler(const Clock* clock) : clock_(*clock) {} + explicit DataIndexingHandler(const Clock* clock) : clock_(*clock) {} - virtual ~SectionIndexingHandler() = default; + virtual ~DataIndexingHandler() = default; - // Handles the indexing process: add data (hits) into the specific type index - // (e.g. term index, integer index) for all contents in the corresponding type - // of sections in tokenized_document. + // Handles the indexing process: add data into the specific type index (e.g. + // term index, integer index, qualified id type joinable index) for all + // contents in the corresponding type of data in tokenized_document. 
// For example, IntegerSectionIndexingHandler::Handle should add data into // integer index for all contents in tokenized_document.integer_sections. // // Also it should handle last added DocumentId properly (based on // recovery_mode_) to avoid adding previously indexed documents. // - // tokenized_document: document object with different types of tokenized - // sections. + // tokenized_document: document object with different types of tokenized data. // document_id: id of the document. // recovery_mode: decides how to handle document_id <= // last_added_document_id. If in recovery_mode, then @@ -60,10 +59,10 @@ class SectionIndexingHandler { bool recovery_mode, PutDocumentStatsProto* put_document_stats) = 0; protected: - const Clock& clock_; + const Clock& clock_; // Does not own. }; } // namespace lib } // namespace icing -#endif // ICING_INDEX_SECTION_INDEXING_HANDLER_H_ +#endif // ICING_INDEX_DATA_INDEXING_HANDLER_H_ diff --git a/icing/index/index-processor.cc b/icing/index/index-processor.cc index 86a0826..34988f5 100644 --- a/icing/index/index-processor.cc +++ b/icing/index/index-processor.cc @@ -21,44 +21,21 @@ #include <vector> #include "icing/text_classifier/lib3/utils/base/status.h" -#include "icing/index/index.h" -#include "icing/index/integer-section-indexing-handler.h" -#include "icing/index/numeric/numeric-index.h" -#include "icing/index/string-section-indexing-handler.h" +#include "icing/index/data-indexing-handler.h" #include "icing/proto/logging.pb.h" #include "icing/store/document-id.h" -#include "icing/transform/normalizer.h" #include "icing/util/status-macros.h" #include "icing/util/tokenized-document.h" namespace icing { namespace lib { -libtextclassifier3::StatusOr<std::unique_ptr<IndexProcessor>> -IndexProcessor::Create(const Normalizer* normalizer, Index* index, - NumericIndex<int64_t>* integer_index, const Clock* clock, - bool recovery_mode) { - ICING_RETURN_ERROR_IF_NULL(normalizer); - ICING_RETURN_ERROR_IF_NULL(index); - 
ICING_RETURN_ERROR_IF_NULL(integer_index); - ICING_RETURN_ERROR_IF_NULL(clock); - - std::vector<std::unique_ptr<SectionIndexingHandler>> handlers; - handlers.push_back( - std::make_unique<StringSectionIndexingHandler>(clock, normalizer, index)); - handlers.push_back( - std::make_unique<IntegerSectionIndexingHandler>(clock, integer_index)); - - return std::unique_ptr<IndexProcessor>( - new IndexProcessor(std::move(handlers), clock, recovery_mode)); -} - libtextclassifier3::Status IndexProcessor::IndexDocument( const TokenizedDocument& tokenized_document, DocumentId document_id, PutDocumentStatsProto* put_document_stats) { // TODO(b/259744228): set overall index latency. - for (auto& section_indexing_handler : section_indexing_handlers_) { - ICING_RETURN_IF_ERROR(section_indexing_handler->Handle( + for (auto& data_indexing_handler : data_indexing_handlers_) { + ICING_RETURN_IF_ERROR(data_indexing_handler->Handle( tokenized_document, document_id, recovery_mode_, put_document_stats)); } diff --git a/icing/index/index-processor.h b/icing/index/index-processor.h index 3d6b19a..9b96f00 100644 --- a/icing/index/index-processor.h +++ b/icing/index/index-processor.h @@ -20,12 +20,9 @@ #include <vector> #include "icing/text_classifier/lib3/utils/base/status.h" -#include "icing/index/index.h" -#include "icing/index/numeric/numeric-index.h" -#include "icing/index/section-indexing-handler.h" +#include "icing/index/data-indexing-handler.h" #include "icing/proto/logging.pb.h" #include "icing/store/document-id.h" -#include "icing/transform/normalizer.h" #include "icing/util/tokenized-document.h" namespace icing { @@ -33,24 +30,12 @@ namespace lib { class IndexProcessor { public: - // Factory function to create an IndexProcessor which does not take ownership - // of any input components, and all pointers must refer to valid objects that - // outlive the created IndexProcessor instance. - // - // - recovery_mode: a flag indicates that if IndexProcessor is used to restore - // index. 
Since there are several indices (term, integer) being restored at - // the same time, we start with the minimum last added DocumentId of all - // indices and replay documents to re-index, so it is possible to get some - // previously indexed documents in the recovery mode. Therefore, we should - // skip them without returning an error in recovery mode. - // - // Returns: - // An IndexProcessor on success - // FAILED_PRECONDITION if any of the pointers is null. - static libtextclassifier3::StatusOr<std::unique_ptr<IndexProcessor>> Create( - const Normalizer* normalizer, Index* index, - NumericIndex<int64_t>* integer_index_, const Clock* clock, - bool recovery_mode = false); + explicit IndexProcessor(std::vector<std::unique_ptr<DataIndexingHandler>>&& + data_indexing_handlers, + const Clock* clock, bool recovery_mode = false) + : data_indexing_handlers_(std::move(data_indexing_handlers)), + clock_(*clock), + recovery_mode_(recovery_mode) {} // Add tokenized document to the index, associated with document_id. If the // number of tokens in the document exceeds max_tokens_per_document, then only @@ -65,22 +50,14 @@ class IndexProcessor { // // Returns: // - OK on success. - // - Any SectionIndexingHandler errors. + // - Any DataIndexingHandler errors. libtextclassifier3::Status IndexDocument( const TokenizedDocument& tokenized_document, DocumentId document_id, PutDocumentStatsProto* put_document_stats = nullptr); private: - explicit IndexProcessor(std::vector<std::unique_ptr<SectionIndexingHandler>>&& - section_indexing_handlers, - const Clock* clock, bool recovery_mode) - : section_indexing_handlers_(std::move(section_indexing_handlers)), - clock_(*clock), - recovery_mode_(recovery_mode) {} - - std::vector<std::unique_ptr<SectionIndexingHandler>> - section_indexing_handlers_; - const Clock& clock_; + std::vector<std::unique_ptr<DataIndexingHandler>> data_indexing_handlers_; + const Clock& clock_; // Does not own. 
bool recovery_mode_; }; diff --git a/icing/index/index-processor_benchmark.cc b/icing/index/index-processor_benchmark.cc index 6608e44..ee43364 100644 --- a/icing/index/index-processor_benchmark.cc +++ b/icing/index/index-processor_benchmark.cc @@ -12,14 +12,22 @@ // See the License for the specific language governing permissions and // limitations under the License. +#include <memory> +#include <utility> +#include <vector> + +#include "icing/text_classifier/lib3/utils/base/statusor.h" #include "testing/base/public/benchmark.h" #include "gmock/gmock.h" #include "icing/document-builder.h" #include "icing/file/filesystem.h" +#include "icing/index/data-indexing-handler.h" #include "icing/index/index-processor.h" #include "icing/index/index.h" -#include "icing/index/numeric/dummy-numeric-index.h" +#include "icing/index/integer-section-indexing-handler.h" +#include "icing/index/numeric/integer-index.h" #include "icing/index/numeric/numeric-index.h" +#include "icing/index/string-section-indexing-handler.h" #include "icing/legacy/core/icing-string-util.h" #include "icing/schema/schema-store.h" #include "icing/schema/schema-util.h" @@ -173,6 +181,24 @@ std::unique_ptr<SchemaStore> CreateSchemaStore(const Filesystem& filesystem, return schema_store; } +libtextclassifier3::StatusOr<std::vector<std::unique_ptr<DataIndexingHandler>>> +CreateDataIndexingHandlers(const Clock* clock, const Normalizer* normalizer, + Index* index, NumericIndex<int64_t>* integer_index) { + ICING_ASSIGN_OR_RETURN( + std::unique_ptr<StringSectionIndexingHandler> + string_section_indexing_handler, + StringSectionIndexingHandler::Create(clock, normalizer, index)); + ICING_ASSIGN_OR_RETURN( + std::unique_ptr<IntegerSectionIndexingHandler> + integer_section_indexing_handler, + IntegerSectionIndexingHandler::Create(clock, integer_index)); + + std::vector<std::unique_ptr<DataIndexingHandler>> handlers; + handlers.push_back(std::move(string_section_indexing_handler)); + 
handlers.push_back(std::move(integer_section_indexing_handler)); + return handlers; +} + void CleanUp(const Filesystem& filesystem, const std::string& base_dir) { filesystem.DeleteDirectoryRecursively(base_dir.c_str()); } @@ -198,7 +224,7 @@ void BM_IndexDocumentWithOneProperty(benchmark::State& state) { CreateIndex(icing_filesystem, filesystem, index_dir); ICING_ASSERT_OK_AND_ASSIGN( std::unique_ptr<NumericIndex<int64_t>> integer_index, - DummyNumericIndex<int64_t>::Create(filesystem, integer_index_dir)); + IntegerIndex::Create(filesystem, integer_index_dir)); language_segmenter_factory::SegmenterOptions options(ULOC_US); std::unique_ptr<LanguageSegmenter> language_segmenter = language_segmenter_factory::Create(std::move(options)).ValueOrDie(); @@ -206,10 +232,14 @@ void BM_IndexDocumentWithOneProperty(benchmark::State& state) { Clock clock; std::unique_ptr<SchemaStore> schema_store = CreateSchemaStore(filesystem, &clock, base_dir); + ICING_ASSERT_OK_AND_ASSIGN( - std::unique_ptr<IndexProcessor> index_processor, - IndexProcessor::Create(normalizer.get(), index.get(), integer_index.get(), - &clock)); + std::vector<std::unique_ptr<DataIndexingHandler>> handlers, + CreateDataIndexingHandlers(&clock, normalizer.get(), index.get(), + integer_index.get())); + auto index_processor = + std::make_unique<IndexProcessor>(std::move(handlers), &clock); + DocumentProto input_document = CreateDocumentWithOneProperty(state.range(0)); TokenizedDocument tokenized_document(std::move( TokenizedDocument::Create(schema_store.get(), language_segmenter.get(), @@ -268,7 +298,7 @@ void BM_IndexDocumentWithTenProperties(benchmark::State& state) { CreateIndex(icing_filesystem, filesystem, index_dir); ICING_ASSERT_OK_AND_ASSIGN( std::unique_ptr<NumericIndex<int64_t>> integer_index, - DummyNumericIndex<int64_t>::Create(filesystem, integer_index_dir)); + IntegerIndex::Create(filesystem, integer_index_dir)); language_segmenter_factory::SegmenterOptions options(ULOC_US); 
std::unique_ptr<LanguageSegmenter> language_segmenter = language_segmenter_factory::Create(std::move(options)).ValueOrDie(); @@ -276,10 +306,13 @@ void BM_IndexDocumentWithTenProperties(benchmark::State& state) { Clock clock; std::unique_ptr<SchemaStore> schema_store = CreateSchemaStore(filesystem, &clock, base_dir); + ICING_ASSERT_OK_AND_ASSIGN( - std::unique_ptr<IndexProcessor> index_processor, - IndexProcessor::Create(normalizer.get(), index.get(), integer_index.get(), - &clock)); + std::vector<std::unique_ptr<DataIndexingHandler>> handlers, + CreateDataIndexingHandlers(&clock, normalizer.get(), index.get(), + integer_index.get())); + auto index_processor = + std::make_unique<IndexProcessor>(std::move(handlers), &clock); DocumentProto input_document = CreateDocumentWithTenProperties(state.range(0)); @@ -340,7 +373,7 @@ void BM_IndexDocumentWithDiacriticLetters(benchmark::State& state) { CreateIndex(icing_filesystem, filesystem, index_dir); ICING_ASSERT_OK_AND_ASSIGN( std::unique_ptr<NumericIndex<int64_t>> integer_index, - DummyNumericIndex<int64_t>::Create(filesystem, integer_index_dir)); + IntegerIndex::Create(filesystem, integer_index_dir)); language_segmenter_factory::SegmenterOptions options(ULOC_US); std::unique_ptr<LanguageSegmenter> language_segmenter = language_segmenter_factory::Create(std::move(options)).ValueOrDie(); @@ -348,10 +381,13 @@ void BM_IndexDocumentWithDiacriticLetters(benchmark::State& state) { Clock clock; std::unique_ptr<SchemaStore> schema_store = CreateSchemaStore(filesystem, &clock, base_dir); + ICING_ASSERT_OK_AND_ASSIGN( - std::unique_ptr<IndexProcessor> index_processor, - IndexProcessor::Create(normalizer.get(), index.get(), integer_index.get(), - &clock)); + std::vector<std::unique_ptr<DataIndexingHandler>> handlers, + CreateDataIndexingHandlers(&clock, normalizer.get(), index.get(), + integer_index.get())); + auto index_processor = + std::make_unique<IndexProcessor>(std::move(handlers), &clock); DocumentProto input_document = 
CreateDocumentWithDiacriticLetters(state.range(0)); @@ -412,7 +448,7 @@ void BM_IndexDocumentWithHiragana(benchmark::State& state) { CreateIndex(icing_filesystem, filesystem, index_dir); ICING_ASSERT_OK_AND_ASSIGN( std::unique_ptr<NumericIndex<int64_t>> integer_index, - DummyNumericIndex<int64_t>::Create(filesystem, integer_index_dir)); + IntegerIndex::Create(filesystem, integer_index_dir)); language_segmenter_factory::SegmenterOptions options(ULOC_US); std::unique_ptr<LanguageSegmenter> language_segmenter = language_segmenter_factory::Create(std::move(options)).ValueOrDie(); @@ -420,10 +456,13 @@ void BM_IndexDocumentWithHiragana(benchmark::State& state) { Clock clock; std::unique_ptr<SchemaStore> schema_store = CreateSchemaStore(filesystem, &clock, base_dir); + ICING_ASSERT_OK_AND_ASSIGN( - std::unique_ptr<IndexProcessor> index_processor, - IndexProcessor::Create(normalizer.get(), index.get(), integer_index.get(), - &clock)); + std::vector<std::unique_ptr<DataIndexingHandler>> handlers, + CreateDataIndexingHandlers(&clock, normalizer.get(), index.get(), + integer_index.get())); + auto index_processor = + std::make_unique<IndexProcessor>(std::move(handlers), &clock); DocumentProto input_document = CreateDocumentWithHiragana(state.range(0)); TokenizedDocument tokenized_document(std::move( diff --git a/icing/index/index-processor_test.cc b/icing/index/index-processor_test.cc index c22e8f0..3a9b4ee 100644 --- a/icing/index/index-processor_test.cc +++ b/icing/index/index-processor_test.cc @@ -30,12 +30,15 @@ #include "icing/absl_ports/str_join.h" #include "icing/document-builder.h" #include "icing/file/filesystem.h" +#include "icing/index/data-indexing-handler.h" #include "icing/index/hit/doc-hit-info.h" #include "icing/index/index.h" +#include "icing/index/integer-section-indexing-handler.h" #include "icing/index/iterator/doc-hit-info-iterator-test-util.h" #include "icing/index/iterator/doc-hit-info-iterator.h" #include "icing/index/numeric/integer-index.h" #include 
"icing/index/numeric/numeric-index.h" +#include "icing/index/string-section-indexing-handler.h" #include "icing/index/term-property-id.h" #include "icing/legacy/index/icing-filesystem.h" #include "icing/legacy/index/icing-mock-filesystem.h" @@ -258,9 +261,21 @@ class IndexProcessorTest : public Test { ICING_ASSERT_OK(schema_store_->SetSchema(schema)); ICING_ASSERT_OK_AND_ASSIGN( - index_processor_, - IndexProcessor::Create(normalizer_.get(), index_.get(), - integer_index_.get(), &fake_clock_)); + std::unique_ptr<StringSectionIndexingHandler> + string_section_indexing_handler, + StringSectionIndexingHandler::Create(&fake_clock_, normalizer_.get(), + index_.get())); + ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<IntegerSectionIndexingHandler> + integer_section_indexing_handler, + IntegerSectionIndexingHandler::Create( + &fake_clock_, integer_index_.get())); + std::vector<std::unique_ptr<DataIndexingHandler>> handlers; + handlers.push_back(std::move(string_section_indexing_handler)); + handlers.push_back(std::move(integer_section_indexing_handler)); + + index_processor_ = + std::make_unique<IndexProcessor>(std::move(handlers), &fake_clock_); + mock_icing_filesystem_ = std::make_unique<IcingMockFilesystem>(); } @@ -290,6 +305,7 @@ class IndexProcessorTest : public Test { std::unique_ptr<LanguageSegmenter> lang_segmenter_; std::unique_ptr<Normalizer> normalizer_; std::unique_ptr<SchemaStore> schema_store_; + std::unique_ptr<IndexProcessor> index_processor_; }; @@ -315,16 +331,6 @@ std::vector<DocHitInfoTermFrequencyPair> GetHitsWithTermFrequency( return infos; } -TEST_F(IndexProcessorTest, CreationWithNullPointerShouldFail) { - EXPECT_THAT(IndexProcessor::Create(/*normalizer=*/nullptr, index_.get(), - integer_index_.get(), &fake_clock_), - StatusIs(libtextclassifier3::StatusCode::FAILED_PRECONDITION)); - - EXPECT_THAT(IndexProcessor::Create(normalizer_.get(), /*index=*/nullptr, - integer_index_.get(), &fake_clock_), - 
StatusIs(libtextclassifier3::StatusCode::FAILED_PRECONDITION)); -} - TEST_F(IndexProcessorTest, NoTermMatchTypeContent) { DocumentProto document = DocumentBuilder() @@ -584,10 +590,15 @@ TEST_F(IndexProcessorTest, TooLongTokens) { normalizer_factory::Create( /*max_term_byte_size=*/4)); - ICING_ASSERT_OK_AND_ASSIGN( - index_processor_, - IndexProcessor::Create(normalizer.get(), index_.get(), - integer_index_.get(), &fake_clock_)); + ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<StringSectionIndexingHandler> + string_section_indexing_handler, + StringSectionIndexingHandler::Create( + &fake_clock_, normalizer.get(), index_.get())); + std::vector<std::unique_ptr<DataIndexingHandler>> handlers; + handlers.push_back(std::move(string_section_indexing_handler)); + + index_processor_ = + std::make_unique<IndexProcessor>(std::move(handlers), &fake_clock_); DocumentProto document = DocumentBuilder() @@ -769,10 +780,20 @@ TEST_F(IndexProcessorTest, OutOfOrderDocumentIds) { TEST_F(IndexProcessorTest, OutOfOrderDocumentIdsInRecoveryMode) { ICING_ASSERT_OK_AND_ASSIGN( - std::unique_ptr<IndexProcessor> index_processor, - IndexProcessor::Create(normalizer_.get(), index_.get(), - integer_index_.get(), &fake_clock_, - /*recovery_mode=*/true)); + std::unique_ptr<StringSectionIndexingHandler> + string_section_indexing_handler, + StringSectionIndexingHandler::Create(&fake_clock_, normalizer_.get(), + index_.get())); + ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<IntegerSectionIndexingHandler> + integer_section_indexing_handler, + IntegerSectionIndexingHandler::Create( + &fake_clock_, integer_index_.get())); + std::vector<std::unique_ptr<DataIndexingHandler>> handlers; + handlers.push_back(std::move(string_section_indexing_handler)); + handlers.push_back(std::move(integer_section_indexing_handler)); + + IndexProcessor index_processor(std::move(handlers), &fake_clock_, + /*recovery_mode=*/true); DocumentProto document = DocumentBuilder() @@ -785,7 +806,7 @@ TEST_F(IndexProcessorTest, 
OutOfOrderDocumentIdsInRecoveryMode) { TokenizedDocument tokenized_document, TokenizedDocument::Create(schema_store_.get(), lang_segmenter_.get(), document)); - EXPECT_THAT(index_processor->IndexDocument(tokenized_document, kDocumentId1), + EXPECT_THAT(index_processor.IndexDocument(tokenized_document, kDocumentId1), IsOk()); EXPECT_THAT(index_->last_added_document_id(), Eq(kDocumentId1)); @@ -808,7 +829,7 @@ TEST_F(IndexProcessorTest, OutOfOrderDocumentIdsInRecoveryMode) { tokenized_document, TokenizedDocument::Create(schema_store_.get(), lang_segmenter_.get(), document)); - EXPECT_THAT(index_processor->IndexDocument(tokenized_document, kDocumentId0), + EXPECT_THAT(index_processor.IndexDocument(tokenized_document, kDocumentId0), IsOk()); // Verify that both index_ and integer_index_ are unchanged. EXPECT_THAT(index_->last_added_document_id(), Eq(kDocumentId1)); @@ -818,7 +839,7 @@ TEST_F(IndexProcessorTest, OutOfOrderDocumentIdsInRecoveryMode) { IsOkAndHolds(integer_index_crc)); // As should indexing a document document_id == last_added_document_id. - EXPECT_THAT(index_processor->IndexDocument(tokenized_document, kDocumentId1), + EXPECT_THAT(index_processor.IndexDocument(tokenized_document, kDocumentId1), IsOk()); // Verify that both index_ and integer_index_ are unchanged. 
EXPECT_THAT(index_->last_added_document_id(), Eq(kDocumentId1)); @@ -907,9 +928,16 @@ TEST_F(IndexProcessorTest, IndexingDocAutomaticMerge) { index_, Index::Create(options, &filesystem_, &icing_filesystem_)); ICING_ASSERT_OK_AND_ASSIGN( - index_processor_, - IndexProcessor::Create(normalizer_.get(), index_.get(), - integer_index_.get(), &fake_clock_)); + std::unique_ptr<StringSectionIndexingHandler> + string_section_indexing_handler, + StringSectionIndexingHandler::Create(&fake_clock_, normalizer_.get(), + index_.get())); + std::vector<std::unique_ptr<DataIndexingHandler>> handlers; + handlers.push_back(std::move(string_section_indexing_handler)); + + index_processor_ = + std::make_unique<IndexProcessor>(std::move(handlers), &fake_clock_); + DocumentId doc_id = 0; // Have determined experimentally that indexing 3373 documents with this text // will cause the LiteIndex to fill up. Further indexing will fail unless the @@ -964,9 +992,15 @@ TEST_F(IndexProcessorTest, IndexingDocMergeFailureResets) { Index::Create(options, &filesystem_, mock_icing_filesystem_.get())); ICING_ASSERT_OK_AND_ASSIGN( - index_processor_, - IndexProcessor::Create(normalizer_.get(), index_.get(), - integer_index_.get(), &fake_clock_)); + std::unique_ptr<StringSectionIndexingHandler> + string_section_indexing_handler, + StringSectionIndexingHandler::Create(&fake_clock_, normalizer_.get(), + index_.get())); + std::vector<std::unique_ptr<DataIndexingHandler>> handlers; + handlers.push_back(std::move(string_section_indexing_handler)); + + index_processor_ = + std::make_unique<IndexProcessor>(std::move(handlers), &fake_clock_); // 3. Index one document. This should fit in the LiteIndex without requiring a // merge. 
diff --git a/icing/index/integer-section-indexing-handler.cc b/icing/index/integer-section-indexing-handler.cc index 0ed01d3..d201a1a 100644 --- a/icing/index/integer-section-indexing-handler.cc +++ b/icing/index/integer-section-indexing-handler.cc @@ -14,8 +14,11 @@ #include "icing/index/integer-section-indexing-handler.h" +#include <cstdint> +#include <memory> + #include "icing/text_classifier/lib3/utils/base/status.h" -#include "icing/schema/section-manager.h" +#include "icing/text_classifier/lib3/utils/base/statusor.h" #include "icing/schema/section.h" #include "icing/store/document-id.h" #include "icing/util/logging.h" @@ -24,6 +27,17 @@ namespace icing { namespace lib { +/* static */ libtextclassifier3::StatusOr< + std::unique_ptr<IntegerSectionIndexingHandler>> +IntegerSectionIndexingHandler::Create(const Clock* clock, + NumericIndex<int64_t>* integer_index) { + ICING_RETURN_ERROR_IF_NULL(clock); + ICING_RETURN_ERROR_IF_NULL(integer_index); + + return std::unique_ptr<IntegerSectionIndexingHandler>( + new IntegerSectionIndexingHandler(clock, integer_index)); +} + libtextclassifier3::Status IntegerSectionIndexingHandler::Handle( const TokenizedDocument& tokenized_document, DocumentId document_id, bool recovery_mode, PutDocumentStatsProto* put_document_stats) { diff --git a/icing/index/integer-section-indexing-handler.h b/icing/index/integer-section-indexing-handler.h index d75815c..42ce07e 100644 --- a/icing/index/integer-section-indexing-handler.h +++ b/icing/index/integer-section-indexing-handler.h @@ -15,9 +15,13 @@ #ifndef ICING_INDEX_INTEGER_SECTION_INDEXING_HANDLER_H_ #define ICING_INDEX_INTEGER_SECTION_INDEXING_HANDLER_H_ +#include <cstdint> +#include <memory> + #include "icing/text_classifier/lib3/utils/base/status.h" +#include "icing/text_classifier/lib3/utils/base/statusor.h" +#include "icing/index/data-indexing-handler.h" #include "icing/index/numeric/numeric-index.h" -#include "icing/index/section-indexing-handler.h" #include 
"icing/store/document-id.h" #include "icing/util/clock.h" #include "icing/util/tokenized-document.h" @@ -25,11 +29,18 @@ namespace icing { namespace lib { -class IntegerSectionIndexingHandler : public SectionIndexingHandler { +class IntegerSectionIndexingHandler : public DataIndexingHandler { public: - explicit IntegerSectionIndexingHandler(const Clock* clock, - NumericIndex<int64_t>* integer_index) - : SectionIndexingHandler(clock), integer_index_(*integer_index) {} + // Creates an IntegerSectionIndexingHandler instance which does not take + // ownership of any input components. All pointers must refer to valid objects + // that outlive the created IntegerSectionIndexingHandler instance. + // + // Returns: + // - An IntegerSectionIndexingHandler instance on success + // - FAILED_PRECONDITION_ERROR if any of the input pointer is null + static libtextclassifier3::StatusOr< + std::unique_ptr<IntegerSectionIndexingHandler>> + Create(const Clock* clock, NumericIndex<int64_t>* integer_index); ~IntegerSectionIndexingHandler() override = default; @@ -46,7 +57,11 @@ class IntegerSectionIndexingHandler : public SectionIndexingHandler { bool recovery_mode, PutDocumentStatsProto* put_document_stats) override; private: - NumericIndex<int64_t>& integer_index_; + explicit IntegerSectionIndexingHandler(const Clock* clock, + NumericIndex<int64_t>* integer_index) + : DataIndexingHandler(clock), integer_index_(*integer_index) {} + + NumericIndex<int64_t>& integer_index_; // Does not own. }; } // namespace lib diff --git a/icing/index/numeric/integer-index.h b/icing/index/numeric/integer-index.h index 98c26ef..050a143 100644 --- a/icing/index/numeric/integer-index.h +++ b/icing/index/numeric/integer-index.h @@ -216,7 +216,7 @@ class IntegerIndex : public NumericIndex<int64_t> { // Returns: // - OK on success // - INTERNAL_ERROR on I/O error. This could potentially leave the storages - // in an invalid state and the caller should handle it property (e.g. 
+ // in an invalid state and the caller should handle it properly (e.g. // discard and rebuild) libtextclassifier3::Status TransferIndex( const std::vector<DocumentId>& document_id_old_to_new, diff --git a/icing/index/string-section-indexing-handler.cc b/icing/index/string-section-indexing-handler.cc index 7cd0909..83a2687 100644 --- a/icing/index/string-section-indexing-handler.cc +++ b/icing/index/string-section-indexing-handler.cc @@ -20,6 +20,7 @@ #include <string_view> #include "icing/text_classifier/lib3/utils/base/status.h" +#include "icing/text_classifier/lib3/utils/base/statusor.h" #include "icing/absl_ports/canonical_errors.h" #include "icing/index/index.h" #include "icing/legacy/core/icing-string-util.h" @@ -34,6 +35,19 @@ namespace icing { namespace lib { +/* static */ libtextclassifier3::StatusOr< + std::unique_ptr<StringSectionIndexingHandler>> +StringSectionIndexingHandler::Create(const Clock* clock, + const Normalizer* normalizer, + Index* index) { + ICING_RETURN_ERROR_IF_NULL(clock); + ICING_RETURN_ERROR_IF_NULL(normalizer); + ICING_RETURN_ERROR_IF_NULL(index); + + return std::unique_ptr<StringSectionIndexingHandler>( + new StringSectionIndexingHandler(clock, normalizer, index)); +} + libtextclassifier3::Status StringSectionIndexingHandler::Handle( const TokenizedDocument& tokenized_document, DocumentId document_id, bool recovery_mode, PutDocumentStatsProto* put_document_stats) { diff --git a/icing/index/string-section-indexing-handler.h b/icing/index/string-section-indexing-handler.h index 36f6a05..6abfba5 100644 --- a/icing/index/string-section-indexing-handler.h +++ b/icing/index/string-section-indexing-handler.h @@ -15,9 +15,12 @@ #ifndef ICING_INDEX_STRING_SECTION_INDEXING_HANDLER_H_ #define ICING_INDEX_STRING_SECTION_INDEXING_HANDLER_H_ +#include <memory> + #include "icing/text_classifier/lib3/utils/base/status.h" +#include "icing/text_classifier/lib3/utils/base/statusor.h" +#include "icing/index/data-indexing-handler.h" #include 
"icing/index/index.h" -#include "icing/index/section-indexing-handler.h" #include "icing/proto/logging.pb.h" #include "icing/store/document-id.h" #include "icing/transform/normalizer.h" @@ -27,14 +30,18 @@ namespace icing { namespace lib { -class StringSectionIndexingHandler : public SectionIndexingHandler { +class StringSectionIndexingHandler : public DataIndexingHandler { public: - explicit StringSectionIndexingHandler(const Clock* clock, - const Normalizer* normalizer, - Index* index) - : SectionIndexingHandler(clock), - normalizer_(*normalizer), - index_(*index) {} + // Creates a StringSectionIndexingHandler instance which does not take + // ownership of any input components. All pointers must refer to valid objects + // that outlive the created StringSectionIndexingHandler instance. + // + // Returns: + // - A StringSectionIndexingHandler instance on success + // - FAILED_PRECONDITION_ERROR if any of the input pointer is null + static libtextclassifier3::StatusOr< + std::unique_ptr<StringSectionIndexingHandler>> + Create(const Clock* clock, const Normalizer* normalizer, Index* index); ~StringSectionIndexingHandler() override = default; @@ -57,8 +64,13 @@ class StringSectionIndexingHandler : public SectionIndexingHandler { bool recovery_mode, PutDocumentStatsProto* put_document_stats) override; private: - const Normalizer& normalizer_; - Index& index_; + explicit StringSectionIndexingHandler(const Clock* clock, + const Normalizer* normalizer, + Index* index) + : DataIndexingHandler(clock), normalizer_(*normalizer), index_(*index) {} + + const Normalizer& normalizer_; // Does not own. + Index& index_; // Does not own. 
}; } // namespace lib diff --git a/icing/join/qualified-id-type-joinable-cache_test.cc b/icing/join/qualified-id-type-joinable-cache_test.cc deleted file mode 100644 index 088c878..0000000 --- a/icing/join/qualified-id-type-joinable-cache_test.cc +++ /dev/null @@ -1,496 +0,0 @@ -// Copyright (C) 2023 Google LLC -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "icing/join/qualified-id-type-joinable-cache.h" - -#include <memory> -#include <string> - -#include "icing/text_classifier/lib3/utils/base/status.h" -#include "gmock/gmock.h" -#include "gtest/gtest.h" -#include "icing/file/filesystem.h" -#include "icing/file/persistent-storage.h" -#include "icing/join/doc-join-info.h" -#include "icing/store/document-id.h" -#include "icing/store/persistent-hash-map-key-mapper.h" -#include "icing/testing/common-matchers.h" -#include "icing/testing/tmp-directory.h" -#include "icing/util/crc32.h" - -namespace icing { -namespace lib { - -namespace { - -using ::testing::Eq; -using ::testing::HasSubstr; -using ::testing::IsTrue; -using ::testing::Ne; -using ::testing::Not; - -using Crcs = PersistentStorage::Crcs; -using Info = QualifiedIdTypeJoinableCache::Info; - -static constexpr int32_t kCorruptedValueOffset = 3; - -class QualifiedIdTypeJoinableCacheTest : public ::testing::Test { - protected: - void SetUp() override { - base_dir_ = GetTestTempDir() + "/icing"; - ASSERT_THAT(filesystem_.CreateDirectoryRecursively(base_dir_.c_str()), - IsTrue()); - - 
working_path_ = base_dir_ + "/qualified_id_type_joinable_cache_test"; - } - - void TearDown() override { - filesystem_.DeleteDirectoryRecursively(base_dir_.c_str()); - } - - Filesystem filesystem_; - std::string base_dir_; - std::string working_path_; -}; - -TEST_F(QualifiedIdTypeJoinableCacheTest, InvalidWorkingPath) { - EXPECT_THAT( - QualifiedIdTypeJoinableCache::Create( - filesystem_, "/dev/null/qualified_id_type_joinable_cache_test"), - StatusIs(libtextclassifier3::StatusCode::INTERNAL)); -} - -TEST_F(QualifiedIdTypeJoinableCacheTest, InitializeNewFiles) { - { - // Create new qualified id type joinable cache - ASSERT_FALSE(filesystem_.DirectoryExists(working_path_.c_str())); - ICING_ASSERT_OK_AND_ASSIGN( - std::unique_ptr<QualifiedIdTypeJoinableCache> cache, - QualifiedIdTypeJoinableCache::Create(filesystem_, working_path_)); - - ICING_ASSERT_OK(cache->PersistToDisk()); - } - - // Metadata file should be initialized correctly for both info and crcs - // sections. - const std::string metadata_file_path = absl_ports::StrCat( - working_path_, "/", QualifiedIdTypeJoinableCache::kFilePrefix, ".m"); - auto metadata_buffer = std::make_unique<uint8_t[]>( - QualifiedIdTypeJoinableCache::kMetadataFileSize); - ASSERT_THAT( - filesystem_.PRead(metadata_file_path.c_str(), metadata_buffer.get(), - QualifiedIdTypeJoinableCache::kMetadataFileSize, - /*offset=*/0), - IsTrue()); - - // Check info section - const Info* info = reinterpret_cast<const Info*>( - metadata_buffer.get() + - QualifiedIdTypeJoinableCache::kInfoMetadataBufferOffset); - EXPECT_THAT(info->magic, Eq(Info::kMagic)); - EXPECT_THAT(info->last_added_document_id, Eq(kInvalidDocumentId)); - - // Check crcs section - const Crcs* crcs = reinterpret_cast<const Crcs*>( - metadata_buffer.get() + - QualifiedIdTypeJoinableCache::kCrcsMetadataBufferOffset); - // There are some initial info in KeyMapper, so storages_crc should be - // non-zero. 
- EXPECT_THAT(crcs->component_crcs.storages_crc, Ne(0)); - EXPECT_THAT(crcs->component_crcs.info_crc, - Eq(Crc32(std::string_view(reinterpret_cast<const char*>(info), - sizeof(Info))) - .Get())); - EXPECT_THAT(crcs->all_crc, - Eq(Crc32(std::string_view( - reinterpret_cast<const char*>(&crcs->component_crcs), - sizeof(Crcs::ComponentCrcs))) - .Get())); -} - -TEST_F(QualifiedIdTypeJoinableCacheTest, - InitializationShouldFailWithoutPersistToDiskOrDestruction) { - // Create new qualified id type joinable cache - ICING_ASSERT_OK_AND_ASSIGN( - std::unique_ptr<QualifiedIdTypeJoinableCache> cache, - QualifiedIdTypeJoinableCache::Create(filesystem_, working_path_)); - - // Insert some data. - ICING_ASSERT_OK( - cache->Put(DocJoinInfo(/*document_id=*/1, /*joinable_property_id=*/20), - /*ref_document_id=*/0)); - ICING_ASSERT_OK( - cache->Put(DocJoinInfo(/*document_id=*/3, /*joinable_property_id=*/20), - /*ref_document_id=*/2)); - ICING_ASSERT_OK( - cache->Put(DocJoinInfo(/*document_id=*/5, /*joinable_property_id=*/20), - /*ref_document_id=*/4)); - - // Without calling PersistToDisk, checksums will not be recomputed or synced - // to disk, so initializing another instance on the same files should fail. - EXPECT_THAT(QualifiedIdTypeJoinableCache::Create(filesystem_, working_path_), - StatusIs(libtextclassifier3::StatusCode::FAILED_PRECONDITION)); -} - -TEST_F(QualifiedIdTypeJoinableCacheTest, - InitializationShouldSucceedWithPersistToDisk) { - // Create new qualified id type joinable cache - ICING_ASSERT_OK_AND_ASSIGN( - std::unique_ptr<QualifiedIdTypeJoinableCache> cache1, - QualifiedIdTypeJoinableCache::Create(filesystem_, working_path_)); - - // Insert some data. 
- ICING_ASSERT_OK( - cache1->Put(DocJoinInfo(/*document_id=*/1, /*joinable_property_id=*/20), - /*ref_document_id=*/0)); - ICING_ASSERT_OK( - cache1->Put(DocJoinInfo(/*document_id=*/3, /*joinable_property_id=*/20), - /*ref_document_id=*/2)); - ICING_ASSERT_OK( - cache1->Put(DocJoinInfo(/*document_id=*/5, /*joinable_property_id=*/20), - /*ref_document_id=*/4)); - - // After calling PersistToDisk, all checksums should be recomputed and synced - // correctly to disk, so initializing another instance on the same files - // should succeed, and we should be able to get the same contents. - ICING_EXPECT_OK(cache1->PersistToDisk()); - - ICING_ASSERT_OK_AND_ASSIGN( - std::unique_ptr<QualifiedIdTypeJoinableCache> cache2, - QualifiedIdTypeJoinableCache::Create(filesystem_, working_path_)); - EXPECT_THAT( - cache2->Get(DocJoinInfo(/*document_id=*/1, /*joinable_property_id=*/20)), - IsOkAndHolds(0)); - EXPECT_THAT( - cache2->Get(DocJoinInfo(/*document_id=*/3, /*joinable_property_id=*/20)), - IsOkAndHolds(2)); - EXPECT_THAT( - cache2->Get(DocJoinInfo(/*document_id=*/5, /*joinable_property_id=*/20)), - IsOkAndHolds(4)); -} - -TEST_F(QualifiedIdTypeJoinableCacheTest, - InitializationShouldSucceedAfterDestruction) { - { - // Create new qualified id type joinable cache - ICING_ASSERT_OK_AND_ASSIGN( - std::unique_ptr<QualifiedIdTypeJoinableCache> cache, - QualifiedIdTypeJoinableCache::Create(filesystem_, working_path_)); - - // Insert some data. - ICING_ASSERT_OK( - cache->Put(DocJoinInfo(/*document_id=*/1, /*joinable_property_id=*/20), - /*ref_document_id=*/0)); - ICING_ASSERT_OK( - cache->Put(DocJoinInfo(/*document_id=*/3, /*joinable_property_id=*/20), - /*ref_document_id=*/2)); - ICING_ASSERT_OK( - cache->Put(DocJoinInfo(/*document_id=*/5, /*joinable_property_id=*/20), - /*ref_document_id=*/4)); - } - - { - // The previous instance went out of scope and was destructed. 
Although we - // didn't call PersistToDisk explicitly, the destructor should invoke it and - // thus initializing another instance on the same files should succeed, and - // we should be able to get the same contents. - ICING_ASSERT_OK_AND_ASSIGN( - std::unique_ptr<QualifiedIdTypeJoinableCache> cache, - QualifiedIdTypeJoinableCache::Create(filesystem_, working_path_)); - EXPECT_THAT(cache->Get(DocJoinInfo(/*document_id=*/1, - /*joinable_property_id=*/20)), - IsOkAndHolds(0)); - EXPECT_THAT(cache->Get(DocJoinInfo(/*document_id=*/3, - /*joinable_property_id=*/20)), - IsOkAndHolds(2)); - EXPECT_THAT(cache->Get(DocJoinInfo(/*document_id=*/5, - /*joinable_property_id=*/20)), - IsOkAndHolds(4)); - } -} - -TEST_F(QualifiedIdTypeJoinableCacheTest, - InitializeExistingFilesWithDifferentMagicShouldFail) { - { - // Create new qualified id type joinable cache - ICING_ASSERT_OK_AND_ASSIGN( - std::unique_ptr<QualifiedIdTypeJoinableCache> cache, - QualifiedIdTypeJoinableCache::Create(filesystem_, working_path_)); - ICING_ASSERT_OK( - cache->Put(DocJoinInfo(/*document_id=*/1, /*joinable_property_id=*/20), - /*ref_document_id=*/0)); - - ICING_ASSERT_OK(cache->PersistToDisk()); - } - - { - // Manually change magic and update checksum - const std::string metadata_file_path = absl_ports::StrCat( - working_path_, "/", QualifiedIdTypeJoinableCache::kFilePrefix, ".m"); - ScopedFd metadata_sfd(filesystem_.OpenForWrite(metadata_file_path.c_str())); - ASSERT_THAT(metadata_sfd.is_valid(), IsTrue()); - - auto metadata_buffer = std::make_unique<uint8_t[]>( - QualifiedIdTypeJoinableCache::kMetadataFileSize); - ASSERT_THAT( - filesystem_.PRead(metadata_sfd.get(), metadata_buffer.get(), - QualifiedIdTypeJoinableCache::kMetadataFileSize, - /*offset=*/0), - IsTrue()); - - // Manually change magic and update checksums. 
- Crcs* crcs = reinterpret_cast<Crcs*>( - metadata_buffer.get() + - QualifiedIdTypeJoinableCache::kCrcsMetadataBufferOffset); - Info* info = reinterpret_cast<Info*>( - metadata_buffer.get() + - QualifiedIdTypeJoinableCache::kInfoMetadataBufferOffset); - info->magic += kCorruptedValueOffset; - crcs->component_crcs.info_crc = info->ComputeChecksum().Get(); - crcs->all_crc = crcs->component_crcs.ComputeChecksum().Get(); - ASSERT_THAT(filesystem_.PWrite( - metadata_sfd.get(), /*offset=*/0, metadata_buffer.get(), - QualifiedIdTypeJoinableCache::kMetadataFileSize), - IsTrue()); - } - - // Attempt to create the qualified id type joinable cache with different - // magic. This should fail. - EXPECT_THAT(QualifiedIdTypeJoinableCache::Create(filesystem_, working_path_), - StatusIs(libtextclassifier3::StatusCode::FAILED_PRECONDITION, - HasSubstr("Incorrect magic value"))); -} - -TEST_F(QualifiedIdTypeJoinableCacheTest, - InitializeExistingFilesWithWrongAllCrcShouldFail) { - { - // Create new qualified id type joinable cache - ICING_ASSERT_OK_AND_ASSIGN( - std::unique_ptr<QualifiedIdTypeJoinableCache> cache, - QualifiedIdTypeJoinableCache::Create(filesystem_, working_path_)); - ICING_ASSERT_OK( - cache->Put(DocJoinInfo(/*document_id=*/1, /*joinable_property_id=*/20), - /*ref_document_id=*/0)); - - ICING_ASSERT_OK(cache->PersistToDisk()); - } - - { - const std::string metadata_file_path = absl_ports::StrCat( - working_path_, "/", QualifiedIdTypeJoinableCache::kFilePrefix, ".m"); - ScopedFd metadata_sfd(filesystem_.OpenForWrite(metadata_file_path.c_str())); - ASSERT_THAT(metadata_sfd.is_valid(), IsTrue()); - - auto metadata_buffer = std::make_unique<uint8_t[]>( - QualifiedIdTypeJoinableCache::kMetadataFileSize); - ASSERT_THAT( - filesystem_.PRead(metadata_sfd.get(), metadata_buffer.get(), - QualifiedIdTypeJoinableCache::kMetadataFileSize, - /*offset=*/0), - IsTrue()); - - // Manually corrupt all_crc - Crcs* crcs = reinterpret_cast<Crcs*>( - metadata_buffer.get() + - 
QualifiedIdTypeJoinableCache::kCrcsMetadataBufferOffset); - crcs->all_crc += kCorruptedValueOffset; - - ASSERT_THAT(filesystem_.PWrite( - metadata_sfd.get(), /*offset=*/0, metadata_buffer.get(), - QualifiedIdTypeJoinableCache::kMetadataFileSize), - IsTrue()); - } - - // Attempt to create the qualified id type joinable cache with metadata - // containing corrupted all_crc. This should fail. - EXPECT_THAT(QualifiedIdTypeJoinableCache::Create(filesystem_, working_path_), - StatusIs(libtextclassifier3::StatusCode::FAILED_PRECONDITION, - HasSubstr("Invalid all crc"))); -} - -TEST_F(QualifiedIdTypeJoinableCacheTest, - InitializeExistingFilesWithCorruptedInfoShouldFail) { - { - // Create new qualified id type joinable cache - ICING_ASSERT_OK_AND_ASSIGN( - std::unique_ptr<QualifiedIdTypeJoinableCache> cache, - QualifiedIdTypeJoinableCache::Create(filesystem_, working_path_)); - ICING_ASSERT_OK( - cache->Put(DocJoinInfo(/*document_id=*/1, /*joinable_property_id=*/20), - /*ref_document_id=*/0)); - - ICING_ASSERT_OK(cache->PersistToDisk()); - } - - { - const std::string metadata_file_path = absl_ports::StrCat( - working_path_, "/", QualifiedIdTypeJoinableCache::kFilePrefix, ".m"); - ScopedFd metadata_sfd(filesystem_.OpenForWrite(metadata_file_path.c_str())); - ASSERT_THAT(metadata_sfd.is_valid(), IsTrue()); - - auto metadata_buffer = std::make_unique<uint8_t[]>( - QualifiedIdTypeJoinableCache::kMetadataFileSize); - ASSERT_THAT( - filesystem_.PRead(metadata_sfd.get(), metadata_buffer.get(), - QualifiedIdTypeJoinableCache::kMetadataFileSize, - /*offset=*/0), - IsTrue()); - - // Modify info, but don't update the checksum. This would be similar to - // corruption of info. 
- Info* info = reinterpret_cast<Info*>( - metadata_buffer.get() + - QualifiedIdTypeJoinableCache::kInfoMetadataBufferOffset); - info->last_added_document_id += kCorruptedValueOffset; - - ASSERT_THAT(filesystem_.PWrite( - metadata_sfd.get(), /*offset=*/0, metadata_buffer.get(), - QualifiedIdTypeJoinableCache::kMetadataFileSize), - IsTrue()); - } - - // Attempt to create the qualified id type joinable cache with info that - // doesn't match its checksum. This should fail. - EXPECT_THAT(QualifiedIdTypeJoinableCache::Create(filesystem_, working_path_), - StatusIs(libtextclassifier3::StatusCode::FAILED_PRECONDITION, - HasSubstr("Invalid info crc"))); -} - -TEST_F( - QualifiedIdTypeJoinableCacheTest, - InitializeExistingFilesWithCorruptedDocumentToQualifiedIdMapperShouldFail) { - { - // Create new qualified id type joinable cache - ICING_ASSERT_OK_AND_ASSIGN( - std::unique_ptr<QualifiedIdTypeJoinableCache> cache, - QualifiedIdTypeJoinableCache::Create(filesystem_, working_path_)); - ICING_ASSERT_OK( - cache->Put(DocJoinInfo(/*document_id=*/1, /*joinable_property_id=*/20), - /*ref_document_id=*/0)); - - ICING_ASSERT_OK(cache->PersistToDisk()); - } - - { - // Corrupt document_to_qualified_id_mapper manually. - std::string mapper_working_path = absl_ports::StrCat( - working_path_, "/", QualifiedIdTypeJoinableCache::kFilePrefix, - "_mapper"); - ICING_ASSERT_OK_AND_ASSIGN( - std::unique_ptr<PersistentHashMapKeyMapper<DocumentId>> mapper, - PersistentHashMapKeyMapper<DocumentId>::Create( - filesystem_, std::move(mapper_working_path))); - ICING_ASSERT_OK_AND_ASSIGN(Crc32 old_crc, mapper->ComputeChecksum()); - ICING_ASSERT_OK(mapper->Put("foo", 12345)); - ICING_ASSERT_OK(mapper->PersistToDisk()); - ICING_ASSERT_OK_AND_ASSIGN(Crc32 new_crc, mapper->ComputeChecksum()); - ASSERT_THAT(old_crc, Not(Eq(new_crc))); - } - - // Attempt to create the qualified id type joinable cache with corrupted - // document_to_qualified_id_mapper. This should fail. 
- EXPECT_THAT(QualifiedIdTypeJoinableCache::Create(filesystem_, working_path_), - StatusIs(libtextclassifier3::StatusCode::FAILED_PRECONDITION, - HasSubstr("Invalid storages crc"))); -} - -TEST_F(QualifiedIdTypeJoinableCacheTest, InvalidPut) { - // Create new qualified id type joinable cache - ICING_ASSERT_OK_AND_ASSIGN( - std::unique_ptr<QualifiedIdTypeJoinableCache> cache, - QualifiedIdTypeJoinableCache::Create(filesystem_, working_path_)); - - DocJoinInfo default_invalid; - EXPECT_THAT(cache->Put(default_invalid, /*ref_document_id=*/0), - StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT)); -} - -TEST_F(QualifiedIdTypeJoinableCacheTest, InvalidGet) { - // Create new qualified id type joinable cache - ICING_ASSERT_OK_AND_ASSIGN( - std::unique_ptr<QualifiedIdTypeJoinableCache> cache, - QualifiedIdTypeJoinableCache::Create(filesystem_, working_path_)); - - DocJoinInfo default_invalid; - EXPECT_THAT(cache->Get(default_invalid), - StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT)); -} - -TEST_F(QualifiedIdTypeJoinableCacheTest, PutAndGet) { - DocJoinInfo target_info1(/*document_id=*/1, /*joinable_property_id=*/20); - DocumentId ref_document1 = 0; - - DocJoinInfo target_info2(/*document_id=*/3, /*joinable_property_id=*/13); - DocumentId ref_document2 = 2; - - DocJoinInfo target_info3(/*document_id=*/4, /*joinable_property_id=*/4); - DocumentId ref_document3 = ref_document1; - - { - // Create new qualified id type joinable cache - ICING_ASSERT_OK_AND_ASSIGN( - std::unique_ptr<QualifiedIdTypeJoinableCache> cache, - QualifiedIdTypeJoinableCache::Create(filesystem_, working_path_)); - - EXPECT_THAT(cache->Put(target_info1, /*ref_document_id=*/ref_document1), - IsOk()); - EXPECT_THAT(cache->Put(target_info2, /*ref_document_id=*/ref_document2), - IsOk()); - EXPECT_THAT(cache->Put(target_info3, /*ref_document_id=*/ref_document3), - IsOk()); - - EXPECT_THAT(cache->Get(target_info1), IsOkAndHolds(ref_document1)); - EXPECT_THAT(cache->Get(target_info2), 
IsOkAndHolds(ref_document2)); - EXPECT_THAT(cache->Get(target_info3), IsOkAndHolds(ref_document3)); - - ICING_ASSERT_OK(cache->PersistToDisk()); - } - - // Verify we can get all of them after destructing and re-initializing. - ICING_ASSERT_OK_AND_ASSIGN( - std::unique_ptr<QualifiedIdTypeJoinableCache> cache, - QualifiedIdTypeJoinableCache::Create(filesystem_, working_path_)); - EXPECT_THAT(cache->Get(target_info1), IsOkAndHolds(ref_document1)); - EXPECT_THAT(cache->Get(target_info2), IsOkAndHolds(ref_document2)); - EXPECT_THAT(cache->Get(target_info3), IsOkAndHolds(ref_document3)); -} - -TEST_F(QualifiedIdTypeJoinableCacheTest, - GetShouldReturnNotFoundErrorIfNotExist) { - DocJoinInfo target_info(/*document_id=*/1, /*joinable_property_id=*/20); - DocumentId ref_document = 0; - - // Create new qualified id type joinable cache - ICING_ASSERT_OK_AND_ASSIGN( - std::unique_ptr<QualifiedIdTypeJoinableCache> cache, - QualifiedIdTypeJoinableCache::Create(filesystem_, working_path_)); - - // Verify entry is not found in the beginning. - EXPECT_THAT(cache->Get(target_info), - StatusIs(libtextclassifier3::StatusCode::NOT_FOUND)); - - ICING_ASSERT_OK(cache->Put(target_info, /*ref_document_id=*/ref_document)); - ASSERT_THAT(cache->Get(target_info), IsOkAndHolds(ref_document)); - - // Get another non-existing entry. This should get NOT_FOUND_ERROR. - DocJoinInfo another_target_info(/*document_id=*/2, - /*joinable_property_id=*/20); - EXPECT_THAT(cache->Get(another_target_info), - StatusIs(libtextclassifier3::StatusCode::NOT_FOUND)); -} - -} // namespace - -} // namespace lib -} // namespace icing diff --git a/icing/join/qualified-id-type-joinable-cache.cc b/icing/join/qualified-id-type-joinable-index.cc index 4dc6e5a..231e78a 100644 --- a/icing/join/qualified-id-type-joinable-cache.cc +++ b/icing/join/qualified-id-type-joinable-index.cc @@ -12,16 +12,18 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#include "icing/join/qualified-id-type-joinable-cache.h" +#include "icing/join/qualified-id-type-joinable-index.h" #include <memory> #include <string> #include <string_view> +#include <vector> #include "icing/text_classifier/lib3/utils/base/status.h" #include "icing/text_classifier/lib3/utils/base/statusor.h" #include "icing/absl_ports/canonical_errors.h" #include "icing/absl_ports/str_cat.h" +#include "icing/file/destructible-directory.h" #include "icing/file/filesystem.h" #include "icing/join/doc-join-info.h" #include "icing/store/document-id.h" @@ -37,42 +39,50 @@ namespace lib { namespace { +DocumentId GetNewDocumentId( + const std::vector<DocumentId>& document_id_old_to_new, + DocumentId old_document_id) { + if (old_document_id >= document_id_old_to_new.size()) { + return kInvalidDocumentId; + } + return document_id_old_to_new[old_document_id]; +} + std::string GetMetadataFilePath(std::string_view working_path) { return absl_ports::StrCat(working_path, "/", - QualifiedIdTypeJoinableCache::kFilePrefix, ".m"); + QualifiedIdTypeJoinableIndex::kFilePrefix, ".m"); } std::string GetDocumentToQualifiedIdMapperPath(std::string_view working_path) { return absl_ports::StrCat( - working_path, "/", QualifiedIdTypeJoinableCache::kFilePrefix, "_mapper"); + working_path, "/", QualifiedIdTypeJoinableIndex::kFilePrefix, "_mapper"); } } // namespace /* static */ libtextclassifier3::StatusOr< - std::unique_ptr<QualifiedIdTypeJoinableCache>> -QualifiedIdTypeJoinableCache::Create(const Filesystem& filesystem, + std::unique_ptr<QualifiedIdTypeJoinableIndex>> +QualifiedIdTypeJoinableIndex::Create(const Filesystem& filesystem, std::string working_path) { if (!filesystem.FileExists(GetMetadataFilePath(working_path).c_str()) || !filesystem.DirectoryExists( GetDocumentToQualifiedIdMapperPath(working_path).c_str())) { // Discard working_path if any file/directory is missing, and reinitialize. 
- ICING_RETURN_IF_ERROR( - PersistentStorage::Discard(filesystem, working_path, kWorkingPathType)); + ICING_RETURN_IF_ERROR(Discard(filesystem, working_path)); return InitializeNewFiles(filesystem, std::move(working_path)); } return InitializeExistingFiles(filesystem, std::move(working_path)); } -QualifiedIdTypeJoinableCache::~QualifiedIdTypeJoinableCache() { +QualifiedIdTypeJoinableIndex::~QualifiedIdTypeJoinableIndex() { if (!PersistToDisk().ok()) { - ICING_LOG(WARNING) << "Failed to persist qualified id type joinable cache " + ICING_LOG(WARNING) << "Failed to persist qualified id type joinable index " "to disk while destructing " << working_path_; } } -libtextclassifier3::Status QualifiedIdTypeJoinableCache::Put( +libtextclassifier3::Status QualifiedIdTypeJoinableIndex::Put( const DocJoinInfo& doc_join_info, DocumentId ref_document_id) { if (!doc_join_info.is_valid()) { return absl_ports::InvalidArgumentError( @@ -82,12 +92,12 @@ libtextclassifier3::Status QualifiedIdTypeJoinableCache::Put( ICING_RETURN_IF_ERROR(document_to_qualified_id_mapper_->Put( encode_util::EncodeIntToCString(doc_join_info.value()), ref_document_id)); - // TODO(b/263890397): add delete propagation + // TODO(b/268521214): add data into delete propagation storage return libtextclassifier3::Status::OK; } -libtextclassifier3::StatusOr<DocumentId> QualifiedIdTypeJoinableCache::Get( +libtextclassifier3::StatusOr<DocumentId> QualifiedIdTypeJoinableIndex::Get( const DocJoinInfo& doc_join_info) const { if (!doc_join_info.is_valid()) { return absl_ports::InvalidArgumentError( @@ -98,9 +108,79 @@ libtextclassifier3::StatusOr<DocumentId> QualifiedIdTypeJoinableCache::Get( encode_util::EncodeIntToCString(doc_join_info.value())); } +libtextclassifier3::Status QualifiedIdTypeJoinableIndex::Optimize( + const std::vector<DocumentId>& document_id_old_to_new, + DocumentId new_last_added_document_id) { + std::string temp_working_path = working_path_ + "_temp"; + ICING_RETURN_IF_ERROR(Discard(filesystem_, 
temp_working_path)); + + DestructibleDirectory temp_working_path_ddir(&filesystem_, + std::move(temp_working_path)); + if (!temp_working_path_ddir.is_valid()) { + return absl_ports::InternalError( + "Unable to create temp directory to build new qualified id type " + "joinable index"); + } + + { + // Transfer all data from the current to new qualified id type joinable + // index. Also PersistToDisk and destruct the instance after finishing, so + // we can safely swap directories later. + ICING_ASSIGN_OR_RETURN( + std::unique_ptr<QualifiedIdTypeJoinableIndex> new_index, + Create(filesystem_, temp_working_path_ddir.dir())); + ICING_RETURN_IF_ERROR( + TransferIndex(document_id_old_to_new, new_index.get())); + new_index->set_last_added_document_id(new_last_added_document_id); + ICING_RETURN_IF_ERROR(new_index->PersistToDisk()); + } + + // Destruct current index's storage instances to safely swap directories. + // TODO(b/268521214): handle delete propagation storage + document_to_qualified_id_mapper_.reset(); + + if (!filesystem_.SwapFiles(temp_working_path_ddir.dir().c_str(), + working_path_.c_str())) { + return absl_ports::InternalError( + "Unable to apply new qualified id type joinable index due to failed " + "swap"); + } + + // Reinitialize qualified id type joinable index. + if (!filesystem_.PRead(GetMetadataFilePath(working_path_).c_str(), + metadata_buffer_.get(), kMetadataFileSize, + /*offset=*/0)) { + return absl_ports::InternalError("Fail to read metadata file"); + } + ICING_ASSIGN_OR_RETURN( + document_to_qualified_id_mapper_, + PersistentHashMapKeyMapper<DocumentId>::Create( + filesystem_, GetDocumentToQualifiedIdMapperPath(working_path_))); + + return libtextclassifier3::Status::OK; +} + +libtextclassifier3::Status QualifiedIdTypeJoinableIndex::Clear() { + document_to_qualified_id_mapper_.reset(); + // Discard and reinitialize document to qualified id mapper. 
+ std::string document_to_qualified_id_mapper_path = + GetDocumentToQualifiedIdMapperPath(working_path_); + ICING_RETURN_IF_ERROR(PersistentHashMapKeyMapper<DocumentId>::Delete( + filesystem_, document_to_qualified_id_mapper_path)); + ICING_ASSIGN_OR_RETURN( + document_to_qualified_id_mapper_, + PersistentHashMapKeyMapper<DocumentId>::Create( + filesystem_, std::move(document_to_qualified_id_mapper_path))); + + // TODO(b/268521214): clear delete propagation storage + + info().last_added_document_id = kInvalidDocumentId; + return libtextclassifier3::Status::OK; +} + /* static */ libtextclassifier3::StatusOr< - std::unique_ptr<QualifiedIdTypeJoinableCache>> -QualifiedIdTypeJoinableCache::InitializeNewFiles(const Filesystem& filesystem, + std::unique_ptr<QualifiedIdTypeJoinableIndex>> +QualifiedIdTypeJoinableIndex::InitializeNewFiles(const Filesystem& filesystem, std::string&& working_path) { // Create working directory. if (!filesystem.CreateDirectoryRecursively(working_path.c_str())) { @@ -116,25 +196,25 @@ QualifiedIdTypeJoinableCache::InitializeNewFiles(const Filesystem& filesystem, filesystem, GetDocumentToQualifiedIdMapperPath(working_path))); // Create instance. - auto new_type_joinable_cache = std::unique_ptr<QualifiedIdTypeJoinableCache>( - new QualifiedIdTypeJoinableCache( + auto new_index = std::unique_ptr<QualifiedIdTypeJoinableIndex>( + new QualifiedIdTypeJoinableIndex( filesystem, std::move(working_path), /*metadata_buffer=*/std::make_unique<uint8_t[]>(kMetadataFileSize), std::move(document_to_qualified_id_mapper))); // Initialize info content. - new_type_joinable_cache->info().magic = Info::kMagic; - new_type_joinable_cache->info().last_added_document_id = kInvalidDocumentId; + new_index->info().magic = Info::kMagic; + new_index->info().last_added_document_id = kInvalidDocumentId; // Initialize new PersistentStorage. The initial checksums will be computed // and set via InitializeNewStorage. Also write them into disk as well. 
- ICING_RETURN_IF_ERROR(new_type_joinable_cache->InitializeNewStorage()); - ICING_RETURN_IF_ERROR(new_type_joinable_cache->PersistMetadataToDisk()); + ICING_RETURN_IF_ERROR(new_index->InitializeNewStorage()); + ICING_RETURN_IF_ERROR(new_index->PersistMetadataToDisk()); - return new_type_joinable_cache; + return new_index; } /* static */ libtextclassifier3::StatusOr< - std::unique_ptr<QualifiedIdTypeJoinableCache>> -QualifiedIdTypeJoinableCache::InitializeExistingFiles( + std::unique_ptr<QualifiedIdTypeJoinableIndex>> +QualifiedIdTypeJoinableIndex::InitializeExistingFiles( const Filesystem& filesystem, std::string&& working_path) { // PRead metadata file. auto metadata_buffer = std::make_unique<uint8_t[]>(kMetadataFileSize); @@ -151,23 +231,53 @@ QualifiedIdTypeJoinableCache::InitializeExistingFiles( filesystem, GetDocumentToQualifiedIdMapperPath(working_path))); // Create instance. - auto type_joinable_cache = std::unique_ptr<QualifiedIdTypeJoinableCache>( - new QualifiedIdTypeJoinableCache( + auto type_joinable_index = std::unique_ptr<QualifiedIdTypeJoinableIndex>( + new QualifiedIdTypeJoinableIndex( filesystem, std::move(working_path), std::move(metadata_buffer), std::move(document_to_qualified_id_mapper))); // Initialize existing PersistentStorage. Checksums will be validated. - ICING_RETURN_IF_ERROR(type_joinable_cache->InitializeExistingStorage()); + ICING_RETURN_IF_ERROR(type_joinable_index->InitializeExistingStorage()); // Validate magic. 
- if (type_joinable_cache->info().magic != Info::kMagic) { + if (type_joinable_index->info().magic != Info::kMagic) { return absl_ports::FailedPreconditionError("Incorrect magic value"); } - return type_joinable_cache; + return type_joinable_index; +} + +libtextclassifier3::Status QualifiedIdTypeJoinableIndex::TransferIndex( + const std::vector<DocumentId>& document_id_old_to_new, + QualifiedIdTypeJoinableIndex* new_index) const { + std::unique_ptr<KeyMapper<DocumentId>::Iterator> iter = + document_to_qualified_id_mapper_->GetIterator(); + while (iter->Advance()) { + DocJoinInfo old_doc_join_info( + encode_util::DecodeIntFromCString(iter->GetKey())); + DocumentId old_ref_document_id = iter->GetValue(); + + // Translate to new doc ids. + DocumentId new_document_id = GetNewDocumentId( + document_id_old_to_new, old_doc_join_info.document_id()); + DocumentId new_ref_document_id = + GetNewDocumentId(document_id_old_to_new, old_ref_document_id); + + if (new_document_id != kInvalidDocumentId && + new_ref_document_id != kInvalidDocumentId) { + ICING_RETURN_IF_ERROR( + new_index->Put(DocJoinInfo(new_document_id, + old_doc_join_info.joinable_property_id()), + new_ref_document_id)); + } + } + + // TODO(b/268521214): transfer delete propagation storage + + return libtextclassifier3::Status::OK; } libtextclassifier3::Status -QualifiedIdTypeJoinableCache::PersistMetadataToDisk() { +QualifiedIdTypeJoinableIndex::PersistMetadataToDisk() { std::string metadata_file_path = GetMetadataFilePath(working_path_); ScopedFd sfd(filesystem_.OpenForWrite(metadata_file_path.c_str())); @@ -188,17 +298,17 @@ QualifiedIdTypeJoinableCache::PersistMetadataToDisk() { } libtextclassifier3::Status -QualifiedIdTypeJoinableCache::PersistStoragesToDisk() { +QualifiedIdTypeJoinableIndex::PersistStoragesToDisk() { return document_to_qualified_id_mapper_->PersistToDisk(); } libtextclassifier3::StatusOr<Crc32> -QualifiedIdTypeJoinableCache::ComputeInfoChecksum() { 
+QualifiedIdTypeJoinableIndex::ComputeInfoChecksum() { return info().ComputeChecksum(); } libtextclassifier3::StatusOr<Crc32> -QualifiedIdTypeJoinableCache::ComputeStoragesChecksum() { +QualifiedIdTypeJoinableIndex::ComputeStoragesChecksum() { return document_to_qualified_id_mapper_->ComputeChecksum(); } diff --git a/icing/join/qualified-id-type-joinable-cache.h b/icing/join/qualified-id-type-joinable-index.h index 08f6455..794f33f 100644 --- a/icing/join/qualified-id-type-joinable-cache.h +++ b/icing/join/qualified-id-type-joinable-index.h @@ -12,13 +12,14 @@ // See the License for the specific language governing permissions and // limitations under the License. -#ifndef ICING_JOIN_QUALIFIED_ID_TYPE_JOINABLE_CACHE_H_ -#define ICING_JOIN_QUALIFIED_ID_TYPE_JOINABLE_CACHE_H_ +#ifndef ICING_JOIN_QUALIFIED_ID_TYPE_JOINABLE_INDEX_H_ +#define ICING_JOIN_QUALIFIED_ID_TYPE_JOINABLE_INDEX_H_ #include <cstdint> #include <memory> #include <string> #include <string_view> +#include <vector> #include "icing/text_classifier/lib3/utils/base/status.h" #include "icing/text_classifier/lib3/utils/base/statusor.h" @@ -32,9 +33,9 @@ namespace icing { namespace lib { -// QualifiedIdTypeJoinableCache: a class to maintain cache data mapping -// DocJoinInfo to joinable qualified ids and delete propagation info. -class QualifiedIdTypeJoinableCache : public PersistentStorage { +// QualifiedIdTypeJoinableIndex: a class to maintain data mapping DocJoinInfo to +// joinable qualified ids and delete propagation info. 
+class QualifiedIdTypeJoinableIndex : public PersistentStorage { public: struct Info { static constexpr int32_t kMagic = 0x48cabdc6; @@ -58,16 +59,17 @@ class QualifiedIdTypeJoinableCache : public PersistentStorage { static constexpr WorkingPathType kWorkingPathType = WorkingPathType::kDirectory; - static constexpr std::string_view kFilePrefix = "qualified_id_joinable_cache"; + static constexpr std::string_view kFilePrefix = + "qualified_id_type_joinable_index"; - // Creates a QualifiedIdTypeJoinableCache instance to store qualified ids for + // Creates a QualifiedIdTypeJoinableIndex instance to store qualified ids for // future joining search. If any of the underlying file is missing, then // delete the whole working_path and (re)initialize with new ones. Otherwise // initialize and create the instance by existing files. // // filesystem: Object to make system level calls // working_path: Specifies the working path for PersistentStorage. - // QualifiedIdTypeJoinableCache uses working path as working + // QualifiedIdTypeJoinableIndex uses working path as working // directory and all related files will be stored under this // directory. It takes full ownership and of working_path_, // including creation/deletion. It is the caller's @@ -84,21 +86,32 @@ class QualifiedIdTypeJoinableCache : public PersistentStorage { // - INTERNAL_ERROR on I/O errors // - Any KeyMapper errors static libtextclassifier3::StatusOr< - std::unique_ptr<QualifiedIdTypeJoinableCache>> + std::unique_ptr<QualifiedIdTypeJoinableIndex>> Create(const Filesystem& filesystem, std::string working_path); + // Deletes QualifiedIdTypeJoinableIndex under working_path. + // + // Returns: + // - OK on success + // - INTERNAL_ERROR on I/O error + static libtextclassifier3::Status Discard(const Filesystem& filesystem, + const std::string& working_path) { + return PersistentStorage::Discard(filesystem, working_path, + kWorkingPathType); + } + // Delete copy and move constructor/assignment operator. 
- QualifiedIdTypeJoinableCache(const QualifiedIdTypeJoinableCache&) = delete; - QualifiedIdTypeJoinableCache& operator=(const QualifiedIdTypeJoinableCache&) = + QualifiedIdTypeJoinableIndex(const QualifiedIdTypeJoinableIndex&) = delete; + QualifiedIdTypeJoinableIndex& operator=(const QualifiedIdTypeJoinableIndex&) = delete; - QualifiedIdTypeJoinableCache(QualifiedIdTypeJoinableCache&&) = delete; - QualifiedIdTypeJoinableCache& operator=(QualifiedIdTypeJoinableCache&&) = + QualifiedIdTypeJoinableIndex(QualifiedIdTypeJoinableIndex&&) = delete; + QualifiedIdTypeJoinableIndex& operator=(QualifiedIdTypeJoinableIndex&&) = delete; - ~QualifiedIdTypeJoinableCache() override; + ~QualifiedIdTypeJoinableIndex() override; - // Puts a new data into cache: DocJoinInfo (DocumentId, JoinablePropertyId) + // Puts a new data into index: DocJoinInfo (DocumentId, JoinablePropertyId) // references to ref_document_id. // // Returns: @@ -119,8 +132,50 @@ class QualifiedIdTypeJoinableCache : public PersistentStorage { libtextclassifier3::StatusOr<DocumentId> Get( const DocJoinInfo& doc_join_info) const; + // Reduces internal file sizes by reclaiming space and ids of deleted + // documents. Qualified id type joinable index will convert all entries to the + // new document ids. + // + // - document_id_old_to_new: a map for converting old document id to new + // document id. + // - new_last_added_document_id: will be used to update the last added + // document id in the qualified id type joinable + // index. + // + // Returns: + // - OK on success + // - INTERNAL_ERROR on I/O error. This could potentially leave the index in + // an invalid state and the caller should handle it properly (e.g. discard + // and rebuild) + libtextclassifier3::Status Optimize( + const std::vector<DocumentId>& document_id_old_to_new, + DocumentId new_last_added_document_id); + + // Clears all data and set last_added_document_id to kInvalidDocumentId. 
+ // + // Returns: + // - OK on success + // - INTERNAL_ERROR on I/O error + libtextclassifier3::Status Clear(); + + int32_t size() const { return document_to_qualified_id_mapper_->num_keys(); } + + bool empty() const { return size() == 0; } + + DocumentId last_added_document_id() const { + return info().last_added_document_id; + } + + void set_last_added_document_id(DocumentId document_id) { + Info& info_ref = info(); + if (info_ref.last_added_document_id == kInvalidDocumentId || + document_id > info_ref.last_added_document_id) { + info_ref.last_added_document_id = document_id; + } + } + private: - explicit QualifiedIdTypeJoinableCache( + explicit QualifiedIdTypeJoinableIndex( const Filesystem& filesystem, std::string&& working_path, std::unique_ptr<uint8_t[]> metadata_buffer, std::unique_ptr<KeyMapper<DocumentId>> key_mapper) @@ -130,14 +185,25 @@ class QualifiedIdTypeJoinableCache : public PersistentStorage { document_to_qualified_id_mapper_(std::move(key_mapper)) {} static libtextclassifier3::StatusOr< - std::unique_ptr<QualifiedIdTypeJoinableCache>> + std::unique_ptr<QualifiedIdTypeJoinableIndex>> InitializeNewFiles(const Filesystem& filesystem, std::string&& working_path); static libtextclassifier3::StatusOr< - std::unique_ptr<QualifiedIdTypeJoinableCache>> + std::unique_ptr<QualifiedIdTypeJoinableIndex>> InitializeExistingFiles(const Filesystem& filesystem, std::string&& working_path); + // Transfers qualified id type joinable index data from the current to + // new_index and convert to new document id according to + // document_id_old_to_new. It is a helper function for Optimize. + // + // Returns: + // - OK on success + // - INTERNAL_ERROR on I/O error + libtextclassifier3::Status TransferIndex( + const std::vector<DocumentId>& document_id_old_to_new, + QualifiedIdTypeJoinableIndex* new_index) const; + // Flushes contents of metadata file. 
// // Returns: @@ -193,10 +259,10 @@ class QualifiedIdTypeJoinableCache : public PersistentStorage { // qualified id string). std::unique_ptr<KeyMapper<DocumentId>> document_to_qualified_id_mapper_; - // TODO(b/263890397): add delete propagation storage + // TODO(b/268521214): add delete propagation storage }; } // namespace lib } // namespace icing -#endif // ICING_JOIN_QUALIFIED_ID_TYPE_JOINABLE_CACHE_H_ +#endif // ICING_JOIN_QUALIFIED_ID_TYPE_JOINABLE_INDEX_H_ diff --git a/icing/join/qualified-id-type-joinable-index_test.cc b/icing/join/qualified-id-type-joinable-index_test.cc new file mode 100644 index 0000000..6cbc9e4 --- /dev/null +++ b/icing/join/qualified-id-type-joinable-index_test.cc @@ -0,0 +1,739 @@ +// Copyright (C) 2023 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "icing/join/qualified-id-type-joinable-index.h" + +#include <memory> +#include <string> + +#include "icing/text_classifier/lib3/utils/base/status.h" +#include "gmock/gmock.h" +#include "gtest/gtest.h" +#include "icing/file/filesystem.h" +#include "icing/file/persistent-storage.h" +#include "icing/join/doc-join-info.h" +#include "icing/store/document-id.h" +#include "icing/store/persistent-hash-map-key-mapper.h" +#include "icing/testing/common-matchers.h" +#include "icing/testing/tmp-directory.h" +#include "icing/util/crc32.h" + +namespace icing { +namespace lib { + +namespace { + +using ::testing::Eq; +using ::testing::HasSubstr; +using ::testing::IsEmpty; +using ::testing::IsTrue; +using ::testing::Lt; +using ::testing::Ne; +using ::testing::Not; +using ::testing::Pointee; +using ::testing::SizeIs; + +using Crcs = PersistentStorage::Crcs; +using Info = QualifiedIdTypeJoinableIndex::Info; + +static constexpr int32_t kCorruptedValueOffset = 3; + +class QualifiedIdTypeJoinableIndexTest : public ::testing::Test { + protected: + void SetUp() override { + base_dir_ = GetTestTempDir() + "/icing"; + ASSERT_THAT(filesystem_.CreateDirectoryRecursively(base_dir_.c_str()), + IsTrue()); + + working_path_ = base_dir_ + "/qualified_id_type_joinable_index_test"; + } + + void TearDown() override { + filesystem_.DeleteDirectoryRecursively(base_dir_.c_str()); + } + + Filesystem filesystem_; + std::string base_dir_; + std::string working_path_; +}; + +TEST_F(QualifiedIdTypeJoinableIndexTest, InvalidWorkingPath) { + EXPECT_THAT( + QualifiedIdTypeJoinableIndex::Create( + filesystem_, "/dev/null/qualified_id_type_joinable_index_test"), + StatusIs(libtextclassifier3::StatusCode::INTERNAL)); +} + +TEST_F(QualifiedIdTypeJoinableIndexTest, InitializeNewFiles) { + { + // Create new qualified id type joinable index + ASSERT_FALSE(filesystem_.DirectoryExists(working_path_.c_str())); + ICING_ASSERT_OK_AND_ASSIGN( + std::unique_ptr<QualifiedIdTypeJoinableIndex> index, + 
QualifiedIdTypeJoinableIndex::Create(filesystem_, working_path_)); + EXPECT_THAT(index, Pointee(IsEmpty())); + + ICING_ASSERT_OK(index->PersistToDisk()); + } + + // Metadata file should be initialized correctly for both info and crcs + // sections. + const std::string metadata_file_path = absl_ports::StrCat( + working_path_, "/", QualifiedIdTypeJoinableIndex::kFilePrefix, ".m"); + auto metadata_buffer = std::make_unique<uint8_t[]>( + QualifiedIdTypeJoinableIndex::kMetadataFileSize); + ASSERT_THAT( + filesystem_.PRead(metadata_file_path.c_str(), metadata_buffer.get(), + QualifiedIdTypeJoinableIndex::kMetadataFileSize, + /*offset=*/0), + IsTrue()); + + // Check info section + const Info* info = reinterpret_cast<const Info*>( + metadata_buffer.get() + + QualifiedIdTypeJoinableIndex::kInfoMetadataBufferOffset); + EXPECT_THAT(info->magic, Eq(Info::kMagic)); + EXPECT_THAT(info->last_added_document_id, Eq(kInvalidDocumentId)); + + // Check crcs section + const Crcs* crcs = reinterpret_cast<const Crcs*>( + metadata_buffer.get() + + QualifiedIdTypeJoinableIndex::kCrcsMetadataBufferOffset); + // There are some initial info in KeyMapper, so storages_crc should be + // non-zero. + EXPECT_THAT(crcs->component_crcs.storages_crc, Ne(0)); + EXPECT_THAT(crcs->component_crcs.info_crc, + Eq(Crc32(std::string_view(reinterpret_cast<const char*>(info), + sizeof(Info))) + .Get())); + EXPECT_THAT(crcs->all_crc, + Eq(Crc32(std::string_view( + reinterpret_cast<const char*>(&crcs->component_crcs), + sizeof(Crcs::ComponentCrcs))) + .Get())); +} + +TEST_F(QualifiedIdTypeJoinableIndexTest, + InitializationShouldFailWithoutPersistToDiskOrDestruction) { + // Create new qualified id type joinable index + ICING_ASSERT_OK_AND_ASSIGN( + std::unique_ptr<QualifiedIdTypeJoinableIndex> index, + QualifiedIdTypeJoinableIndex::Create(filesystem_, working_path_)); + + // Insert some data. 
+ ICING_ASSERT_OK( + index->Put(DocJoinInfo(/*document_id=*/1, /*joinable_property_id=*/20), + /*ref_document_id=*/0)); + ICING_ASSERT_OK( + index->Put(DocJoinInfo(/*document_id=*/3, /*joinable_property_id=*/20), + /*ref_document_id=*/2)); + ICING_ASSERT_OK( + index->Put(DocJoinInfo(/*document_id=*/5, /*joinable_property_id=*/20), + /*ref_document_id=*/4)); + + // Without calling PersistToDisk, checksums will not be recomputed or synced + // to disk, so initializing another instance on the same files should fail. + EXPECT_THAT(QualifiedIdTypeJoinableIndex::Create(filesystem_, working_path_), + StatusIs(libtextclassifier3::StatusCode::FAILED_PRECONDITION)); +} + +TEST_F(QualifiedIdTypeJoinableIndexTest, + InitializationShouldSucceedWithPersistToDisk) { + // Create new qualified id type joinable index + ICING_ASSERT_OK_AND_ASSIGN( + std::unique_ptr<QualifiedIdTypeJoinableIndex> index1, + QualifiedIdTypeJoinableIndex::Create(filesystem_, working_path_)); + + // Insert some data. + ICING_ASSERT_OK( + index1->Put(DocJoinInfo(/*document_id=*/1, /*joinable_property_id=*/20), + /*ref_document_id=*/0)); + ICING_ASSERT_OK( + index1->Put(DocJoinInfo(/*document_id=*/3, /*joinable_property_id=*/20), + /*ref_document_id=*/2)); + ICING_ASSERT_OK( + index1->Put(DocJoinInfo(/*document_id=*/5, /*joinable_property_id=*/20), + /*ref_document_id=*/4)); + ASSERT_THAT(index1, Pointee(SizeIs(3))); + + // After calling PersistToDisk, all checksums should be recomputed and synced + // correctly to disk, so initializing another instance on the same files + // should succeed, and we should be able to get the same contents. 
+ ICING_EXPECT_OK(index1->PersistToDisk()); + + ICING_ASSERT_OK_AND_ASSIGN( + std::unique_ptr<QualifiedIdTypeJoinableIndex> index2, + QualifiedIdTypeJoinableIndex::Create(filesystem_, working_path_)); + EXPECT_THAT(index2, Pointee(SizeIs(3))); + EXPECT_THAT( + index2->Get(DocJoinInfo(/*document_id=*/1, /*joinable_property_id=*/20)), + IsOkAndHolds(0)); + EXPECT_THAT( + index2->Get(DocJoinInfo(/*document_id=*/3, /*joinable_property_id=*/20)), + IsOkAndHolds(2)); + EXPECT_THAT( + index2->Get(DocJoinInfo(/*document_id=*/5, /*joinable_property_id=*/20)), + IsOkAndHolds(4)); +} + +TEST_F(QualifiedIdTypeJoinableIndexTest, + InitializationShouldSucceedAfterDestruction) { + { + // Create new qualified id type joinable index + ICING_ASSERT_OK_AND_ASSIGN( + std::unique_ptr<QualifiedIdTypeJoinableIndex> index, + QualifiedIdTypeJoinableIndex::Create(filesystem_, working_path_)); + + // Insert some data. + ICING_ASSERT_OK( + index->Put(DocJoinInfo(/*document_id=*/1, /*joinable_property_id=*/20), + /*ref_document_id=*/0)); + ICING_ASSERT_OK( + index->Put(DocJoinInfo(/*document_id=*/3, /*joinable_property_id=*/20), + /*ref_document_id=*/2)); + ICING_ASSERT_OK( + index->Put(DocJoinInfo(/*document_id=*/5, /*joinable_property_id=*/20), + /*ref_document_id=*/4)); + ASSERT_THAT(index, Pointee(SizeIs(3))); + } + + { + // The previous instance went out of scope and was destructed. Although we + // didn't call PersistToDisk explicitly, the destructor should invoke it and + // thus initializing another instance on the same files should succeed, and + // we should be able to get the same contents. 
+ ICING_ASSERT_OK_AND_ASSIGN( + std::unique_ptr<QualifiedIdTypeJoinableIndex> index, + QualifiedIdTypeJoinableIndex::Create(filesystem_, working_path_)); + EXPECT_THAT(index, Pointee(SizeIs(3))); + EXPECT_THAT(index->Get(DocJoinInfo(/*document_id=*/1, + /*joinable_property_id=*/20)), + IsOkAndHolds(0)); + EXPECT_THAT(index->Get(DocJoinInfo(/*document_id=*/3, + /*joinable_property_id=*/20)), + IsOkAndHolds(2)); + EXPECT_THAT(index->Get(DocJoinInfo(/*document_id=*/5, + /*joinable_property_id=*/20)), + IsOkAndHolds(4)); + } +} + +TEST_F(QualifiedIdTypeJoinableIndexTest, + InitializeExistingFilesWithDifferentMagicShouldFail) { + { + // Create new qualified id type joinable index + ICING_ASSERT_OK_AND_ASSIGN( + std::unique_ptr<QualifiedIdTypeJoinableIndex> index, + QualifiedIdTypeJoinableIndex::Create(filesystem_, working_path_)); + ICING_ASSERT_OK( + index->Put(DocJoinInfo(/*document_id=*/1, /*joinable_property_id=*/20), + /*ref_document_id=*/0)); + + ICING_ASSERT_OK(index->PersistToDisk()); + } + + { + // Manually change magic and update checksum + const std::string metadata_file_path = absl_ports::StrCat( + working_path_, "/", QualifiedIdTypeJoinableIndex::kFilePrefix, ".m"); + ScopedFd metadata_sfd(filesystem_.OpenForWrite(metadata_file_path.c_str())); + ASSERT_THAT(metadata_sfd.is_valid(), IsTrue()); + + auto metadata_buffer = std::make_unique<uint8_t[]>( + QualifiedIdTypeJoinableIndex::kMetadataFileSize); + ASSERT_THAT( + filesystem_.PRead(metadata_sfd.get(), metadata_buffer.get(), + QualifiedIdTypeJoinableIndex::kMetadataFileSize, + /*offset=*/0), + IsTrue()); + + // Manually change magic and update checksums. 
+ Crcs* crcs = reinterpret_cast<Crcs*>( + metadata_buffer.get() + + QualifiedIdTypeJoinableIndex::kCrcsMetadataBufferOffset); + Info* info = reinterpret_cast<Info*>( + metadata_buffer.get() + + QualifiedIdTypeJoinableIndex::kInfoMetadataBufferOffset); + info->magic += kCorruptedValueOffset; + crcs->component_crcs.info_crc = info->ComputeChecksum().Get(); + crcs->all_crc = crcs->component_crcs.ComputeChecksum().Get(); + ASSERT_THAT(filesystem_.PWrite( + metadata_sfd.get(), /*offset=*/0, metadata_buffer.get(), + QualifiedIdTypeJoinableIndex::kMetadataFileSize), + IsTrue()); + } + + // Attempt to create the qualified id type joinable index with different + // magic. This should fail. + EXPECT_THAT(QualifiedIdTypeJoinableIndex::Create(filesystem_, working_path_), + StatusIs(libtextclassifier3::StatusCode::FAILED_PRECONDITION, + HasSubstr("Incorrect magic value"))); +} + +TEST_F(QualifiedIdTypeJoinableIndexTest, + InitializeExistingFilesWithWrongAllCrcShouldFail) { + { + // Create new qualified id type joinable index + ICING_ASSERT_OK_AND_ASSIGN( + std::unique_ptr<QualifiedIdTypeJoinableIndex> index, + QualifiedIdTypeJoinableIndex::Create(filesystem_, working_path_)); + ICING_ASSERT_OK( + index->Put(DocJoinInfo(/*document_id=*/1, /*joinable_property_id=*/20), + /*ref_document_id=*/0)); + + ICING_ASSERT_OK(index->PersistToDisk()); + } + + { + const std::string metadata_file_path = absl_ports::StrCat( + working_path_, "/", QualifiedIdTypeJoinableIndex::kFilePrefix, ".m"); + ScopedFd metadata_sfd(filesystem_.OpenForWrite(metadata_file_path.c_str())); + ASSERT_THAT(metadata_sfd.is_valid(), IsTrue()); + + auto metadata_buffer = std::make_unique<uint8_t[]>( + QualifiedIdTypeJoinableIndex::kMetadataFileSize); + ASSERT_THAT( + filesystem_.PRead(metadata_sfd.get(), metadata_buffer.get(), + QualifiedIdTypeJoinableIndex::kMetadataFileSize, + /*offset=*/0), + IsTrue()); + + // Manually corrupt all_crc + Crcs* crcs = reinterpret_cast<Crcs*>( + metadata_buffer.get() + + 
QualifiedIdTypeJoinableIndex::kCrcsMetadataBufferOffset); + crcs->all_crc += kCorruptedValueOffset; + + ASSERT_THAT(filesystem_.PWrite( + metadata_sfd.get(), /*offset=*/0, metadata_buffer.get(), + QualifiedIdTypeJoinableIndex::kMetadataFileSize), + IsTrue()); + } + + // Attempt to create the qualified id type joinable index with metadata + // containing corrupted all_crc. This should fail. + EXPECT_THAT(QualifiedIdTypeJoinableIndex::Create(filesystem_, working_path_), + StatusIs(libtextclassifier3::StatusCode::FAILED_PRECONDITION, + HasSubstr("Invalid all crc"))); +} + +TEST_F(QualifiedIdTypeJoinableIndexTest, + InitializeExistingFilesWithCorruptedInfoShouldFail) { + { + // Create new qualified id type joinable index + ICING_ASSERT_OK_AND_ASSIGN( + std::unique_ptr<QualifiedIdTypeJoinableIndex> index, + QualifiedIdTypeJoinableIndex::Create(filesystem_, working_path_)); + ICING_ASSERT_OK( + index->Put(DocJoinInfo(/*document_id=*/1, /*joinable_property_id=*/20), + /*ref_document_id=*/0)); + + ICING_ASSERT_OK(index->PersistToDisk()); + } + + { + const std::string metadata_file_path = absl_ports::StrCat( + working_path_, "/", QualifiedIdTypeJoinableIndex::kFilePrefix, ".m"); + ScopedFd metadata_sfd(filesystem_.OpenForWrite(metadata_file_path.c_str())); + ASSERT_THAT(metadata_sfd.is_valid(), IsTrue()); + + auto metadata_buffer = std::make_unique<uint8_t[]>( + QualifiedIdTypeJoinableIndex::kMetadataFileSize); + ASSERT_THAT( + filesystem_.PRead(metadata_sfd.get(), metadata_buffer.get(), + QualifiedIdTypeJoinableIndex::kMetadataFileSize, + /*offset=*/0), + IsTrue()); + + // Modify info, but don't update the checksum. This would be similar to + // corruption of info. 
+ Info* info = reinterpret_cast<Info*>( + metadata_buffer.get() + + QualifiedIdTypeJoinableIndex::kInfoMetadataBufferOffset); + info->last_added_document_id += kCorruptedValueOffset; + + ASSERT_THAT(filesystem_.PWrite( + metadata_sfd.get(), /*offset=*/0, metadata_buffer.get(), + QualifiedIdTypeJoinableIndex::kMetadataFileSize), + IsTrue()); + } + + // Attempt to create the qualified id type joinable index with info that + // doesn't match its checksum. This should fail. + EXPECT_THAT(QualifiedIdTypeJoinableIndex::Create(filesystem_, working_path_), + StatusIs(libtextclassifier3::StatusCode::FAILED_PRECONDITION, + HasSubstr("Invalid info crc"))); +} + +TEST_F( + QualifiedIdTypeJoinableIndexTest, + InitializeExistingFilesWithCorruptedDocumentToQualifiedIdMapperShouldFail) { + { + // Create new qualified id type joinable index + ICING_ASSERT_OK_AND_ASSIGN( + std::unique_ptr<QualifiedIdTypeJoinableIndex> index, + QualifiedIdTypeJoinableIndex::Create(filesystem_, working_path_)); + ICING_ASSERT_OK( + index->Put(DocJoinInfo(/*document_id=*/1, /*joinable_property_id=*/20), + /*ref_document_id=*/0)); + + ICING_ASSERT_OK(index->PersistToDisk()); + } + + { + // Corrupt document_to_qualified_id_mapper manually. + std::string mapper_working_path = absl_ports::StrCat( + working_path_, "/", QualifiedIdTypeJoinableIndex::kFilePrefix, + "_mapper"); + ICING_ASSERT_OK_AND_ASSIGN( + std::unique_ptr<PersistentHashMapKeyMapper<DocumentId>> mapper, + PersistentHashMapKeyMapper<DocumentId>::Create( + filesystem_, std::move(mapper_working_path))); + ICING_ASSERT_OK_AND_ASSIGN(Crc32 old_crc, mapper->ComputeChecksum()); + ICING_ASSERT_OK(mapper->Put("foo", 12345)); + ICING_ASSERT_OK(mapper->PersistToDisk()); + ICING_ASSERT_OK_AND_ASSIGN(Crc32 new_crc, mapper->ComputeChecksum()); + ASSERT_THAT(old_crc, Not(Eq(new_crc))); + } + + // Attempt to create the qualified id type joinable index with corrupted + // document_to_qualified_id_mapper. This should fail. 
+ EXPECT_THAT(QualifiedIdTypeJoinableIndex::Create(filesystem_, working_path_), + StatusIs(libtextclassifier3::StatusCode::FAILED_PRECONDITION, + HasSubstr("Invalid storages crc"))); +} + +TEST_F(QualifiedIdTypeJoinableIndexTest, InvalidPut) { + // Create new qualified id type joinable index + ICING_ASSERT_OK_AND_ASSIGN( + std::unique_ptr<QualifiedIdTypeJoinableIndex> index, + QualifiedIdTypeJoinableIndex::Create(filesystem_, working_path_)); + + DocJoinInfo default_invalid; + EXPECT_THAT(index->Put(default_invalid, /*ref_document_id=*/0), + StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT)); +} + +TEST_F(QualifiedIdTypeJoinableIndexTest, InvalidGet) { + // Create new qualified id type joinable index + ICING_ASSERT_OK_AND_ASSIGN( + std::unique_ptr<QualifiedIdTypeJoinableIndex> index, + QualifiedIdTypeJoinableIndex::Create(filesystem_, working_path_)); + + DocJoinInfo default_invalid; + EXPECT_THAT(index->Get(default_invalid), + StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT)); +} + +TEST_F(QualifiedIdTypeJoinableIndexTest, PutAndGet) { + DocJoinInfo target_info1(/*document_id=*/1, /*joinable_property_id=*/20); + DocumentId ref_document1 = 0; + + DocJoinInfo target_info2(/*document_id=*/3, /*joinable_property_id=*/13); + DocumentId ref_document2 = 2; + + DocJoinInfo target_info3(/*document_id=*/4, /*joinable_property_id=*/4); + DocumentId ref_document3 = ref_document1; + + { + // Create new qualified id type joinable index + ICING_ASSERT_OK_AND_ASSIGN( + std::unique_ptr<QualifiedIdTypeJoinableIndex> index, + QualifiedIdTypeJoinableIndex::Create(filesystem_, working_path_)); + + EXPECT_THAT(index->Put(target_info1, /*ref_document_id=*/ref_document1), + IsOk()); + EXPECT_THAT(index->Put(target_info2, /*ref_document_id=*/ref_document2), + IsOk()); + EXPECT_THAT(index->Put(target_info3, /*ref_document_id=*/ref_document3), + IsOk()); + EXPECT_THAT(index, Pointee(SizeIs(3))); + + EXPECT_THAT(index->Get(target_info1), IsOkAndHolds(ref_document1)); + 
EXPECT_THAT(index->Get(target_info2), IsOkAndHolds(ref_document2)); + EXPECT_THAT(index->Get(target_info3), IsOkAndHolds(ref_document3)); + + ICING_ASSERT_OK(index->PersistToDisk()); + } + + // Verify we can get all of them after destructing and re-initializing. + ICING_ASSERT_OK_AND_ASSIGN( + std::unique_ptr<QualifiedIdTypeJoinableIndex> index, + QualifiedIdTypeJoinableIndex::Create(filesystem_, working_path_)); + EXPECT_THAT(index, Pointee(SizeIs(3))); + EXPECT_THAT(index->Get(target_info1), IsOkAndHolds(ref_document1)); + EXPECT_THAT(index->Get(target_info2), IsOkAndHolds(ref_document2)); + EXPECT_THAT(index->Get(target_info3), IsOkAndHolds(ref_document3)); +} + +TEST_F(QualifiedIdTypeJoinableIndexTest, + GetShouldReturnNotFoundErrorIfNotExist) { + DocJoinInfo target_info(/*document_id=*/1, /*joinable_property_id=*/20); + DocumentId ref_document = 0; + + // Create new qualified id type joinable index + ICING_ASSERT_OK_AND_ASSIGN( + std::unique_ptr<QualifiedIdTypeJoinableIndex> index, + QualifiedIdTypeJoinableIndex::Create(filesystem_, working_path_)); + + // Verify entry is not found in the beginning. + EXPECT_THAT(index->Get(target_info), + StatusIs(libtextclassifier3::StatusCode::NOT_FOUND)); + + ICING_ASSERT_OK(index->Put(target_info, /*ref_document_id=*/ref_document)); + ASSERT_THAT(index->Get(target_info), IsOkAndHolds(ref_document)); + + // Get another non-existing entry. This should get NOT_FOUND_ERROR. 
+ DocJoinInfo another_target_info(/*document_id=*/2, + /*joinable_property_id=*/20); + EXPECT_THAT(index->Get(another_target_info), + StatusIs(libtextclassifier3::StatusCode::NOT_FOUND)); +} + +TEST_F(QualifiedIdTypeJoinableIndexTest, SetLastAddedDocumentId) { + ICING_ASSERT_OK_AND_ASSIGN( + std::unique_ptr<QualifiedIdTypeJoinableIndex> index, + QualifiedIdTypeJoinableIndex::Create(filesystem_, working_path_)); + + EXPECT_THAT(index->last_added_document_id(), Eq(kInvalidDocumentId)); + + constexpr DocumentId kDocumentId = 100; + index->set_last_added_document_id(kDocumentId); + EXPECT_THAT(index->last_added_document_id(), Eq(kDocumentId)); + + constexpr DocumentId kNextDocumentId = 123; + index->set_last_added_document_id(kNextDocumentId); + EXPECT_THAT(index->last_added_document_id(), Eq(kNextDocumentId)); +} + +TEST_F( + QualifiedIdTypeJoinableIndexTest, + SetLastAddedDocumentIdShouldIgnoreNewDocumentIdNotGreaterThanTheCurrent) { + ICING_ASSERT_OK_AND_ASSIGN( + std::unique_ptr<QualifiedIdTypeJoinableIndex> index, + QualifiedIdTypeJoinableIndex::Create(filesystem_, working_path_)); + + constexpr DocumentId kDocumentId = 123; + index->set_last_added_document_id(kDocumentId); + ASSERT_THAT(index->last_added_document_id(), Eq(kDocumentId)); + + constexpr DocumentId kNextDocumentId = 100; + ASSERT_THAT(kNextDocumentId, Lt(kDocumentId)); + index->set_last_added_document_id(kNextDocumentId); + // last_added_document_id() should remain unchanged. 
+ EXPECT_THAT(index->last_added_document_id(), Eq(kDocumentId)); +} + +TEST_F(QualifiedIdTypeJoinableIndexTest, Optimize) { + ICING_ASSERT_OK_AND_ASSIGN( + std::unique_ptr<QualifiedIdTypeJoinableIndex> index, + QualifiedIdTypeJoinableIndex::Create(filesystem_, working_path_)); + + ICING_ASSERT_OK( + index->Put(DocJoinInfo(/*document_id=*/3, /*joinable_property_id=*/10), + /*ref_document_id=*/0)); + ICING_ASSERT_OK( + index->Put(DocJoinInfo(/*document_id=*/5, /*joinable_property_id=*/3), + /*ref_document_id=*/0)); + ICING_ASSERT_OK( + index->Put(DocJoinInfo(/*document_id=*/8, /*joinable_property_id=*/9), + /*ref_document_id=*/2)); + ICING_ASSERT_OK( + index->Put(DocJoinInfo(/*document_id=*/13, /*joinable_property_id=*/4), + /*ref_document_id=*/12)); + ICING_ASSERT_OK( + index->Put(DocJoinInfo(/*document_id=*/21, /*joinable_property_id=*/12), + /*ref_document_id=*/12)); + index->set_last_added_document_id(21); + + ASSERT_THAT(index, Pointee(SizeIs(5))); + + // Used doc id: 0, 2, 3, 5, 8, 12, 13, 21. + // Delete doc id = 2, 5, compress and keep the rest. + std::vector<DocumentId> document_id_old_to_new(22, kInvalidDocumentId); + document_id_old_to_new[0] = 0; + document_id_old_to_new[3] = 1; + document_id_old_to_new[8] = 2; + document_id_old_to_new[12] = 3; + document_id_old_to_new[13] = 4; + document_id_old_to_new[21] = 5; + + DocumentId new_last_added_document_id = 5; + EXPECT_THAT( + index->Optimize(document_id_old_to_new, new_last_added_document_id), + IsOk()); + EXPECT_THAT(index, Pointee(SizeIs(3))); + EXPECT_THAT(index->last_added_document_id(), Eq(new_last_added_document_id)); + + // Verify Put and Get API still work normally after Optimize(). + // (old_doc_id=3, joinable_property_id=10) had old referenced doc_id = 0, + // which is now (doc_id=1, joinable_property_id=10) and referenced doc_id = 0. 
+ EXPECT_THAT( + index->Get(DocJoinInfo(/*document_id=*/1, /*joinable_property_id=*/10)), + IsOkAndHolds(0)); + + // (old_doc_id=5, joinable_property_id=3) had old referenced doc_id = 0, + // which is now not found since we've deleted old_doc_id = 5. It is not + // testable via Get() because there is no valid doc_id mapping for old_doc_id + // = 5 and we cannot generate a valid DocJoinInfo for it. + + // (old_doc_id=8, joinable_property_id=9) had old referenced doc_id = 2, + // which is now (doc_id=2, joinable_property_id=9), but since we've deleted + // old referenced doc_id = 2, this data should not be found after + // optimization. + EXPECT_THAT( + index->Get(DocJoinInfo(/*document_id=*/2, /*joinable_property_id=*/9)), + StatusIs(libtextclassifier3::StatusCode::NOT_FOUND)); + + // (old_doc_id=13, joinable_property_id=4) had old referenced doc_id = 12, + // which is now (doc_id=4, joinable_property_id=4) and referenced doc_id = 3. + EXPECT_THAT( + index->Get(DocJoinInfo(/*document_id=*/4, /*joinable_property_id=*/4)), + IsOkAndHolds(3)); + + // (old_doc_id=21, joinable_property_id=12) had old referenced doc_id = 12, + // which is now (doc_id=5, joinable_property_id=12) and referenced doc_id = 3. + EXPECT_THAT( + index->Get(DocJoinInfo(/*document_id=*/5, /*joinable_property_id=*/12)), + IsOkAndHolds(3)); + + // Joinable index should be able to work normally after Optimize(). 
+ ICING_ASSERT_OK( + index->Put(DocJoinInfo(/*document_id=*/99, /*joinable_property_id=*/2), + /*ref_document_id=*/90)); + index->set_last_added_document_id(99); + + EXPECT_THAT(index, Pointee(SizeIs(4))); + EXPECT_THAT(index->last_added_document_id(), Eq(99)); + EXPECT_THAT(index->Get(DocJoinInfo(/*document_id=*/99, + /*joinable_property_id=*/2)), + IsOkAndHolds(90)); +} + +TEST_F(QualifiedIdTypeJoinableIndexTest, OptimizeOutOfRangeDocumentId) { + ICING_ASSERT_OK_AND_ASSIGN( + std::unique_ptr<QualifiedIdTypeJoinableIndex> index, + QualifiedIdTypeJoinableIndex::Create(filesystem_, working_path_)); + + ICING_ASSERT_OK( + index->Put(DocJoinInfo(/*document_id=*/99, /*joinable_property_id=*/10), + /*ref_document_id=*/91)); + index->set_last_added_document_id(99); + + // Create document_id_old_to_new with size = 1. Optimize should handle out of + // range DocumentId properly. + std::vector<DocumentId> document_id_old_to_new = {kInvalidDocumentId}; + + // There shouldn't be any error due to vector index. + EXPECT_THAT( + index->Optimize(document_id_old_to_new, + /*new_last_added_document_id=*/kInvalidDocumentId), + IsOk()); + EXPECT_THAT(index->last_added_document_id(), Eq(kInvalidDocumentId)); + + // Verify all data are discarded after Optimize(). 
+ EXPECT_THAT(index, Pointee(IsEmpty())); +} + +TEST_F(QualifiedIdTypeJoinableIndexTest, OptimizeDeleteAll) { + ICING_ASSERT_OK_AND_ASSIGN( + std::unique_ptr<QualifiedIdTypeJoinableIndex> index, + QualifiedIdTypeJoinableIndex::Create(filesystem_, working_path_)); + + ICING_ASSERT_OK( + index->Put(DocJoinInfo(/*document_id=*/3, /*joinable_property_id=*/10), + /*ref_document_id=*/0)); + ICING_ASSERT_OK( + index->Put(DocJoinInfo(/*document_id=*/5, /*joinable_property_id=*/3), + /*ref_document_id=*/0)); + ICING_ASSERT_OK( + index->Put(DocJoinInfo(/*document_id=*/8, /*joinable_property_id=*/9), + /*ref_document_id=*/2)); + ICING_ASSERT_OK( + index->Put(DocJoinInfo(/*document_id=*/13, /*joinable_property_id=*/4), + /*ref_document_id=*/12)); + ICING_ASSERT_OK( + index->Put(DocJoinInfo(/*document_id=*/21, /*joinable_property_id=*/12), + /*ref_document_id=*/12)); + index->set_last_added_document_id(21); + + // Delete all documents. + std::vector<DocumentId> document_id_old_to_new(22, kInvalidDocumentId); + + EXPECT_THAT( + index->Optimize(document_id_old_to_new, + /*new_last_added_document_id=*/kInvalidDocumentId), + IsOk()); + EXPECT_THAT(index->last_added_document_id(), Eq(kInvalidDocumentId)); + + // Verify all data are discarded after Optimize(). 
+ EXPECT_THAT(index, Pointee(IsEmpty())); +} + +TEST_F(QualifiedIdTypeJoinableIndexTest, Clear) { + DocJoinInfo target_info1(/*document_id=*/1, /*joinable_property_id=*/20); + DocJoinInfo target_info2(/*document_id=*/3, /*joinable_property_id=*/5); + DocJoinInfo target_info3(/*document_id=*/6, /*joinable_property_id=*/13); + + // Create new qualified id type joinable index + ICING_ASSERT_OK_AND_ASSIGN( + std::unique_ptr<QualifiedIdTypeJoinableIndex> index, + QualifiedIdTypeJoinableIndex::Create(filesystem_, working_path_)); + ICING_ASSERT_OK(index->Put(target_info1, /*ref_document_id=*/0)); + ICING_ASSERT_OK(index->Put(target_info2, /*ref_document_id=*/2)); + ICING_ASSERT_OK(index->Put(target_info3, /*ref_document_id=*/4)); + ASSERT_THAT(index, Pointee(SizeIs(3))); + index->set_last_added_document_id(6); + ASSERT_THAT(index->last_added_document_id(), Eq(6)); + + // After resetting, last_added_document_id should be set to + // kInvalidDocumentId, and the previous added data should be deleted. + EXPECT_THAT(index->Clear(), IsOk()); + EXPECT_THAT(index, Pointee(IsEmpty())); + EXPECT_THAT(index->last_added_document_id(), Eq(kInvalidDocumentId)); + EXPECT_THAT(index->Get(target_info1), + StatusIs(libtextclassifier3::StatusCode::NOT_FOUND)); + EXPECT_THAT(index->Get(target_info2), + StatusIs(libtextclassifier3::StatusCode::NOT_FOUND)); + EXPECT_THAT(index->Get(target_info3), + StatusIs(libtextclassifier3::StatusCode::NOT_FOUND)); + + // Joinable index should be able to work normally after Clear(). + DocJoinInfo target_info4(/*document_id=*/2, /*joinable_property_id=*/19); + ICING_ASSERT_OK(index->Put(target_info4, /*ref_document_id=*/0)); + index->set_last_added_document_id(2); + + EXPECT_THAT(index->last_added_document_id(), Eq(2)); + EXPECT_THAT(index->Get(target_info4), IsOkAndHolds(0)); + + ICING_ASSERT_OK(index->PersistToDisk()); + index.reset(); + + // Verify index after reconstructing. 
+ ICING_ASSERT_OK_AND_ASSIGN( + index, QualifiedIdTypeJoinableIndex::Create(filesystem_, working_path_)); + EXPECT_THAT(index->last_added_document_id(), Eq(2)); + EXPECT_THAT(index->Get(target_info1), + StatusIs(libtextclassifier3::StatusCode::NOT_FOUND)); + EXPECT_THAT(index->Get(target_info2), + StatusIs(libtextclassifier3::StatusCode::NOT_FOUND)); + EXPECT_THAT(index->Get(target_info3), + StatusIs(libtextclassifier3::StatusCode::NOT_FOUND)); + EXPECT_THAT(index->Get(target_info4), IsOkAndHolds(0)); +} + +} // namespace + +} // namespace lib +} // namespace icing diff --git a/icing/query/advanced_query_parser/lexer_test.cc b/icing/query/advanced_query_parser/lexer_test.cc index c6d215c..ec0e663 100644 --- a/icing/query/advanced_query_parser/lexer_test.cc +++ b/icing/query/advanced_query_parser/lexer_test.cc @@ -661,6 +661,19 @@ TEST(LexerTest, WhiteSpacesDoNotAffectColonTokenization) { EqualsLexerToken("h", Lexer::TokenType::TEXT))); } +// For the "bar:baz" part to be treated as a TEXT token in a query like +// foo:bar:baz, an explicit escape is required, so use foo:bar\:baz instead. 
+TEST(LexerTest, ColonInTextRequiresExplicitEscaping) { + std::unique_ptr<Lexer> lexer = + std::make_unique<Lexer>("foo:bar\\:baz", Lexer::Language::QUERY); + ICING_ASSERT_OK_AND_ASSIGN(std::vector<Lexer::LexerToken> tokens, + lexer->ExtractTokens()); + EXPECT_THAT(tokens, + ElementsAre(EqualsLexerToken("foo", Lexer::TokenType::TEXT), + EqualsLexerToken(":", Lexer::TokenType::COMPARATOR), + EqualsLexerToken("bar:baz", Lexer::TokenType::TEXT))); +} + TEST(LexerTest, QueryShouldRejectTokensBeyondLimit) { std::string query; for (int i = 0; i < Lexer::kMaxNumTokens + 1; ++i) { diff --git a/icing/query/advanced_query_parser/query-visitor.cc b/icing/query/advanced_query_parser/query-visitor.cc index 9df1264..a1a9c38 100644 --- a/icing/query/advanced_query_parser/query-visitor.cc +++ b/icing/query/advanced_query_parser/query-visitor.cc @@ -344,8 +344,10 @@ QueryVisitor::PopPendingIterator() { return CreateTermIterator(std::move(string_value)); } else { ICING_ASSIGN_OR_RETURN(QueryTerm text_value, PopPendingTextValue()); - ICING_ASSIGN_OR_RETURN(std::unique_ptr<Tokenizer::Iterator> token_itr, - tokenizer_.Tokenize(text_value.term)); + ICING_ASSIGN_OR_RETURN( + std::unique_ptr<Tokenizer::Iterator> token_itr, + tokenizer_.Tokenize(text_value.term, + LanguageSegmenter::AccessType::kForwardIterator)); std::string normalized_term; std::vector<std::unique_ptr<DocHitInfoIterator>> iterators; // The tokenizer will produce 1+ tokens out of the text. 
The prefix operator diff --git a/icing/result/snippet-retriever.cc b/icing/result/snippet-retriever.cc index 1176eaf..a94775d 100644 --- a/icing/result/snippet-retriever.cc +++ b/icing/result/snippet-retriever.cc @@ -488,7 +488,10 @@ void GetEntriesFromProperty(const PropertyProto* current_property, current_property->string_values_size(), /*index=*/i, property_path)); std::string_view value = current_property->string_values(i); std::unique_ptr<Tokenizer::Iterator> iterator = - tokenizer->Tokenize(value).ValueOrDie(); + tokenizer + ->Tokenize(value, + LanguageSegmenter::AccessType::kBidirectionalIterator) + .ValueOrDie(); // All iterators are moved through positions sequentially. Constructing them // each time resets them to the beginning of the string. This means that, // for t tokens and in a string of n chars, each MoveToUtf8 call from the diff --git a/icing/schema/joinable-property-manager.cc b/icing/schema/joinable-property-manager.cc index 5f8f7b8..3977b6b 100644 --- a/icing/schema/joinable-property-manager.cc +++ b/icing/schema/joinable-property-manager.cc @@ -16,6 +16,8 @@ #include <memory> #include <string> +#include <string_view> +#include <utility> #include <vector> #include "icing/text_classifier/lib3/utils/base/status.h" @@ -36,15 +38,16 @@ namespace { // Helper function to append a new joinable property metadata libtextclassifier3::Status AppendNewJoinablePropertyMetadata( - std::vector<JoinablePropertyMetadata>* metadata_list, + JoinablePropertyManager::JoinablePropertyMetadataListWrapper* + metadata_list_wrapper, std::string&& concatenated_path, PropertyConfigProto::DataType::Code data_type, JoinableConfig::ValueType::Code value_type) { // Validates next joinable property id, makes sure that joinable property id // is the same as the list index so that we could find any joinable property // metadata by id in O(1) later. 
- JoinablePropertyId new_id = - static_cast<JoinablePropertyId>(metadata_list->size()); + JoinablePropertyId new_id = static_cast<JoinablePropertyId>( + metadata_list_wrapper->metadata_list.size()); if (!IsJoinablePropertyIdValid(new_id)) { // Max number of joinable properties reached return absl_ports::OutOfRangeError( @@ -54,8 +57,10 @@ libtextclassifier3::Status AppendNewJoinablePropertyMetadata( } // Creates joinable property metadata - metadata_list->push_back(JoinablePropertyMetadata( + metadata_list_wrapper->metadata_list.push_back(JoinablePropertyMetadata( new_id, data_type, value_type, std::move(concatenated_path))); + metadata_list_wrapper->property_path_to_id_map.insert( + {metadata_list_wrapper->metadata_list.back().path, new_id}); return libtextclassifier3::Status::OK; } @@ -84,7 +89,8 @@ JoinablePropertyManager::Builder::ProcessSchemaTypePropertyConfig( SchemaTypeId schema_type_id, const PropertyConfigProto& property_config, std::string&& property_path) { if (schema_type_id < 0 || - schema_type_id >= joinable_property_metadata_cache_.size()) { + schema_type_id >= + static_cast<int64_t>(joinable_property_metadata_cache_.size())) { return absl_ports::InvalidArgumentError("Invalid schema type id"); } @@ -139,10 +145,33 @@ JoinablePropertyManager::ExtractJoinableProperties( libtextclassifier3::StatusOr<const JoinablePropertyMetadata*> JoinablePropertyManager::GetJoinablePropertyMetadata( + SchemaTypeId schema_type_id, const std::string& property_path) const { + if (schema_type_id < 0 || + schema_type_id >= + static_cast<int64_t>(joinable_property_metadata_cache_.size())) { + return absl_ports::InvalidArgumentError("Invalid schema type id"); + } + + const auto iter = joinable_property_metadata_cache_[schema_type_id] + .property_path_to_id_map.find(property_path); + if (iter == joinable_property_metadata_cache_[schema_type_id] + .property_path_to_id_map.end()) { + return absl_ports::NotFoundError( + "Property path is not joinable or doesn't exist"); + } + + 
JoinablePropertyId joinable_property_id = iter->second; + return &joinable_property_metadata_cache_[schema_type_id] + .metadata_list[joinable_property_id]; +} + +libtextclassifier3::StatusOr<const JoinablePropertyMetadata*> +JoinablePropertyManager::GetJoinablePropertyMetadata( SchemaTypeId schema_type_id, JoinablePropertyId joinable_property_id) const { if (schema_type_id < 0 || - schema_type_id >= joinable_property_metadata_cache_.size()) { + schema_type_id >= + static_cast<int64_t>(joinable_property_metadata_cache_.size())) { return absl_ports::InvalidArgumentError("Invalid schema type id"); } if (!IsJoinablePropertyIdValid(joinable_property_id)) { @@ -150,9 +179,9 @@ JoinablePropertyManager::GetJoinablePropertyMetadata( "Invalid joinable property id %d", joinable_property_id)); } - const std::vector<JoinablePropertyMetadata>& joinable_property_metadatas = - joinable_property_metadata_cache_[schema_type_id]; - if (joinable_property_id >= joinable_property_metadatas.size()) { + const std::vector<JoinablePropertyMetadata>& metadata_list = + joinable_property_metadata_cache_[schema_type_id].metadata_list; + if (joinable_property_id >= metadata_list.size()) { return absl_ports::InvalidArgumentError(IcingStringUtil::StringPrintf( "Joinable property with id %d doesn't exist in type config id %d", joinable_property_id, schema_type_id)); @@ -160,7 +189,7 @@ JoinablePropertyManager::GetJoinablePropertyMetadata( // The index of metadata list is the same as the joinable property id, so we // can use joinable property id as the index. 
- return &joinable_property_metadatas[joinable_property_id]; + return &metadata_list[joinable_property_id]; } libtextclassifier3::StatusOr<const std::vector<JoinablePropertyMetadata>*> @@ -168,7 +197,7 @@ JoinablePropertyManager::GetMetadataList( const std::string& type_config_name) const { ICING_ASSIGN_OR_RETURN(SchemaTypeId schema_type_id, schema_type_mapper_.Get(type_config_name)); - return &joinable_property_metadata_cache_.at(schema_type_id); + return &joinable_property_metadata_cache_.at(schema_type_id).metadata_list; } } // namespace lib diff --git a/icing/schema/joinable-property-manager.h b/icing/schema/joinable-property-manager.h index a175ae4..c7038ce 100644 --- a/icing/schema/joinable-property-manager.h +++ b/icing/schema/joinable-property-manager.h @@ -17,6 +17,7 @@ #include <memory> #include <string> +#include <unordered_map> #include <vector> #include "icing/text_classifier/lib3/utils/base/status.h" @@ -34,6 +35,13 @@ namespace lib { // from documents. class JoinablePropertyManager { public: + // A wrapper class that contains a vector of metadatas and property path to + // JoinablePropertyId reverse lookup map. + struct JoinablePropertyMetadataListWrapper { + std::vector<JoinablePropertyMetadata> metadata_list; + std::unordered_map<std::string, JoinablePropertyId> property_path_to_id_map; + }; + // Builder class to create a JoinablePropertyManager which does not take // ownership of any input components, and all pointers must refer to valid // objects that outlive the created JoinablePropertyManager instance. @@ -66,7 +74,7 @@ class JoinablePropertyManager { private: const KeyMapper<SchemaTypeId>& schema_type_mapper_; // Does not own. 
- std::vector<std::vector<JoinablePropertyMetadata>> + std::vector<JoinablePropertyMetadataListWrapper> joinable_property_metadata_cache_; }; @@ -87,11 +95,23 @@ class JoinablePropertyManager { libtextclassifier3::StatusOr<JoinablePropertyGroup> ExtractJoinableProperties( const DocumentProto& document) const; + // Returns the JoinablePropertyMetadata associated with property_path that's + // in the SchemaTypeId. + // + // Returns: + // - Valid pointer to JoinablePropertyMetadata on success + // - INVALID_ARGUMENT_ERROR if schema type id is invalid + // - NOT_FOUND_ERROR if property_path doesn't exist (or is not joinable) in + // the joinable metadata list of the schema + libtextclassifier3::StatusOr<const JoinablePropertyMetadata*> + GetJoinablePropertyMetadata(SchemaTypeId schema_type_id, + const std::string& property_path) const; + // Returns the JoinablePropertyMetadata associated with the JoinablePropertyId // that's in the SchemaTypeId. // // Returns: - // - Pointer to JoinablePropertyMetadata on success + // - Valid pointer to JoinablePropertyMetadata on success // - INVALID_ARGUMENT_ERROR if schema type id or JoinablePropertyId is // invalid libtextclassifier3::StatusOr<const JoinablePropertyMetadata*> @@ -108,7 +128,7 @@ class JoinablePropertyManager { private: explicit JoinablePropertyManager( const KeyMapper<SchemaTypeId>& schema_type_mapper, - std::vector<std::vector<JoinablePropertyMetadata>>&& + std::vector<JoinablePropertyMetadataListWrapper>&& joinable_property_metadata_cache) : schema_type_mapper_(schema_type_mapper), joinable_property_metadata_cache_(joinable_property_metadata_cache) {} @@ -117,16 +137,20 @@ class JoinablePropertyManager { const KeyMapper<SchemaTypeId>& schema_type_mapper_; // Does not own // The index of joinable_property_metadata_cache_ corresponds to a schema - // type's SchemaTypeId. At that SchemaTypeId index, we store an inner vector. - // The inner vector's index corresponds to a joinable property's - // JoinablePropertyId. 
At the JoinablePropertyId index, we store the - // JoinablePropertyMetadata of that joinable property. + // type's SchemaTypeId. At that SchemaTypeId index, we store a + // JoinablePropertyMetadataListWrapper instance. The metadata list's index + // corresponds to a joinable property's JoinablePropertyId. At the + // JoinablePropertyId index, we store the JoinablePropertyMetadata of that + // joinable property. // // For example, suppose "email" has a SchemaTypeId of 0 and it has a joinable // property called "senderQualifiedId" with a JoinablePropertyId of 1. Then // the "senderQualifiedId" property's JoinablePropertyMetadata will be at - // joinable_property_metadata_cache_[0][1]. - const std::vector<std::vector<JoinablePropertyMetadata>> + // joinable_property_metadata_cache_[0].metadata_list[1], and + // joinable_property_metadata_cache_[0] + // .property_path_to_id_map["senderQualifiedId"] + // will be 1. + const std::vector<JoinablePropertyMetadataListWrapper> joinable_property_metadata_cache_; }; diff --git a/icing/schema/joinable-property-manager_test.cc b/icing/schema/joinable-property-manager_test.cc index 495c254..d9a3841 100644 --- a/icing/schema/joinable-property-manager_test.cc +++ b/icing/schema/joinable-property-manager_test.cc @@ -410,6 +410,94 @@ TEST_F(JoinablePropertyManagerTest, StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT)); } +TEST_F(JoinablePropertyManagerTest, GetJoinablePropertyMetadataByPath) { + // Use SchemaTypeManager factory method to instantiate + // JoinablePropertyManager. 
+ ICING_ASSERT_OK_AND_ASSIGN( + std::unique_ptr<SchemaTypeManager> schema_type_manager, + SchemaTypeManager::Create(type_config_map_, schema_type_mapper_.get())); + + // Email (joinable property id -> joinable property path): + // 0 -> receiverQualifiedId + // 1 -> senderQualifiedId + EXPECT_THAT( + schema_type_manager->joinable_property_manager() + .GetJoinablePropertyMetadata(/*schema_type_id=*/0, + "receiverQualifiedId"), + IsOkAndHolds(Pointee(EqualsJoinablePropertyMetadata( + /*expected_id=*/0, /*expected_property_path=*/"receiverQualifiedId", + CreateReceiverQualifiedIdPropertyConfig())))); + EXPECT_THAT( + schema_type_manager->joinable_property_manager() + .GetJoinablePropertyMetadata(/*schema_type_id=*/0, + "senderQualifiedId"), + IsOkAndHolds(Pointee(EqualsJoinablePropertyMetadata( + /*expected_id=*/1, /*expected_property_path=*/"senderQualifiedId", + CreateSenderQualifiedIdPropertyConfig())))); + + // Conversation (joinable property id -> joinable property path): + // 0 -> emails.receiverQualifiedId + // 1 -> emails.senderQualifiedId + // 2 -> groupQualifiedId + EXPECT_THAT(schema_type_manager->joinable_property_manager() + .GetJoinablePropertyMetadata(/*schema_type_id=*/1, + "emails.receiverQualifiedId"), + IsOkAndHolds(Pointee(EqualsJoinablePropertyMetadata( + /*expected_id=*/0, + /*expected_property_path=*/"emails.receiverQualifiedId", + CreateReceiverQualifiedIdPropertyConfig())))); + EXPECT_THAT(schema_type_manager->joinable_property_manager() + .GetJoinablePropertyMetadata(/*schema_type_id=*/1, + "emails.senderQualifiedId"), + IsOkAndHolds(Pointee(EqualsJoinablePropertyMetadata( + /*expected_id=*/1, + /*expected_property_path=*/"emails.senderQualifiedId", + CreateSenderQualifiedIdPropertyConfig())))); + EXPECT_THAT( + schema_type_manager->joinable_property_manager() + .GetJoinablePropertyMetadata(/*schema_type_id=*/1, + "groupQualifiedId"), + IsOkAndHolds(Pointee(EqualsJoinablePropertyMetadata( + /*expected_id=*/2, 
/*expected_property_path=*/"groupQualifiedId", + CreateGroupQualifiedIdPropertyConfig())))); +} + +TEST_F(JoinablePropertyManagerTest, + GetJoinablePropertyMetadataByPathInvalidSchemaTypeId) { + // Use SchemaTypeManager factory method to instantiate + // JoinablePropertyManager. + ICING_ASSERT_OK_AND_ASSIGN( + std::unique_ptr<SchemaTypeManager> schema_type_manager, + SchemaTypeManager::Create(type_config_map_, schema_type_mapper_.get())); + ASSERT_THAT(type_config_map_, SizeIs(2)); + + EXPECT_THAT(schema_type_manager->joinable_property_manager() + .GetJoinablePropertyMetadata(/*schema_type_id=*/-1, + "receiverQualifiedId"), + StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT)); + EXPECT_THAT(schema_type_manager->joinable_property_manager() + .GetJoinablePropertyMetadata(/*schema_type_id=*/2, + "receiverQualifiedId"), + StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT)); +} + +TEST_F(JoinablePropertyManagerTest, GetJoinablePropertyMetadataByPathNotExist) { + // Use SchemaTypeManager factory method to instantiate + // JoinablePropertyManager. + ICING_ASSERT_OK_AND_ASSIGN( + std::unique_ptr<SchemaTypeManager> schema_type_manager, + SchemaTypeManager::Create(type_config_map_, schema_type_mapper_.get())); + + EXPECT_THAT( + schema_type_manager->joinable_property_manager() + .GetJoinablePropertyMetadata(/*schema_type_id=*/0, "nonExistingPath"), + StatusIs(libtextclassifier3::StatusCode::NOT_FOUND)); + EXPECT_THAT(schema_type_manager->joinable_property_manager() + .GetJoinablePropertyMetadata(/*schema_type_id=*/1, + "emails.nonExistingPath"), + StatusIs(libtextclassifier3::StatusCode::NOT_FOUND)); +} + // Note: valid GetMetadataList has been tested in // JoinablePropertyManagerBuildTest. 
TEST_F(JoinablePropertyManagerTest, GetMetadataListInvalidSchemaTypeName) { diff --git a/icing/schema/schema-store.cc b/icing/schema/schema-store.cc index 79ec49a..0e0c917 100644 --- a/icing/schema/schema-store.cc +++ b/icing/schema/schema-store.cc @@ -35,6 +35,7 @@ #include "icing/proto/logging.pb.h" #include "icing/proto/schema.pb.h" #include "icing/proto/storage.pb.h" +#include "icing/schema/joinable-property.h" #include "icing/schema/schema-type-manager.h" #include "icing/schema/schema-util.h" #include "icing/schema/section.h" @@ -533,6 +534,21 @@ libtextclassifier3::StatusOr<SectionGroup> SchemaStore::ExtractSections( return schema_type_manager_->section_manager().ExtractSections(document); } +libtextclassifier3::StatusOr<const JoinablePropertyMetadata*> +SchemaStore::GetJoinablePropertyMetadata( + SchemaTypeId schema_type_id, const std::string& property_path) const { + ICING_RETURN_IF_ERROR(CheckSchemaSet()); + return schema_type_manager_->joinable_property_manager() + .GetJoinablePropertyMetadata(schema_type_id, property_path); +} + +libtextclassifier3::StatusOr<JoinablePropertyGroup> +SchemaStore::ExtractJoinableProperties(const DocumentProto& document) const { + ICING_RETURN_IF_ERROR(CheckSchemaSet()); + return schema_type_manager_->joinable_property_manager() + .ExtractJoinableProperties(document); +} + libtextclassifier3::Status SchemaStore::PersistToDisk() { if (!has_schema_successfully_set_) { return libtextclassifier3::Status::OK; diff --git a/icing/schema/schema-store.h b/icing/schema/schema-store.h index 8b85fc8..601d22a 100644 --- a/icing/schema/schema-store.h +++ b/icing/schema/schema-store.h @@ -31,6 +31,7 @@ #include "icing/proto/logging.pb.h" #include "icing/proto/schema.pb.h" #include "icing/proto/storage.pb.h" +#include "icing/schema/joinable-property.h" #include "icing/schema/schema-type-manager.h" #include "icing/schema/schema-util.h" #include "icing/schema/section.h" @@ -196,16 +197,16 @@ class SchemaStore { // SchemaTypeId. 
// // Returns: - // pointer to SectionMetadata on success + // Valid pointer to SectionMetadata on success // FAILED_PRECONDITION if schema hasn't been set yet - // INVALID_ARGUMENT if schema type id or section is invalid + // INVALID_ARGUMENT if schema type id or section id is invalid libtextclassifier3::StatusOr<const SectionMetadata*> GetSectionMetadata( SchemaTypeId schema_type_id, SectionId section_id) const; // Extracts all sections of different types from the given document and group // them by type. // - Each Section vector is sorted by section Id in ascending order. The - // sorted section Ids may not be continuous, since not all section Ids are + // sorted section ids may not be continuous, since not all sections are // present in the document. // - Sections with empty content won't be returned. // - For example, we may extract: @@ -219,6 +220,34 @@ class SchemaStore { libtextclassifier3::StatusOr<SectionGroup> ExtractSections( const DocumentProto& document) const; + // Returns the JoinablePropertyMetadata associated with property_path that's + // in the SchemaTypeId. + // + // Returns: + // Valid pointer to JoinablePropertyMetadata on success + // FAILED_PRECONDITION if schema hasn't been set yet + // INVALID_ARGUMENT if schema type id is invalid + // NOT_FOUND if property_path doesn't exist (or is not joinable) in the + // joinable metadata list of the schema + libtextclassifier3::StatusOr<const JoinablePropertyMetadata*> + GetJoinablePropertyMetadata(SchemaTypeId schema_type_id, + const std::string& property_path) const; + + // Extracts all joinable property contents of different types from the given + // document and group them by joinable value type. + // - Joinable properties are sorted by joinable property id in ascending + // order. The sorted joinable property ids may not be continuous, since not + // all joinable properties are present in the document. + // - Joinable property ids start from 0. 
+ // - Joinable properties with empty content won't be returned. + // + // Returns: + // A JoinablePropertyGroup instance on success + // FAILED_PRECONDITION if schema hasn't been set yet + // NOT_FOUND if the type config name of document not found + libtextclassifier3::StatusOr<JoinablePropertyGroup> ExtractJoinableProperties( + const DocumentProto& document) const; + // Syncs all the data changes to disk. // // Returns: diff --git a/icing/schema/schema-store_test.cc b/icing/schema/schema-store_test.cc index 749fcaa..4e2724f 100644 --- a/icing/schema/schema-store_test.cc +++ b/icing/schema/schema-store_test.cc @@ -320,6 +320,9 @@ TEST_F(SchemaStoreTest, CreateNoPreviousSchemaOk) { StatusIs(libtextclassifier3::StatusCode::FAILED_PRECONDITION)); EXPECT_THAT(store->GetSectionMetadata(/*schema_type_id=*/0, /*section_id=*/0), StatusIs(libtextclassifier3::StatusCode::FAILED_PRECONDITION)); + EXPECT_THAT(store->GetJoinablePropertyMetadata(/*schema_type_id=*/0, + /*property_path=*/"A"), + StatusIs(libtextclassifier3::StatusCode::FAILED_PRECONDITION)); // The apis to extract content from a document should fail gracefully. DocumentProto doc; @@ -329,6 +332,8 @@ TEST_F(SchemaStoreTest, CreateNoPreviousSchemaOk) { EXPECT_THAT(store->ExtractSections(doc), StatusIs(libtextclassifier3::StatusCode::FAILED_PRECONDITION)); + EXPECT_THAT(store->ExtractJoinableProperties(doc), + StatusIs(libtextclassifier3::StatusCode::FAILED_PRECONDITION)); // The apis to persist and checksum data should succeed. 
EXPECT_THAT(store->ComputeChecksum(), IsOkAndHolds(Crc32())); diff --git a/icing/tokenization/icu/icu-language-segmenter.cc b/icing/tokenization/icu/icu-language-segmenter.cc index dc7b0a4..fd790cf 100644 --- a/icing/tokenization/icu/icu-language-segmenter.cc +++ b/icing/tokenization/icu/icu-language-segmenter.cc @@ -325,14 +325,15 @@ IcuLanguageSegmenter::IcuLanguageSegmenter(std::string locale) : locale_(std::move(locale)) {} libtextclassifier3::StatusOr<std::unique_ptr<LanguageSegmenter::Iterator>> -IcuLanguageSegmenter::Segment(const std::string_view text) const { +IcuLanguageSegmenter::Segment(const std::string_view text, + LanguageSegmenter::AccessType) const { return IcuLanguageSegmenterIterator::Create(text, locale_); } libtextclassifier3::StatusOr<std::vector<std::string_view>> IcuLanguageSegmenter::GetAllTerms(const std::string_view text) const { ICING_ASSIGN_OR_RETURN(std::unique_ptr<LanguageSegmenter::Iterator> iterator, - Segment(text)); + IcuLanguageSegmenterIterator::Create(text, locale_)); std::vector<std::string_view> terms; while (iterator->Advance()) { terms.push_back(iterator->GetTerm()); diff --git a/icing/tokenization/icu/icu-language-segmenter.h b/icing/tokenization/icu/icu-language-segmenter.h index 4115461..f9cfbcb 100644 --- a/icing/tokenization/icu/icu-language-segmenter.h +++ b/icing/tokenization/icu/icu-language-segmenter.h @@ -55,7 +55,7 @@ class IcuLanguageSegmenter : public LanguageSegmenter { // An iterator of terms on success // INTERNAL_ERROR if any error occurs libtextclassifier3::StatusOr<std::unique_ptr<LanguageSegmenter::Iterator>> - Segment(std::string_view text) const override; + Segment(std::string_view text, LanguageSegmenter::AccessType) const override; // The segmentation depends on the language detected in the input text. 
// diff --git a/icing/tokenization/icu/icu-language-segmenter_test.cc b/icing/tokenization/icu/icu-language-segmenter_test.cc index 6771050..c88b992 100644 --- a/icing/tokenization/icu/icu-language-segmenter_test.cc +++ b/icing/tokenization/icu/icu-language-segmenter_test.cc @@ -419,8 +419,10 @@ TEST_P(IcuLanguageSegmenterAllLocalesTest, ContinuousWhitespaces) { // iterator is done. text_with_spaces = absl_ports::StrCat(std::string(kNumSeparators, ' '), "Hello", " ", "World"); - ICING_ASSERT_OK_AND_ASSIGN(auto itr, - language_segmenter->Segment(text_with_spaces)); + ICING_ASSERT_OK_AND_ASSIGN( + auto itr, + language_segmenter->Segment( + text_with_spaces, LanguageSegmenter::AccessType::kForwardIterator)); std::vector<std::string_view> terms; while (itr->Advance()) { terms.push_back(itr->GetTerm()); @@ -516,8 +518,10 @@ TEST_P(IcuLanguageSegmenterAllLocalesTest, ResetToStartUtf32WordConnector) { auto segmenter, language_segmenter_factory::Create( GetSegmenterOptions(GetLocale(), jni_cache_.get()))); constexpr std::string_view kText = "com.google.android is package"; - ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<LanguageSegmenter::Iterator> itr, - segmenter->Segment(kText)); + ICING_ASSERT_OK_AND_ASSIGN( + std::unique_ptr<LanguageSegmenter::Iterator> itr, + segmenter->Segment(kText, + LanguageSegmenter::AccessType::kForwardIterator)); // String: "com.google.android is package" // ^ ^^ ^^ @@ -533,8 +537,10 @@ TEST_P(IcuLanguageSegmenterAllLocalesTest, NewIteratorResetToStartUtf32) { auto segmenter, language_segmenter_factory::Create( GetSegmenterOptions(GetLocale(), jni_cache_.get()))); constexpr std::string_view kText = "How are you你好吗お元気ですか"; - ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<LanguageSegmenter::Iterator> itr, - segmenter->Segment(kText)); + ICING_ASSERT_OK_AND_ASSIGN( + std::unique_ptr<LanguageSegmenter::Iterator> itr, + segmenter->Segment(kText, + LanguageSegmenter::AccessType::kForwardIterator)); // String: "How are you你好吗お元気ですか" // ^ ^^ ^^ ^ ^ ^ ^ ^ ^ 
@@ -550,8 +556,10 @@ TEST_P(IcuLanguageSegmenterAllLocalesTest, auto segmenter, language_segmenter_factory::Create( GetSegmenterOptions(GetLocale(), jni_cache_.get()))); constexpr std::string_view kText = "How are you你好吗お元気ですか"; - ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<LanguageSegmenter::Iterator> itr, - segmenter->Segment(kText)); + ICING_ASSERT_OK_AND_ASSIGN( + std::unique_ptr<LanguageSegmenter::Iterator> itr, + segmenter->Segment(kText, + LanguageSegmenter::AccessType::kForwardIterator)); // String: "How are you你好吗お元気ですか" // ^ ^^ ^^ ^ ^ ^ ^ ^ ^ @@ -568,8 +576,10 @@ TEST_P(IcuLanguageSegmenterAllLocalesTest, auto segmenter, language_segmenter_factory::Create( GetSegmenterOptions(GetLocale(), jni_cache_.get()))); constexpr std::string_view kText = "How are you你好吗お元気ですか"; - ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<LanguageSegmenter::Iterator> itr, - segmenter->Segment(kText)); + ICING_ASSERT_OK_AND_ASSIGN( + std::unique_ptr<LanguageSegmenter::Iterator> itr, + segmenter->Segment(kText, + LanguageSegmenter::AccessType::kForwardIterator)); // String: "How are you你好吗お元気ですか" // ^ ^^ ^^ ^ ^ ^ ^ ^ ^ @@ -588,8 +598,10 @@ TEST_P(IcuLanguageSegmenterAllLocalesTest, IteratorDoneResetToStartUtf32) { auto segmenter, language_segmenter_factory::Create( GetSegmenterOptions(GetLocale(), jni_cache_.get()))); constexpr std::string_view kText = "How are you你好吗お元気ですか"; - ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<LanguageSegmenter::Iterator> itr, - segmenter->Segment(kText)); + ICING_ASSERT_OK_AND_ASSIGN( + std::unique_ptr<LanguageSegmenter::Iterator> itr, + segmenter->Segment(kText, + LanguageSegmenter::AccessType::kForwardIterator)); // String: "How are you你好吗お元気ですか" // ^ ^^ ^^ ^ ^ ^ ^ ^ ^ @@ -607,8 +619,10 @@ TEST_P(IcuLanguageSegmenterAllLocalesTest, ResetToTermAfterUtf32WordConnector) { auto segmenter, language_segmenter_factory::Create( GetSegmenterOptions(GetLocale(), jni_cache_.get()))); constexpr std::string_view kText = "package com.google.android name"; - 
ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<LanguageSegmenter::Iterator> itr, - segmenter->Segment(kText)); + ICING_ASSERT_OK_AND_ASSIGN( + std::unique_ptr<LanguageSegmenter::Iterator> itr, + segmenter->Segment(kText, + LanguageSegmenter::AccessType::kForwardIterator)); // String: "package com.google.android name" // ^ ^^ ^^ @@ -630,8 +644,10 @@ TEST_P(IcuLanguageSegmenterAllLocalesTest, ResetToTermAfterUtf32OutOfBounds) { auto segmenter, language_segmenter_factory::Create( GetSegmenterOptions(GetLocale(), jni_cache_.get()))); constexpr std::string_view kText = "How are you你好吗お元気ですか"; - ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<LanguageSegmenter::Iterator> itr, - segmenter->Segment(kText)); + ICING_ASSERT_OK_AND_ASSIGN( + std::unique_ptr<LanguageSegmenter::Iterator> itr, + segmenter->Segment(kText, + LanguageSegmenter::AccessType::kForwardIterator)); // String: "How are you你好吗お元気ですか" // ^ ^^ ^^ ^ ^ ^ ^ ^ ^ @@ -661,13 +677,15 @@ TEST_P(IcuLanguageSegmenterAllLocalesTest, constexpr std::string_view kText = "How are𡔖 you你好吗お元気ですか"; ICING_ASSERT_OK_AND_ASSIGN( std::unique_ptr<LanguageSegmenter::Iterator> advance_itr, - segmenter->Segment(kText)); + segmenter->Segment(kText, + LanguageSegmenter::AccessType::kForwardIterator)); std::vector<std::string_view> advance_terms = GetAllTermsAdvance(advance_itr.get()); ICING_ASSERT_OK_AND_ASSIGN( std::unique_ptr<LanguageSegmenter::Iterator> reset_to_term_itr, - segmenter->Segment(kText)); + segmenter->Segment(kText, + LanguageSegmenter::AccessType::kForwardIterator)); std::vector<std::string_view> reset_terms = GetAllTermsResetAfterUtf32(reset_to_term_itr.get()); @@ -683,13 +701,15 @@ TEST_P(IcuLanguageSegmenterAllLocalesTest, constexpr std::string_view kThai = "ฉันเดินไปทำงานทุกวัน"; ICING_ASSERT_OK_AND_ASSIGN( std::unique_ptr<LanguageSegmenter::Iterator> advance_itr, - segmenter->Segment(kThai)); + segmenter->Segment(kThai, + LanguageSegmenter::AccessType::kForwardIterator)); std::vector<std::string_view> advance_terms = 
GetAllTermsAdvance(advance_itr.get()); ICING_ASSERT_OK_AND_ASSIGN( std::unique_ptr<LanguageSegmenter::Iterator> reset_to_term_itr, - segmenter->Segment(kThai)); + segmenter->Segment(kThai, + LanguageSegmenter::AccessType::kForwardIterator)); std::vector<std::string_view> reset_terms = GetAllTermsResetAfterUtf32(reset_to_term_itr.get()); @@ -705,13 +725,15 @@ TEST_P(IcuLanguageSegmenterAllLocalesTest, constexpr std::string_view kKorean = "나는 매일 출근합니다."; ICING_ASSERT_OK_AND_ASSIGN( std::unique_ptr<LanguageSegmenter::Iterator> advance_itr, - segmenter->Segment(kKorean)); + segmenter->Segment(kKorean, + LanguageSegmenter::AccessType::kForwardIterator)); std::vector<std::string_view> advance_terms = GetAllTermsAdvance(advance_itr.get()); ICING_ASSERT_OK_AND_ASSIGN( std::unique_ptr<LanguageSegmenter::Iterator> reset_to_term_itr, - segmenter->Segment(kKorean)); + segmenter->Segment(kKorean, + LanguageSegmenter::AccessType::kForwardIterator)); std::vector<std::string_view> reset_terms = GetAllTermsResetAfterUtf32(reset_to_term_itr.get()); @@ -731,13 +753,15 @@ TEST_P(IcuLanguageSegmenterAllLocalesTest, constexpr std::string_view kText = "How are𡔖 you你好吗お元気ですか"; ICING_ASSERT_OK_AND_ASSIGN( std::unique_ptr<LanguageSegmenter::Iterator> advance_itr, - segmenter->Segment(kText)); + segmenter->Segment(kText, + LanguageSegmenter::AccessType::kForwardIterator)); std::vector<std::string_view> advance_terms = GetAllTermsAdvance(advance_itr.get()); ICING_ASSERT_OK_AND_ASSIGN( std::unique_ptr<LanguageSegmenter::Iterator> advance_and_reset_itr, - segmenter->Segment(kText)); + segmenter->Segment(kText, + LanguageSegmenter::AccessType::kForwardIterator)); std::vector<std::string_view> advance_and_reset_terms = GetAllTermsAdvanceAndResetAfterUtf32(advance_and_reset_itr.get()); @@ -754,13 +778,15 @@ TEST_P(IcuLanguageSegmenterAllLocalesTest, constexpr std::string_view kThai = "ฉันเดินไปทำงานทุกวัน"; ICING_ASSERT_OK_AND_ASSIGN( std::unique_ptr<LanguageSegmenter::Iterator> advance_itr, - 
segmenter->Segment(kThai)); + segmenter->Segment(kThai, + LanguageSegmenter::AccessType::kForwardIterator)); std::vector<std::string_view> advance_terms = GetAllTermsAdvance(advance_itr.get()); ICING_ASSERT_OK_AND_ASSIGN( std::unique_ptr<LanguageSegmenter::Iterator> advance_and_reset_itr, - segmenter->Segment(kThai)); + segmenter->Segment(kThai, + LanguageSegmenter::AccessType::kForwardIterator)); std::vector<std::string_view> advance_and_reset_terms = GetAllTermsAdvanceAndResetAfterUtf32(advance_and_reset_itr.get()); @@ -777,13 +803,15 @@ TEST_P(IcuLanguageSegmenterAllLocalesTest, constexpr std::string_view kKorean = "나는 매일 출근합니다."; ICING_ASSERT_OK_AND_ASSIGN( std::unique_ptr<LanguageSegmenter::Iterator> advance_itr, - segmenter->Segment(kKorean)); + segmenter->Segment(kKorean, + LanguageSegmenter::AccessType::kForwardIterator)); std::vector<std::string_view> advance_terms = GetAllTermsAdvance(advance_itr.get()); ICING_ASSERT_OK_AND_ASSIGN( std::unique_ptr<LanguageSegmenter::Iterator> advance_and_reset_itr, - segmenter->Segment(kKorean)); + segmenter->Segment(kKorean, + LanguageSegmenter::AccessType::kForwardIterator)); std::vector<std::string_view> advance_and_reset_terms = GetAllTermsAdvanceAndResetAfterUtf32(advance_and_reset_itr.get()); @@ -800,7 +828,9 @@ TEST_P(IcuLanguageSegmenterAllLocalesTest, GetSegmenterOptions(GetLocale(), jni_cache_.get()))); ICING_ASSERT_OK_AND_ASSIGN( std::unique_ptr<LanguageSegmenter::Iterator> itr, - language_segmenter->Segment("How are you你好吗お元気ですか")); + language_segmenter->Segment( + "How are you你好吗お元気ですか", + LanguageSegmenter::AccessType::kForwardIterator)); // String: "How are you你好吗お元気ですか" // ^ ^^ ^^ ^ ^ ^ ^ ^ ^ @@ -837,8 +867,10 @@ TEST_P(IcuLanguageSegmenterAllLocalesTest, GetSegmenterOptions(GetLocale(), jni_cache_.get()))); // Multiple continuous whitespaces are treated as one. 
constexpr std::string_view kTextWithSpace = "Hello World"; - ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<LanguageSegmenter::Iterator> itr, - language_segmenter->Segment(kTextWithSpace)); + ICING_ASSERT_OK_AND_ASSIGN( + std::unique_ptr<LanguageSegmenter::Iterator> itr, + language_segmenter->Segment( + kTextWithSpace, LanguageSegmenter::AccessType::kForwardIterator)); // String: "Hello World" // ^ ^ ^ @@ -877,8 +909,10 @@ TEST_P(IcuLanguageSegmenterAllLocalesTest, ChineseResetToTermAfterUtf32) { // CJKT (Chinese, Japanese, Khmer, Thai) are the 4 main languages that // don't have whitespaces as word delimiter. Chinese constexpr std::string_view kChinese = "我每天走路去上班。"; - ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<LanguageSegmenter::Iterator> itr, - language_segmenter->Segment(kChinese)); + ICING_ASSERT_OK_AND_ASSIGN( + std::unique_ptr<LanguageSegmenter::Iterator> itr, + language_segmenter->Segment( + kChinese, LanguageSegmenter::AccessType::kForwardIterator)); // String: "我每天走路去上班。" // ^ ^ ^ ^^ ^ // UTF-8 idx: 0 3 9 15 18 24 @@ -904,8 +938,10 @@ TEST_P(IcuLanguageSegmenterAllLocalesTest, JapaneseResetToTermAfterUtf32) { GetSegmenterOptions(GetLocale(), jni_cache_.get()))); // Japanese constexpr std::string_view kJapanese = "私は毎日仕事に歩いています。"; - ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<LanguageSegmenter::Iterator> itr, - language_segmenter->Segment(kJapanese)); + ICING_ASSERT_OK_AND_ASSIGN( + std::unique_ptr<LanguageSegmenter::Iterator> itr, + language_segmenter->Segment( + kJapanese, LanguageSegmenter::AccessType::kForwardIterator)); // String: "私は毎日仕事に歩いています。" // ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ // UTF-8 idx: 0 3 6 12 18212427 33 39 @@ -930,8 +966,10 @@ TEST_P(IcuLanguageSegmenterAllLocalesTest, KhmerResetToTermAfterUtf32) { language_segmenter_factory::Create( GetSegmenterOptions(GetLocale(), jni_cache_.get()))); constexpr std::string_view kKhmer = "ញុំដើរទៅធ្វើការរាល់ថ្ងៃ។"; - ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<LanguageSegmenter::Iterator> itr, - 
language_segmenter->Segment(kKhmer)); + ICING_ASSERT_OK_AND_ASSIGN( + std::unique_ptr<LanguageSegmenter::Iterator> itr, + language_segmenter->Segment( + kKhmer, LanguageSegmenter::AccessType::kForwardIterator)); // String: "ញុំដើរទៅធ្វើការរាល់ថ្ងៃ។" // ^ ^ ^ ^ ^ // UTF-8 idx: 0 9 24 45 69 @@ -957,8 +995,10 @@ TEST_P(IcuLanguageSegmenterAllLocalesTest, ThaiResetToTermAfterUtf32) { GetSegmenterOptions(GetLocale(), jni_cache_.get()))); // Thai constexpr std::string_view kThai = "ฉันเดินไปทำงานทุกวัน"; - ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<LanguageSegmenter::Iterator> itr, - language_segmenter->Segment(kThai)); + ICING_ASSERT_OK_AND_ASSIGN( + std::unique_ptr<LanguageSegmenter::Iterator> itr, + language_segmenter->Segment( + kThai, LanguageSegmenter::AccessType::kForwardIterator)); // String: "ฉันเดินไปทำงานทุกวัน" // ^ ^ ^ ^ ^ ^ // UTF-8 idx: 0 9 21 27 42 51 @@ -983,8 +1023,10 @@ TEST_P(IcuLanguageSegmenterAllLocalesTest, auto segmenter, language_segmenter_factory::Create( GetSegmenterOptions(GetLocale(), jni_cache_.get()))); constexpr std::string_view kText = "package name com.google.android!"; - ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<LanguageSegmenter::Iterator> itr, - segmenter->Segment(kText)); + ICING_ASSERT_OK_AND_ASSIGN( + std::unique_ptr<LanguageSegmenter::Iterator> itr, + segmenter->Segment(kText, + LanguageSegmenter::AccessType::kForwardIterator)); // String: "package name com.google.android!" 
// ^ ^^ ^^ ^ @@ -1006,8 +1048,10 @@ TEST_P(IcuLanguageSegmenterAllLocalesTest, ResetToTermBeforeOutOfBoundsUtf32) { auto segmenter, language_segmenter_factory::Create( GetSegmenterOptions(GetLocale(), jni_cache_.get()))); constexpr std::string_view kText = "How are you你好吗お元気ですか"; - ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<LanguageSegmenter::Iterator> itr, - segmenter->Segment(kText)); + ICING_ASSERT_OK_AND_ASSIGN( + std::unique_ptr<LanguageSegmenter::Iterator> itr, + segmenter->Segment(kText, + LanguageSegmenter::AccessType::kForwardIterator)); // String: "How are you你好吗お元気ですか" // ^ ^^ ^^ ^ ^ ^ ^ ^ ^ @@ -1037,13 +1081,15 @@ TEST_P(IcuLanguageSegmenterAllLocalesTest, constexpr std::string_view kText = "How are𡔖 you你好吗お元気ですか"; ICING_ASSERT_OK_AND_ASSIGN( std::unique_ptr<LanguageSegmenter::Iterator> advance_itr, - segmenter->Segment(kText)); + segmenter->Segment(kText, + LanguageSegmenter::AccessType::kForwardIterator)); std::vector<std::string_view> advance_terms = GetAllTermsAdvance(advance_itr.get()); ICING_ASSERT_OK_AND_ASSIGN( std::unique_ptr<LanguageSegmenter::Iterator> reset_to_term_itr, - segmenter->Segment(kText)); + segmenter->Segment(kText, + LanguageSegmenter::AccessType::kForwardIterator)); std::vector<std::string_view> reset_terms = GetAllTermsResetBeforeUtf32(reset_to_term_itr.get()); std::reverse(reset_terms.begin(), reset_terms.end()); @@ -1061,13 +1107,15 @@ TEST_P(IcuLanguageSegmenterAllLocalesTest, constexpr std::string_view kThai = "ฉันเดินไปทำงานทุกวัน"; ICING_ASSERT_OK_AND_ASSIGN( std::unique_ptr<LanguageSegmenter::Iterator> advance_itr, - segmenter->Segment(kThai)); + segmenter->Segment(kThai, + LanguageSegmenter::AccessType::kForwardIterator)); std::vector<std::string_view> advance_terms = GetAllTermsAdvance(advance_itr.get()); ICING_ASSERT_OK_AND_ASSIGN( std::unique_ptr<LanguageSegmenter::Iterator> reset_to_term_itr, - segmenter->Segment(kThai)); + segmenter->Segment(kThai, + LanguageSegmenter::AccessType::kForwardIterator)); 
std::vector<std::string_view> reset_terms = GetAllTermsResetBeforeUtf32(reset_to_term_itr.get()); std::reverse(reset_terms.begin(), reset_terms.end()); @@ -1084,13 +1132,15 @@ TEST_P(IcuLanguageSegmenterAllLocalesTest, constexpr std::string_view kKorean = "나는 매일 출근합니다."; ICING_ASSERT_OK_AND_ASSIGN( std::unique_ptr<LanguageSegmenter::Iterator> advance_itr, - segmenter->Segment(kKorean)); + segmenter->Segment(kKorean, + LanguageSegmenter::AccessType::kForwardIterator)); std::vector<std::string_view> advance_terms = GetAllTermsAdvance(advance_itr.get()); ICING_ASSERT_OK_AND_ASSIGN( std::unique_ptr<LanguageSegmenter::Iterator> reset_to_term_itr, - segmenter->Segment(kKorean)); + segmenter->Segment(kKorean, + LanguageSegmenter::AccessType::kForwardIterator)); std::vector<std::string_view> reset_terms = GetAllTermsResetBeforeUtf32(reset_to_term_itr.get()); std::reverse(reset_terms.begin(), reset_terms.end()); @@ -1107,7 +1157,9 @@ TEST_P(IcuLanguageSegmenterAllLocalesTest, GetSegmenterOptions(GetLocale(), jni_cache_.get()))); ICING_ASSERT_OK_AND_ASSIGN( std::unique_ptr<LanguageSegmenter::Iterator> itr, - language_segmenter->Segment("How are you你好吗お元気ですか")); + language_segmenter->Segment( + "How are you你好吗お元気ですか", + LanguageSegmenter::AccessType::kForwardIterator)); // String: "How are you你好吗お元気ですか" // ^ ^^ ^^ ^ ^ ^ ^ ^ ^ @@ -1145,8 +1197,10 @@ TEST_P(IcuLanguageSegmenterAllLocalesTest, GetSegmenterOptions(GetLocale(), jni_cache_.get()))); // Multiple continuous whitespaces are treated as one. 
constexpr std::string_view kTextWithSpace = "Hello World"; - ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<LanguageSegmenter::Iterator> itr, - language_segmenter->Segment(kTextWithSpace)); + ICING_ASSERT_OK_AND_ASSIGN( + std::unique_ptr<LanguageSegmenter::Iterator> itr, + language_segmenter->Segment( + kTextWithSpace, LanguageSegmenter::AccessType::kForwardIterator)); // String: "Hello World" // ^ ^ ^ @@ -1184,8 +1238,10 @@ TEST_P(IcuLanguageSegmenterAllLocalesTest, ChineseResetToTermBeforeUtf32) { // CJKT (Chinese, Japanese, Khmer, Thai) are the 4 main languages that // don't have whitespaces as word delimiter. Chinese constexpr std::string_view kChinese = "我每天走路去上班。"; - ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<LanguageSegmenter::Iterator> itr, - language_segmenter->Segment(kChinese)); + ICING_ASSERT_OK_AND_ASSIGN( + std::unique_ptr<LanguageSegmenter::Iterator> itr, + language_segmenter->Segment( + kChinese, LanguageSegmenter::AccessType::kForwardIterator)); // String: "我每天走路去上班。" // ^ ^ ^ ^^ // UTF-8 idx: 0 3 9 15 18 @@ -1208,8 +1264,10 @@ TEST_P(IcuLanguageSegmenterAllLocalesTest, JapaneseResetToTermBeforeUtf32) { GetSegmenterOptions(GetLocale(), jni_cache_.get()))); // Japanese constexpr std::string_view kJapanese = "私は毎日仕事に歩いています。"; - ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<LanguageSegmenter::Iterator> itr, - language_segmenter->Segment(kJapanese)); + ICING_ASSERT_OK_AND_ASSIGN( + std::unique_ptr<LanguageSegmenter::Iterator> itr, + language_segmenter->Segment( + kJapanese, LanguageSegmenter::AccessType::kForwardIterator)); // String: "私は毎日仕事に歩いています。" // ^ ^ ^ ^ ^ ^ ^ ^ ^ // UTF-8 idx: 0 3 6 12 18212427 33 @@ -1231,8 +1289,10 @@ TEST_P(IcuLanguageSegmenterAllLocalesTest, KhmerResetToTermBeforeUtf32) { language_segmenter_factory::Create( GetSegmenterOptions(GetLocale(), jni_cache_.get()))); constexpr std::string_view kKhmer = "ញុំដើរទៅធ្វើការរាល់ថ្ងៃ។"; - ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<LanguageSegmenter::Iterator> itr, - 
language_segmenter->Segment(kKhmer)); + ICING_ASSERT_OK_AND_ASSIGN( + std::unique_ptr<LanguageSegmenter::Iterator> itr, + language_segmenter->Segment( + kKhmer, LanguageSegmenter::AccessType::kForwardIterator)); // String: "ញុំដើរទៅធ្វើការរាល់ថ្ងៃ។" // ^ ^ ^ ^ // UTF-8 idx: 0 9 24 45 @@ -1255,8 +1315,10 @@ TEST_P(IcuLanguageSegmenterAllLocalesTest, ThaiResetToTermBeforeUtf32) { GetSegmenterOptions(GetLocale(), jni_cache_.get()))); // Thai constexpr std::string_view kThai = "ฉันเดินไปทำงานทุกวัน"; - ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<LanguageSegmenter::Iterator> itr, - language_segmenter->Segment(kThai)); + ICING_ASSERT_OK_AND_ASSIGN( + std::unique_ptr<LanguageSegmenter::Iterator> itr, + language_segmenter->Segment( + kThai, LanguageSegmenter::AccessType::kForwardIterator)); // String: "ฉันเดินไปทำงานทุกวัน" // ^ ^ ^ ^ ^ ^ // UTF-8 idx: 0 9 21 27 42 51 diff --git a/icing/tokenization/language-segmenter-iterator_test.cc b/icing/tokenization/language-segmenter-iterator_test.cc index 3aff45c..b14ce19 100644 --- a/icing/tokenization/language-segmenter-iterator_test.cc +++ b/icing/tokenization/language-segmenter-iterator_test.cc @@ -54,8 +54,10 @@ TEST_F(LanguageSegmenterIteratorTest, AdvanceAndGetTerm) { ICING_ASSERT_OK_AND_ASSIGN( auto language_segmenter, language_segmenter_factory::Create(std::move(options))); - ICING_ASSERT_OK_AND_ASSIGN(auto iterator, - language_segmenter->Segment("foo bar")); + ICING_ASSERT_OK_AND_ASSIGN( + auto iterator, + language_segmenter->Segment( + "foo bar", LanguageSegmenter::AccessType::kForwardIterator)); EXPECT_TRUE(iterator->Advance()); EXPECT_THAT(iterator->GetTerm(), Eq("foo")); @@ -76,8 +78,10 @@ TEST_F(LanguageSegmenterIteratorTest, ICING_ASSERT_OK_AND_ASSIGN( auto language_segmenter, language_segmenter_factory::Create(std::move(options))); - ICING_ASSERT_OK_AND_ASSIGN(auto iterator, - language_segmenter->Segment("foo bar")); + ICING_ASSERT_OK_AND_ASSIGN( + auto iterator, + language_segmenter->Segment( + "foo bar", 
LanguageSegmenter::AccessType::kBidirectionalIterator)); EXPECT_THAT(iterator->ResetToTermStartingAfterUtf32(/*offset=*/0), IsOkAndHolds(3)); // The term " " @@ -94,8 +98,10 @@ TEST_F(LanguageSegmenterIteratorTest, ICING_ASSERT_OK_AND_ASSIGN( auto language_segmenter, language_segmenter_factory::Create(std::move(options))); - ICING_ASSERT_OK_AND_ASSIGN(auto iterator, - language_segmenter->Segment("foo bar")); + ICING_ASSERT_OK_AND_ASSIGN( + auto iterator, + language_segmenter->Segment( + "foo bar", LanguageSegmenter::AccessType::kBidirectionalIterator)); EXPECT_THAT(iterator->ResetToTermStartingAfterUtf32(/*offset=*/-1), IsOk()); @@ -113,7 +119,10 @@ TEST_F(LanguageSegmenterIteratorTest, ICING_ASSERT_OK_AND_ASSIGN( auto language_segmenter, language_segmenter_factory::Create(std::move(options))); - ICING_ASSERT_OK_AND_ASSIGN(auto iterator, language_segmenter->Segment(text)); + ICING_ASSERT_OK_AND_ASSIGN( + auto iterator, + language_segmenter->Segment( + text, LanguageSegmenter::AccessType::kBidirectionalIterator)); EXPECT_THAT(iterator->ResetToTermStartingAfterUtf32(/*offset=*/text.length()), StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT)); @@ -127,7 +136,10 @@ TEST_F(LanguageSegmenterIteratorTest, ICING_ASSERT_OK_AND_ASSIGN( auto language_segmenter, language_segmenter_factory::Create(std::move(options))); - ICING_ASSERT_OK_AND_ASSIGN(auto iterator, language_segmenter->Segment(text)); + ICING_ASSERT_OK_AND_ASSIGN( + auto iterator, + language_segmenter->Segment( + text, LanguageSegmenter::AccessType::kBidirectionalIterator)); EXPECT_THAT(iterator->ResetToTermStartingAfterUtf32(/*offset=*/100), StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT)); @@ -140,8 +152,10 @@ TEST_F(LanguageSegmenterIteratorTest, ICING_ASSERT_OK_AND_ASSIGN( auto language_segmenter, language_segmenter_factory::Create(std::move(options))); - ICING_ASSERT_OK_AND_ASSIGN(auto iterator, - language_segmenter->Segment("foo bar")); + ICING_ASSERT_OK_AND_ASSIGN( + auto iterator, + 
language_segmenter->Segment( + "foo bar", LanguageSegmenter::AccessType::kBidirectionalIterator)); EXPECT_THAT(iterator->ResetToTermEndingBeforeUtf32(/*offset=*/6), IsOkAndHolds(3)); // The term " " @@ -158,8 +172,10 @@ TEST_F(LanguageSegmenterIteratorTest, ICING_ASSERT_OK_AND_ASSIGN( auto language_segmenter, language_segmenter_factory::Create(std::move(options))); - ICING_ASSERT_OK_AND_ASSIGN(auto iterator, - language_segmenter->Segment("foo bar")); + ICING_ASSERT_OK_AND_ASSIGN( + auto iterator, + language_segmenter->Segment( + "foo bar", LanguageSegmenter::AccessType::kBidirectionalIterator)); // Zero is a valid argument, but there aren't any terms that end before it. EXPECT_THAT(iterator->ResetToTermEndingBeforeUtf32(/*offset=*/0), @@ -173,8 +189,10 @@ TEST_F(LanguageSegmenterIteratorTest, ICING_ASSERT_OK_AND_ASSIGN( auto language_segmenter, language_segmenter_factory::Create(std::move(options))); - ICING_ASSERT_OK_AND_ASSIGN(auto iterator, - language_segmenter->Segment("foo bar")); + ICING_ASSERT_OK_AND_ASSIGN( + auto iterator, + language_segmenter->Segment( + "foo bar", LanguageSegmenter::AccessType::kBidirectionalIterator)); EXPECT_THAT(iterator->ResetToTermEndingBeforeUtf32(/*offset=*/-1), StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT)); @@ -191,7 +209,10 @@ TEST_F(LanguageSegmenterIteratorTest, ICING_ASSERT_OK_AND_ASSIGN( auto language_segmenter, language_segmenter_factory::Create(std::move(options))); - ICING_ASSERT_OK_AND_ASSIGN(auto iterator, language_segmenter->Segment(text)); + ICING_ASSERT_OK_AND_ASSIGN( + auto iterator, + language_segmenter->Segment( + text, LanguageSegmenter::AccessType::kBidirectionalIterator)); EXPECT_THAT(iterator->ResetToTermEndingBeforeUtf32(/*offset=*/text.length()), IsOk()); diff --git a/icing/tokenization/language-segmenter.h b/icing/tokenization/language-segmenter.h index 913386a..83a47d4 100644 --- a/icing/tokenization/language-segmenter.h +++ b/icing/tokenization/language-segmenter.h @@ -38,6 +38,11 @@ 
namespace lib { // segmenter->GetAllTerms(text)); class LanguageSegmenter { public: + enum class AccessType { + kForwardIterator, + kBidirectionalIterator, + }; + virtual ~LanguageSegmenter() = default; // An iterator helping to find terms in the input text. @@ -165,7 +170,7 @@ class LanguageSegmenter { // outlives the returned iterator. virtual libtextclassifier3::StatusOr< std::unique_ptr<LanguageSegmenter::Iterator>> - Segment(std::string_view text) const = 0; + Segment(std::string_view text, AccessType access_type) const = 0; // Segments and returns all terms in the input text. // diff --git a/icing/tokenization/language-segmenter_benchmark.cc b/icing/tokenization/language-segmenter_benchmark.cc index 748a322..50c625e 100644 --- a/icing/tokenization/language-segmenter_benchmark.cc +++ b/icing/tokenization/language-segmenter_benchmark.cc @@ -68,7 +68,10 @@ void BM_SegmentNoSpace(benchmark::State& state) { for (auto _ : state) { std::unique_ptr<LanguageSegmenter::Iterator> iterator = - language_segmenter->Segment(input_string).ValueOrDie(); + language_segmenter + ->Segment(input_string, + LanguageSegmenter::AccessType::kForwardIterator) + .ValueOrDie(); while (iterator->Advance()) { iterator->GetTerm(); } @@ -108,7 +111,10 @@ void BM_SegmentWithSpaces(benchmark::State& state) { for (auto _ : state) { std::unique_ptr<LanguageSegmenter::Iterator> iterator = - language_segmenter->Segment(input_string).ValueOrDie(); + language_segmenter + ->Segment(input_string, + LanguageSegmenter::AccessType::kForwardIterator) + .ValueOrDie(); while (iterator->Advance()) { iterator->GetTerm(); } @@ -148,7 +154,10 @@ void BM_SegmentCJK(benchmark::State& state) { for (auto _ : state) { std::unique_ptr<LanguageSegmenter::Iterator> iterator = - language_segmenter->Segment(input_string).ValueOrDie(); + language_segmenter + ->Segment(input_string, + LanguageSegmenter::AccessType::kForwardIterator) + .ValueOrDie(); while (iterator->Advance()) { iterator->GetTerm(); } diff --git 
a/icing/tokenization/plain-tokenizer.cc b/icing/tokenization/plain-tokenizer.cc index d40022b..9175f3a 100644 --- a/icing/tokenization/plain-tokenizer.cc +++ b/icing/tokenization/plain-tokenizer.cc @@ -130,17 +130,19 @@ class PlainTokenIterator : public Tokenizer::Iterator { }; libtextclassifier3::StatusOr<std::unique_ptr<Tokenizer::Iterator>> -PlainTokenizer::Tokenize(std::string_view text) const { +PlainTokenizer::Tokenize(std::string_view text, + LanguageSegmenter::AccessType access_type) const { ICING_ASSIGN_OR_RETURN( std::unique_ptr<LanguageSegmenter::Iterator> base_iterator, - language_segmenter_.Segment(text)); + language_segmenter_.Segment(text, access_type)); return std::make_unique<PlainTokenIterator>(std::move(base_iterator)); } libtextclassifier3::StatusOr<std::vector<Token>> PlainTokenizer::TokenizeAll( std::string_view text) const { - ICING_ASSIGN_OR_RETURN(std::unique_ptr<Tokenizer::Iterator> iterator, - Tokenize(text)); + ICING_ASSIGN_OR_RETURN( + std::unique_ptr<Tokenizer::Iterator> iterator, + Tokenize(text, LanguageSegmenter::AccessType::kForwardIterator)); std::vector<Token> tokens; while (iterator->Advance()) { std::vector<Token> batch_tokens = iterator->GetTokens(); diff --git a/icing/tokenization/plain-tokenizer.h b/icing/tokenization/plain-tokenizer.h index 25b40fd..61a8b5a 100644 --- a/icing/tokenization/plain-tokenizer.h +++ b/icing/tokenization/plain-tokenizer.h @@ -33,7 +33,8 @@ class PlainTokenizer : public Tokenizer { : language_segmenter_(*language_segmenter) {} libtextclassifier3::StatusOr<std::unique_ptr<Tokenizer::Iterator>> Tokenize( - std::string_view text) const override; + std::string_view text, + LanguageSegmenter::AccessType access_type) const override; libtextclassifier3::StatusOr<std::vector<Token>> TokenizeAll( std::string_view text) const override; diff --git a/icing/tokenization/plain-tokenizer_test.cc b/icing/tokenization/plain-tokenizer_test.cc index 6c426da..f94a558 100644 --- 
a/icing/tokenization/plain-tokenizer_test.cc +++ b/icing/tokenization/plain-tokenizer_test.cc @@ -25,6 +25,7 @@ #include "icing/testing/jni-test-helpers.h" #include "icing/testing/test-data.h" #include "icing/tokenization/language-segmenter-factory.h" +#include "icing/tokenization/language-segmenter.h" #include "icing/tokenization/tokenizer-factory.h" #include "unicode/uloc.h" @@ -67,8 +68,10 @@ TEST_F(PlainTokenizerTest, NoTokensBeforeAdvancing) { language_segmenter.get())); constexpr std::string_view kText = "Hello, world!"; - ICING_ASSERT_OK_AND_ASSIGN(auto token_iterator, - plain_tokenizer->Tokenize(kText)); + ICING_ASSERT_OK_AND_ASSIGN( + auto token_iterator, + plain_tokenizer->Tokenize( + kText, LanguageSegmenter::AccessType::kForwardIterator)); // We should get no tokens if we get the token before advancing. EXPECT_THAT(token_iterator->GetTokens(), IsEmpty()); @@ -86,8 +89,10 @@ TEST_F(PlainTokenizerTest, LastTokenAfterFullyAdvanced) { language_segmenter.get())); constexpr std::string_view kText = "Hello, world!"; - ICING_ASSERT_OK_AND_ASSIGN(auto token_iterator, - plain_tokenizer->Tokenize(kText)); + ICING_ASSERT_OK_AND_ASSIGN( + auto token_iterator, + plain_tokenizer->Tokenize( + kText, LanguageSegmenter::AccessType::kForwardIterator)); while (token_iterator->Advance()) {} @@ -344,7 +349,10 @@ TEST_F(PlainTokenizerTest, ResetToTokenStartingAfterSimple) { language_segmenter.get())); constexpr std::string_view kText = "f b"; - auto iterator = plain_tokenizer->Tokenize(kText).ValueOrDie(); + auto iterator = + plain_tokenizer + ->Tokenize(kText, LanguageSegmenter::AccessType::kBidirectionalIterator) + .ValueOrDie(); EXPECT_TRUE(iterator->ResetToTokenStartingAfter(0)); EXPECT_THAT(iterator->GetTokens(), @@ -365,7 +373,10 @@ TEST_F(PlainTokenizerTest, ResetToTokenEndingBeforeSimple) { language_segmenter.get())); constexpr std::string_view kText = "f b"; - auto iterator = plain_tokenizer->Tokenize(kText).ValueOrDie(); + auto iterator = + plain_tokenizer + 
->Tokenize(kText, LanguageSegmenter::AccessType::kBidirectionalIterator) + .ValueOrDie(); EXPECT_TRUE(iterator->ResetToTokenEndingBefore(2)); EXPECT_THAT(iterator->GetTokens(), @@ -412,7 +423,10 @@ TEST_F(PlainTokenizerTest, ResetToTokenStartingAfter) { "bat", // 16: " bat" }; - auto iterator = plain_tokenizer->Tokenize(kText).ValueOrDie(); + auto iterator = + plain_tokenizer + ->Tokenize(kText, LanguageSegmenter::AccessType::kBidirectionalIterator) + .ValueOrDie(); EXPECT_TRUE(iterator->Advance()); EXPECT_THAT(iterator->GetTokens(), ElementsAre(EqualsToken(Token::Type::REGULAR, "foo"))); @@ -466,7 +480,10 @@ TEST_F(PlainTokenizerTest, ResetToTokenEndingBefore) { "foo", // 4: "foo " }; - auto iterator = plain_tokenizer->Tokenize(kText).ValueOrDie(); + auto iterator = + plain_tokenizer + ->Tokenize(kText, LanguageSegmenter::AccessType::kBidirectionalIterator) + .ValueOrDie(); EXPECT_TRUE(iterator->Advance()); EXPECT_THAT(iterator->GetTokens(), ElementsAre(EqualsToken(Token::Type::REGULAR, "foo"))); diff --git a/icing/tokenization/raw-query-tokenizer.cc b/icing/tokenization/raw-query-tokenizer.cc index 1dcbf9b..aca317c 100644 --- a/icing/tokenization/raw-query-tokenizer.cc +++ b/icing/tokenization/raw-query-tokenizer.cc @@ -690,7 +690,8 @@ class RawQueryTokenIterator : public Tokenizer::Iterator { } // namespace libtextclassifier3::StatusOr<std::unique_ptr<Tokenizer::Iterator>> -RawQueryTokenizer::Tokenize(std::string_view text) const { +RawQueryTokenizer::Tokenize(std::string_view text, + LanguageSegmenter::AccessType) const { ICING_ASSIGN_OR_RETURN(std::vector<Token> tokens, TokenizeAll(text)); return std::make_unique<RawQueryTokenIterator>(std::move(tokens)); } diff --git a/icing/tokenization/raw-query-tokenizer.h b/icing/tokenization/raw-query-tokenizer.h index 6316e45..1087b04 100644 --- a/icing/tokenization/raw-query-tokenizer.h +++ b/icing/tokenization/raw-query-tokenizer.h @@ -33,7 +33,7 @@ class RawQueryTokenizer : public Tokenizer { : 
language_segmenter_(*language_segmenter) {} libtextclassifier3::StatusOr<std::unique_ptr<Tokenizer::Iterator>> Tokenize( - std::string_view text) const override; + std::string_view text, LanguageSegmenter::AccessType) const override; libtextclassifier3::StatusOr<std::vector<Token>> TokenizeAll( std::string_view text) const override; diff --git a/icing/tokenization/raw-query-tokenizer_test.cc b/icing/tokenization/raw-query-tokenizer_test.cc index a00f2f7..2044f95 100644 --- a/icing/tokenization/raw-query-tokenizer_test.cc +++ b/icing/tokenization/raw-query-tokenizer_test.cc @@ -21,6 +21,7 @@ #include "icing/testing/icu-data-file-helper.h" #include "icing/testing/test-data.h" #include "icing/tokenization/language-segmenter-factory.h" +#include "icing/tokenization/language-segmenter.h" #include "icing/tokenization/tokenizer-factory.h" #include "icing/tokenization/tokenizer.h" #include "unicode/uloc.h" @@ -60,8 +61,10 @@ TEST_F(RawQueryTokenizerTest, NoTokensBeforeAdvancing) { language_segmenter.get())); constexpr std::string_view kText = "Hello, world!"; - ICING_ASSERT_OK_AND_ASSIGN(auto token_iterator, - raw_query_tokenizer->Tokenize(kText)); + ICING_ASSERT_OK_AND_ASSIGN( + auto token_iterator, + raw_query_tokenizer->Tokenize( + kText, LanguageSegmenter::AccessType::kForwardIterator)); // We should get no tokens if we get the token before advancing. 
EXPECT_THAT(token_iterator->GetTokens(), IsEmpty()); diff --git a/icing/tokenization/reverse_jni/reverse-jni-break-iterator.cc b/icing/tokenization/reverse_jni/reverse-jni-break-iterator.cc index dbd7f5a..4bb7991 100644 --- a/icing/tokenization/reverse_jni/reverse-jni-break-iterator.cc +++ b/icing/tokenization/reverse_jni/reverse-jni-break-iterator.cc @@ -31,18 +31,13 @@ namespace icing { namespace lib { -namespace { -// Chosen based on results in go/reverse-jni-benchmarks -static constexpr int kBatchSize = 100; -} // namespace - // ----------------------------------------------------------------------------- // Implementations that call out to JVM. Behold the beauty. // ----------------------------------------------------------------------------- libtextclassifier3::StatusOr<std::unique_ptr<ReverseJniBreakIterator>> ReverseJniBreakIterator::Create(const JniCache* jni_cache, - std::string_view text, - std::string_view locale) { + std::string_view text, std::string_view locale, + int batch_size) { if (jni_cache == nullptr) { return absl_ports::InvalidArgumentError( "Create must be called with a valid JniCache pointer!"); @@ -90,15 +85,17 @@ ReverseJniBreakIterator::Create(const JniCache* jni_cache, ICING_RETURN_IF_ERROR(libtextclassifier3::JniHelper::CallVoidMethod( jenv, iterator_batcher.get(), jni_cache->breakiterator_settext, java_text.get())); - return std::unique_ptr<ReverseJniBreakIterator>( - new ReverseJniBreakIterator(jni_cache, std::move(iterator_batcher))); + return std::unique_ptr<ReverseJniBreakIterator>(new ReverseJniBreakIterator( + jni_cache, std::move(iterator_batcher), batch_size)); } ReverseJniBreakIterator::ReverseJniBreakIterator( const JniCache* jni_cache, - libtextclassifier3::ScopedGlobalRef<jobject> iterator_batcher) + libtextclassifier3::ScopedGlobalRef<jobject> iterator_batcher, + int batch_size) : jni_cache_(jni_cache), iterator_batcher_(std::move(iterator_batcher)), + batch_size_(batch_size), is_done_(false), is_almost_done_(false) {} @@ 
-113,7 +110,7 @@ int ReverseJniBreakIterator::Next() { is_done_ = true; return ReverseJniBreakIterator::kDone; } - is_almost_done_ = break_indices_cache_.size() < kBatchSize; + is_almost_done_ = break_indices_cache_.size() < batch_size_; } int break_index = break_indices_cache_.front(); break_indices_cache_.pop(); @@ -156,7 +153,7 @@ int ReverseJniBreakIterator::FetchNextBatch() { libtextclassifier3::ScopedLocalRef<jintArray> break_indices, libtextclassifier3::JniHelper::CallObjectMethod<jintArray>( jni_cache_->GetEnv(), iterator_batcher_.get(), - jni_cache_->breakiterator_next, kBatchSize), + jni_cache_->breakiterator_next, batch_size_), ReverseJniBreakIterator::kDone); if (break_indices == nullptr || jni_cache_->ExceptionCheckAndClear()) { return ReverseJniBreakIterator::kDone; diff --git a/icing/tokenization/reverse_jni/reverse-jni-break-iterator.h b/icing/tokenization/reverse_jni/reverse-jni-break-iterator.h index 537666c..b1dcc87 100644 --- a/icing/tokenization/reverse_jni/reverse-jni-break-iterator.h +++ b/icing/tokenization/reverse_jni/reverse-jni-break-iterator.h @@ -44,6 +44,9 @@ namespace lib { // EXPECT_THAT(nexts, ElementsAre(1, 3, 5, 6, 8)); class ReverseJniBreakIterator { public: + // Chosen based on results in go/reverse-jni-benchmarks + static constexpr int kBatchSize = 100; + static constexpr int kDone = -1; // Creates a ReverseJniBreakiterator with the given text and locale. @@ -54,7 +57,7 @@ class ReverseJniBreakIterator { // INTERNAL if unable to create any of the required Java objects static libtextclassifier3::StatusOr<std::unique_ptr<ReverseJniBreakIterator>> Create(const JniCache* jni_cache, std::string_view text, - std::string_view locale); + std::string_view locale, int batch_size); // Returns the UTF-16 boundary following the current boundary. 
If the current // boundary is the last text boundary, it returns @@ -88,9 +91,10 @@ class ReverseJniBreakIterator { private: ReverseJniBreakIterator( const JniCache* jni_cache, - libtextclassifier3::ScopedGlobalRef<jobject> iterator_batcher); + libtextclassifier3::ScopedGlobalRef<jobject> iterator_batcher, + int batch_size); - // Fetches the results of up to kBatchSize next calls and stores them in + // Fetches the results of up to batch_size next calls and stores them in // break_indices_cache_. Returns the number of results or kDone if no more // results could be fetched. int FetchNextBatch(); @@ -109,9 +113,11 @@ class ReverseJniBreakIterator { // BreakIteratorBatcher#next. std::queue<int> break_indices_cache_; + int batch_size_; + bool is_done_; - // The last batch was incomplete (< kBatchSize results were returned). The + // The last batch was incomplete (< batch_size_ results were returned). The // next call to BreakIteratorBatcher#next is guaranteed to return an // empty array. Once the results from the last batch are evicted from // break_indices_cache, ReverseJniBreakIterator will transition to is_done_. diff --git a/icing/tokenization/reverse_jni/reverse-jni-language-segmenter.cc b/icing/tokenization/reverse_jni/reverse-jni-language-segmenter.cc index bd80718..e6bcf4b 100644 --- a/icing/tokenization/reverse_jni/reverse-jni-language-segmenter.cc +++ b/icing/tokenization/reverse_jni/reverse-jni-language-segmenter.cc @@ -293,18 +293,28 @@ class ReverseJniLanguageSegmenterIterator : public LanguageSegmenter::Iterator { }; libtextclassifier3::StatusOr<std::unique_ptr<LanguageSegmenter::Iterator>> -ReverseJniLanguageSegmenter::Segment(const std::string_view text) const { +ReverseJniLanguageSegmenter::Segment( + const std::string_view text, + LanguageSegmenter::AccessType access_type) const { + // Only batch if we're only doing forward iteration. Bidirectional iteration + // will result in us frequently discarding unconsumed batched word breaks. 
+ // Therefore, we won't bother batching them. + int batch_size = + (access_type == LanguageSegmenter::AccessType::kForwardIterator) + ? ReverseJniBreakIterator::kBatchSize + : 1; ICING_ASSIGN_OR_RETURN( std::unique_ptr<ReverseJniBreakIterator> break_iterator, - ReverseJniBreakIterator::Create(jni_cache_, text, locale_)); + ReverseJniBreakIterator::Create(jni_cache_, text, locale_, batch_size)); return std::make_unique<ReverseJniLanguageSegmenterIterator>( text, std::move(break_iterator)); } libtextclassifier3::StatusOr<std::vector<std::string_view>> ReverseJniLanguageSegmenter::GetAllTerms(const std::string_view text) const { - ICING_ASSIGN_OR_RETURN(std::unique_ptr<LanguageSegmenter::Iterator> iterator, - Segment(text)); + ICING_ASSIGN_OR_RETURN( + std::unique_ptr<LanguageSegmenter::Iterator> iterator, + Segment(text, LanguageSegmenter::AccessType::kForwardIterator)); std::vector<std::string_view> terms; while (iterator->Advance()) { terms.push_back(iterator->GetTerm()); diff --git a/icing/tokenization/reverse_jni/reverse-jni-language-segmenter.h b/icing/tokenization/reverse_jni/reverse-jni-language-segmenter.h index 29df4ee..e9f84ad 100644 --- a/icing/tokenization/reverse_jni/reverse-jni-language-segmenter.h +++ b/icing/tokenization/reverse_jni/reverse-jni-language-segmenter.h @@ -34,7 +34,8 @@ class ReverseJniLanguageSegmenter : public LanguageSegmenter { : locale_(std::move(locale)), jni_cache_(jni_cache) {} libtextclassifier3::StatusOr<std::unique_ptr<LanguageSegmenter::Iterator>> - Segment(std::string_view text) const override; + Segment(std::string_view text, + LanguageSegmenter::AccessType access_type) const override; libtextclassifier3::StatusOr<std::vector<std::string_view>> GetAllTerms( std::string_view text) const override; diff --git a/icing/tokenization/reverse_jni/reverse-jni-language-segmenter_test.cc b/icing/tokenization/reverse_jni/reverse-jni-language-segmenter_test.cc index 47a01fe..be652ff 100644 --- 
a/icing/tokenization/reverse_jni/reverse-jni-language-segmenter_test.cc +++ b/icing/tokenization/reverse_jni/reverse-jni-language-segmenter_test.cc @@ -394,8 +394,10 @@ TEST_P(ReverseJniLanguageSegmenterTest, ContinuousWhitespaces) { // iterator is done. text_with_spaces = absl_ports::StrCat(std::string(kNumSeparators, ' '), "Hello", " ", "World"); - ICING_ASSERT_OK_AND_ASSIGN(auto itr, - language_segmenter->Segment(text_with_spaces)); + ICING_ASSERT_OK_AND_ASSIGN( + auto itr, + language_segmenter->Segment( + text_with_spaces, LanguageSegmenter::AccessType::kForwardIterator)); std::vector<std::string_view> terms; while (itr->Advance()) { terms.push_back(itr->GetTerm()); @@ -491,8 +493,10 @@ TEST_P(ReverseJniLanguageSegmenterTest, ResetToStartUtf32WordConnector) { auto segmenter, language_segmenter_factory::Create( GetSegmenterOptions(GetLocale(), jni_cache_.get()))); constexpr std::string_view kText = "com:google:android is package"; - ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<LanguageSegmenter::Iterator> itr, - segmenter->Segment(kText)); + ICING_ASSERT_OK_AND_ASSIGN( + std::unique_ptr<LanguageSegmenter::Iterator> itr, + segmenter->Segment( + kText, LanguageSegmenter::AccessType::kBidirectionalIterator)); // String: "com:google:android is package" // ^ ^^ ^^ @@ -508,8 +512,10 @@ TEST_P(ReverseJniLanguageSegmenterTest, NewIteratorResetToStartUtf32) { auto segmenter, language_segmenter_factory::Create( GetSegmenterOptions(GetLocale(), jni_cache_.get()))); constexpr std::string_view kText = "How are you你好吗お元気ですか"; - ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<LanguageSegmenter::Iterator> itr, - segmenter->Segment(kText)); + ICING_ASSERT_OK_AND_ASSIGN( + std::unique_ptr<LanguageSegmenter::Iterator> itr, + segmenter->Segment( + kText, LanguageSegmenter::AccessType::kBidirectionalIterator)); // String: "How are you你好吗お元気ですか" // ^ ^^ ^^ ^ ^ ^ ^ ^ ^ @@ -524,8 +530,10 @@ TEST_P(ReverseJniLanguageSegmenterTest, IteratorOneAdvanceResetToStartUtf32) { auto segmenter, 
language_segmenter_factory::Create( GetSegmenterOptions(GetLocale(), jni_cache_.get()))); constexpr std::string_view kText = "How are you你好吗お元気ですか"; - ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<LanguageSegmenter::Iterator> itr, - segmenter->Segment(kText)); + ICING_ASSERT_OK_AND_ASSIGN( + std::unique_ptr<LanguageSegmenter::Iterator> itr, + segmenter->Segment( + kText, LanguageSegmenter::AccessType::kBidirectionalIterator)); // String: "How are you你好吗お元気ですか" // ^ ^^ ^^ ^ ^ ^ ^ ^ ^ @@ -542,8 +550,10 @@ TEST_P(ReverseJniLanguageSegmenterTest, auto segmenter, language_segmenter_factory::Create( GetSegmenterOptions(GetLocale(), jni_cache_.get()))); constexpr std::string_view kText = "How are you你好吗お元気ですか"; - ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<LanguageSegmenter::Iterator> itr, - segmenter->Segment(kText)); + ICING_ASSERT_OK_AND_ASSIGN( + std::unique_ptr<LanguageSegmenter::Iterator> itr, + segmenter->Segment( + kText, LanguageSegmenter::AccessType::kBidirectionalIterator)); // String: "How are you你好吗お元気ですか" // ^ ^^ ^^ ^ ^ ^ ^ ^ ^ @@ -562,8 +572,10 @@ TEST_P(ReverseJniLanguageSegmenterTest, IteratorDoneResetToStartUtf32) { auto segmenter, language_segmenter_factory::Create( GetSegmenterOptions(GetLocale(), jni_cache_.get()))); constexpr std::string_view kText = "How are you你好吗お元気ですか"; - ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<LanguageSegmenter::Iterator> itr, - segmenter->Segment(kText)); + ICING_ASSERT_OK_AND_ASSIGN( + std::unique_ptr<LanguageSegmenter::Iterator> itr, + segmenter->Segment( + kText, LanguageSegmenter::AccessType::kBidirectionalIterator)); // String: "How are you你好吗お元気ですか" // ^ ^^ ^^ ^ ^ ^ ^ ^ ^ @@ -581,8 +593,10 @@ TEST_P(ReverseJniLanguageSegmenterTest, ResetToTermAfterUtf32WordConnector) { auto segmenter, language_segmenter_factory::Create( GetSegmenterOptions(GetLocale(), jni_cache_.get()))); constexpr std::string_view kText = "package com:google:android name"; - ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<LanguageSegmenter::Iterator> itr, - 
segmenter->Segment(kText)); + ICING_ASSERT_OK_AND_ASSIGN( + std::unique_ptr<LanguageSegmenter::Iterator> itr, + segmenter->Segment( + kText, LanguageSegmenter::AccessType::kBidirectionalIterator)); // String: "package com:google:android name" // ^ ^^ ^^ @@ -604,8 +618,10 @@ TEST_P(ReverseJniLanguageSegmenterTest, ResetToTermAfterUtf32OutOfBounds) { auto segmenter, language_segmenter_factory::Create( GetSegmenterOptions(GetLocale(), jni_cache_.get()))); constexpr std::string_view kText = "How are you你好吗お元気ですか"; - ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<LanguageSegmenter::Iterator> itr, - segmenter->Segment(kText)); + ICING_ASSERT_OK_AND_ASSIGN( + std::unique_ptr<LanguageSegmenter::Iterator> itr, + segmenter->Segment( + kText, LanguageSegmenter::AccessType::kBidirectionalIterator)); // String: "How are you你好吗お元気ですか" // ^ ^^ ^^ ^ ^ ^ ^ ^ ^ @@ -635,13 +651,15 @@ TEST_P(ReverseJniLanguageSegmenterTest, constexpr std::string_view kText = "How are𡔖 you你好吗お元気ですか"; ICING_ASSERT_OK_AND_ASSIGN( std::unique_ptr<LanguageSegmenter::Iterator> advance_itr, - segmenter->Segment(kText)); + segmenter->Segment(kText, + LanguageSegmenter::AccessType::kForwardIterator)); std::vector<std::string_view> advance_terms = GetAllTermsAdvance(advance_itr.get()); ICING_ASSERT_OK_AND_ASSIGN( std::unique_ptr<LanguageSegmenter::Iterator> reset_to_term_itr, - segmenter->Segment(kText)); + segmenter->Segment( + kText, LanguageSegmenter::AccessType::kBidirectionalIterator)); std::vector<std::string_view> reset_terms = GetAllTermsResetAfterUtf32(reset_to_term_itr.get()); @@ -657,13 +675,15 @@ TEST_P(ReverseJniLanguageSegmenterTest, constexpr std::string_view kThai = "ฉันเดินไปทำงานทุกวัน"; ICING_ASSERT_OK_AND_ASSIGN( std::unique_ptr<LanguageSegmenter::Iterator> advance_itr, - segmenter->Segment(kThai)); + segmenter->Segment(kThai, + LanguageSegmenter::AccessType::kForwardIterator)); std::vector<std::string_view> advance_terms = GetAllTermsAdvance(advance_itr.get()); ICING_ASSERT_OK_AND_ASSIGN( 
std::unique_ptr<LanguageSegmenter::Iterator> reset_to_term_itr, - segmenter->Segment(kThai)); + segmenter->Segment( + kThai, LanguageSegmenter::AccessType::kBidirectionalIterator)); std::vector<std::string_view> reset_terms = GetAllTermsResetAfterUtf32(reset_to_term_itr.get()); @@ -679,13 +699,15 @@ TEST_P(ReverseJniLanguageSegmenterTest, constexpr std::string_view kKorean = "나는 매일 출근합니다."; ICING_ASSERT_OK_AND_ASSIGN( std::unique_ptr<LanguageSegmenter::Iterator> advance_itr, - segmenter->Segment(kKorean)); + segmenter->Segment(kKorean, + LanguageSegmenter::AccessType::kForwardIterator)); std::vector<std::string_view> advance_terms = GetAllTermsAdvance(advance_itr.get()); ICING_ASSERT_OK_AND_ASSIGN( std::unique_ptr<LanguageSegmenter::Iterator> reset_to_term_itr, - segmenter->Segment(kKorean)); + segmenter->Segment( + kKorean, LanguageSegmenter::AccessType::kBidirectionalIterator)); std::vector<std::string_view> reset_terms = GetAllTermsResetAfterUtf32(reset_to_term_itr.get()); @@ -705,13 +727,15 @@ TEST_P(ReverseJniLanguageSegmenterTest, constexpr std::string_view kText = "How are𡔖 you你好吗お元気ですか"; ICING_ASSERT_OK_AND_ASSIGN( std::unique_ptr<LanguageSegmenter::Iterator> advance_itr, - segmenter->Segment(kText)); + segmenter->Segment(kText, + LanguageSegmenter::AccessType::kForwardIterator)); std::vector<std::string_view> advance_terms = GetAllTermsAdvance(advance_itr.get()); ICING_ASSERT_OK_AND_ASSIGN( std::unique_ptr<LanguageSegmenter::Iterator> advance_and_reset_itr, - segmenter->Segment(kText)); + segmenter->Segment( + kText, LanguageSegmenter::AccessType::kBidirectionalIterator)); std::vector<std::string_view> advance_and_reset_terms = GetAllTermsAdvanceAndResetAfterUtf32(advance_and_reset_itr.get()); @@ -728,13 +752,15 @@ TEST_P(ReverseJniLanguageSegmenterTest, constexpr std::string_view kThai = "ฉันเดินไปทำงานทุกวัน"; ICING_ASSERT_OK_AND_ASSIGN( std::unique_ptr<LanguageSegmenter::Iterator> advance_itr, - segmenter->Segment(kThai)); + segmenter->Segment(kThai, + 
LanguageSegmenter::AccessType::kForwardIterator)); std::vector<std::string_view> advance_terms = GetAllTermsAdvance(advance_itr.get()); ICING_ASSERT_OK_AND_ASSIGN( std::unique_ptr<LanguageSegmenter::Iterator> advance_and_reset_itr, - segmenter->Segment(kThai)); + segmenter->Segment( + kThai, LanguageSegmenter::AccessType::kBidirectionalIterator)); std::vector<std::string_view> advance_and_reset_terms = GetAllTermsAdvanceAndResetAfterUtf32(advance_and_reset_itr.get()); @@ -751,13 +777,15 @@ TEST_P(ReverseJniLanguageSegmenterTest, constexpr std::string_view kKorean = "나는 매일 출근합니다."; ICING_ASSERT_OK_AND_ASSIGN( std::unique_ptr<LanguageSegmenter::Iterator> advance_itr, - segmenter->Segment(kKorean)); + segmenter->Segment(kKorean, + LanguageSegmenter::AccessType::kForwardIterator)); std::vector<std::string_view> advance_terms = GetAllTermsAdvance(advance_itr.get()); ICING_ASSERT_OK_AND_ASSIGN( std::unique_ptr<LanguageSegmenter::Iterator> advance_and_reset_itr, - segmenter->Segment(kKorean)); + segmenter->Segment( + kKorean, LanguageSegmenter::AccessType::kBidirectionalIterator)); std::vector<std::string_view> advance_and_reset_terms = GetAllTermsAdvanceAndResetAfterUtf32(advance_and_reset_itr.get()); @@ -773,7 +801,9 @@ TEST_P(ReverseJniLanguageSegmenterTest, MixedLanguagesResetToTermAfterUtf32) { GetSegmenterOptions(GetLocale(), jni_cache_.get()))); ICING_ASSERT_OK_AND_ASSIGN( std::unique_ptr<LanguageSegmenter::Iterator> itr, - language_segmenter->Segment("How are you你好吗お元気ですか")); + language_segmenter->Segment( + "How are you你好吗お元気ですか", + LanguageSegmenter::AccessType::kBidirectionalIterator)); // String: "How are you你好吗お元気ですか" // ^ ^^ ^^ ^ ^ ^ ^ ^ ^ @@ -810,8 +840,11 @@ TEST_P(ReverseJniLanguageSegmenterTest, GetSegmenterOptions(GetLocale(), jni_cache_.get()))); // Multiple continuous whitespaces are treated as one. 
constexpr std::string_view kTextWithSpace = "Hello World"; - ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<LanguageSegmenter::Iterator> itr, - language_segmenter->Segment(kTextWithSpace)); + ICING_ASSERT_OK_AND_ASSIGN( + std::unique_ptr<LanguageSegmenter::Iterator> itr, + language_segmenter->Segment( + kTextWithSpace, + LanguageSegmenter::AccessType::kBidirectionalIterator)); // String: "Hello World" // ^ ^ ^ @@ -850,8 +883,10 @@ TEST_P(ReverseJniLanguageSegmenterTest, ChineseResetToTermAfterUtf32) { // CJKT (Chinese, Japanese, Khmer, Thai) are the 4 main languages that // don't have whitespaces as word delimiter. Chinese constexpr std::string_view kChinese = "我每天走路去上班。"; - ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<LanguageSegmenter::Iterator> itr, - language_segmenter->Segment(kChinese)); + ICING_ASSERT_OK_AND_ASSIGN( + std::unique_ptr<LanguageSegmenter::Iterator> itr, + language_segmenter->Segment( + kChinese, LanguageSegmenter::AccessType::kBidirectionalIterator)); // String: "我每天走路去上班。" // ^ ^ ^ ^^ ^ // UTF-8 idx: 0 3 9 15 18 24 @@ -877,8 +912,10 @@ TEST_P(ReverseJniLanguageSegmenterTest, JapaneseResetToTermAfterUtf32) { GetSegmenterOptions(GetLocale(), jni_cache_.get()))); // Japanese constexpr std::string_view kJapanese = "私は毎日仕事に歩いています。"; - ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<LanguageSegmenter::Iterator> itr, - language_segmenter->Segment(kJapanese)); + ICING_ASSERT_OK_AND_ASSIGN( + std::unique_ptr<LanguageSegmenter::Iterator> itr, + language_segmenter->Segment( + kJapanese, LanguageSegmenter::AccessType::kBidirectionalIterator)); // String: "私は毎日仕事に歩いています。" // ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ // UTF-8 idx: 0 3 6 12 18212427 33 39 @@ -903,8 +940,10 @@ TEST_P(ReverseJniLanguageSegmenterTest, KhmerResetToTermAfterUtf32) { language_segmenter_factory::Create( GetSegmenterOptions(GetLocale(), jni_cache_.get()))); constexpr std::string_view kKhmer = "ញុំដើរទៅធ្វើការរាល់ថ្ងៃ។"; - ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<LanguageSegmenter::Iterator> itr, - 
language_segmenter->Segment(kKhmer)); + ICING_ASSERT_OK_AND_ASSIGN( + std::unique_ptr<LanguageSegmenter::Iterator> itr, + language_segmenter->Segment( + kKhmer, LanguageSegmenter::AccessType::kBidirectionalIterator)); // String: "ញុំដើរទៅធ្វើការរាល់ថ្ងៃ។" // ^ ^ ^ ^ ^ // UTF-8 idx: 0 9 24 45 69 @@ -930,8 +969,10 @@ TEST_P(ReverseJniLanguageSegmenterTest, ThaiResetToTermAfterUtf32) { GetSegmenterOptions(GetLocale(), jni_cache_.get()))); // Thai constexpr std::string_view kThai = "ฉันเดินไปทำงานทุกวัน"; - ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<LanguageSegmenter::Iterator> itr, - language_segmenter->Segment(kThai)); + ICING_ASSERT_OK_AND_ASSIGN( + std::unique_ptr<LanguageSegmenter::Iterator> itr, + language_segmenter->Segment( + kThai, LanguageSegmenter::AccessType::kBidirectionalIterator)); // String: "ฉันเดินไปทำงานทุกวัน" // ^ ^ ^ ^ ^ ^ // UTF-8 idx: 0 9 21 27 42 51 @@ -955,8 +996,10 @@ TEST_P(ReverseJniLanguageSegmenterTest, ResetToTermBeforeWordConnectorUtf32) { auto segmenter, language_segmenter_factory::Create( GetSegmenterOptions(GetLocale(), jni_cache_.get()))); constexpr std::string_view kText = "package name com:google:android!"; - ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<LanguageSegmenter::Iterator> itr, - segmenter->Segment(kText)); + ICING_ASSERT_OK_AND_ASSIGN( + std::unique_ptr<LanguageSegmenter::Iterator> itr, + segmenter->Segment( + kText, LanguageSegmenter::AccessType::kBidirectionalIterator)); // String: "package name com:google:android!" 
// ^ ^^ ^^ ^ @@ -978,8 +1021,10 @@ TEST_P(ReverseJniLanguageSegmenterTest, ResetToTermBeforeOutOfBoundsUtf32) { auto segmenter, language_segmenter_factory::Create( GetSegmenterOptions(GetLocale(), jni_cache_.get()))); constexpr std::string_view kText = "How are you你好吗お元気ですか"; - ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<LanguageSegmenter::Iterator> itr, - segmenter->Segment(kText)); + ICING_ASSERT_OK_AND_ASSIGN( + std::unique_ptr<LanguageSegmenter::Iterator> itr, + segmenter->Segment( + kText, LanguageSegmenter::AccessType::kBidirectionalIterator)); // String: "How are you你好吗お元気ですか" // ^ ^^ ^^ ^ ^ ^ ^ ^ ^ @@ -1009,13 +1054,15 @@ TEST_P(ReverseJniLanguageSegmenterTest, constexpr std::string_view kText = "How are𡔖 you你好吗お元気ですか"; ICING_ASSERT_OK_AND_ASSIGN( std::unique_ptr<LanguageSegmenter::Iterator> advance_itr, - segmenter->Segment(kText)); + segmenter->Segment(kText, + LanguageSegmenter::AccessType::kForwardIterator)); std::vector<std::string_view> advance_terms = GetAllTermsAdvance(advance_itr.get()); ICING_ASSERT_OK_AND_ASSIGN( std::unique_ptr<LanguageSegmenter::Iterator> reset_to_term_itr, - segmenter->Segment(kText)); + segmenter->Segment( + kText, LanguageSegmenter::AccessType::kBidirectionalIterator)); std::vector<std::string_view> reset_terms = GetAllTermsResetBeforeUtf32(reset_to_term_itr.get()); std::reverse(reset_terms.begin(), reset_terms.end()); @@ -1033,13 +1080,15 @@ TEST_P(ReverseJniLanguageSegmenterTest, constexpr std::string_view kThai = "ฉันเดินไปทำงานทุกวัน"; ICING_ASSERT_OK_AND_ASSIGN( std::unique_ptr<LanguageSegmenter::Iterator> advance_itr, - segmenter->Segment(kThai)); + segmenter->Segment(kThai, + LanguageSegmenter::AccessType::kForwardIterator)); std::vector<std::string_view> advance_terms = GetAllTermsAdvance(advance_itr.get()); ICING_ASSERT_OK_AND_ASSIGN( std::unique_ptr<LanguageSegmenter::Iterator> reset_to_term_itr, - segmenter->Segment(kThai)); + segmenter->Segment( + kThai, LanguageSegmenter::AccessType::kBidirectionalIterator)); 
std::vector<std::string_view> reset_terms = GetAllTermsResetBeforeUtf32(reset_to_term_itr.get()); std::reverse(reset_terms.begin(), reset_terms.end()); @@ -1056,13 +1105,15 @@ TEST_P(ReverseJniLanguageSegmenterTest, constexpr std::string_view kKorean = "나는 매일 출근합니다."; ICING_ASSERT_OK_AND_ASSIGN( std::unique_ptr<LanguageSegmenter::Iterator> advance_itr, - segmenter->Segment(kKorean)); + segmenter->Segment(kKorean, + LanguageSegmenter::AccessType::kForwardIterator)); std::vector<std::string_view> advance_terms = GetAllTermsAdvance(advance_itr.get()); ICING_ASSERT_OK_AND_ASSIGN( std::unique_ptr<LanguageSegmenter::Iterator> reset_to_term_itr, - segmenter->Segment(kKorean)); + segmenter->Segment( + kKorean, LanguageSegmenter::AccessType::kBidirectionalIterator)); std::vector<std::string_view> reset_terms = GetAllTermsResetBeforeUtf32(reset_to_term_itr.get()); std::reverse(reset_terms.begin(), reset_terms.end()); @@ -1078,7 +1129,9 @@ TEST_P(ReverseJniLanguageSegmenterTest, MixedLanguagesResetToTermBeforeUtf32) { GetSegmenterOptions(GetLocale(), jni_cache_.get()))); ICING_ASSERT_OK_AND_ASSIGN( std::unique_ptr<LanguageSegmenter::Iterator> itr, - language_segmenter->Segment("How are you你好吗お元気ですか")); + language_segmenter->Segment( + "How are you你好吗お元気ですか", + LanguageSegmenter::AccessType::kBidirectionalIterator)); // String: "How are you你好吗お元気ですか" // ^ ^^ ^^ ^ ^ ^ ^ ^ ^ @@ -1116,8 +1169,11 @@ TEST_P(ReverseJniLanguageSegmenterTest, GetSegmenterOptions(GetLocale(), jni_cache_.get()))); // Multiple continuous whitespaces are treated as one. 
constexpr std::string_view kTextWithSpace = "Hello World"; - ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<LanguageSegmenter::Iterator> itr, - language_segmenter->Segment(kTextWithSpace)); + ICING_ASSERT_OK_AND_ASSIGN( + std::unique_ptr<LanguageSegmenter::Iterator> itr, + language_segmenter->Segment( + kTextWithSpace, + LanguageSegmenter::AccessType::kBidirectionalIterator)); // String: "Hello World" // ^ ^ ^ @@ -1155,8 +1211,10 @@ TEST_P(ReverseJniLanguageSegmenterTest, ChineseResetToTermBeforeUtf32) { // CJKT (Chinese, Japanese, Khmer, Thai) are the 4 main languages that // don't have whitespaces as word delimiter. Chinese constexpr std::string_view kChinese = "我每天走路去上班。"; - ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<LanguageSegmenter::Iterator> itr, - language_segmenter->Segment(kChinese)); + ICING_ASSERT_OK_AND_ASSIGN( + std::unique_ptr<LanguageSegmenter::Iterator> itr, + language_segmenter->Segment( + kChinese, LanguageSegmenter::AccessType::kBidirectionalIterator)); // String: "我每天走路去上班。" // ^ ^ ^ ^^ // UTF-8 idx: 0 3 9 15 18 @@ -1179,8 +1237,10 @@ TEST_P(ReverseJniLanguageSegmenterTest, JapaneseResetToTermBeforeUtf32) { GetSegmenterOptions(GetLocale(), jni_cache_.get()))); // Japanese constexpr std::string_view kJapanese = "私は毎日仕事に歩いています。"; - ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<LanguageSegmenter::Iterator> itr, - language_segmenter->Segment(kJapanese)); + ICING_ASSERT_OK_AND_ASSIGN( + std::unique_ptr<LanguageSegmenter::Iterator> itr, + language_segmenter->Segment( + kJapanese, LanguageSegmenter::AccessType::kBidirectionalIterator)); // String: "私は毎日仕事に歩いています。" // ^ ^ ^ ^ ^ ^ ^ ^ ^ // UTF-8 idx: 0 3 6 12 18212427 33 @@ -1202,8 +1262,10 @@ TEST_P(ReverseJniLanguageSegmenterTest, KhmerResetToTermBeforeUtf32) { language_segmenter_factory::Create( GetSegmenterOptions(GetLocale(), jni_cache_.get()))); constexpr std::string_view kKhmer = "ញុំដើរទៅធ្វើការរាល់ថ្ងៃ។"; - ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<LanguageSegmenter::Iterator> itr, - 
language_segmenter->Segment(kKhmer)); + ICING_ASSERT_OK_AND_ASSIGN( + std::unique_ptr<LanguageSegmenter::Iterator> itr, + language_segmenter->Segment( + kKhmer, LanguageSegmenter::AccessType::kBidirectionalIterator)); // String: "ញុំដើរទៅធ្វើការរាល់ថ្ងៃ។" // ^ ^ ^ ^ // UTF-8 idx: 0 9 24 45 @@ -1226,8 +1288,10 @@ TEST_P(ReverseJniLanguageSegmenterTest, ThaiResetToTermBeforeUtf32) { GetSegmenterOptions(GetLocale(), jni_cache_.get()))); // Thai constexpr std::string_view kThai = "ฉันเดินไปทำงานทุกวัน"; - ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<LanguageSegmenter::Iterator> itr, - language_segmenter->Segment(kThai)); + ICING_ASSERT_OK_AND_ASSIGN( + std::unique_ptr<LanguageSegmenter::Iterator> itr, + language_segmenter->Segment( + kThai, LanguageSegmenter::AccessType::kBidirectionalIterator)); // String: "ฉันเดินไปทำงานทุกวัน" // ^ ^ ^ ^ ^ ^ // UTF-8 idx: 0 9 21 27 42 51 diff --git a/icing/tokenization/rfc822-tokenizer.cc b/icing/tokenization/rfc822-tokenizer.cc index 13c58c5..35b82ca 100644 --- a/icing/tokenization/rfc822-tokenizer.cc +++ b/icing/tokenization/rfc822-tokenizer.cc @@ -778,14 +778,15 @@ class Rfc822TokenIterator : public Tokenizer::Iterator { }; libtextclassifier3::StatusOr<std::unique_ptr<Tokenizer::Iterator>> -Rfc822Tokenizer::Tokenize(std::string_view text) const { +Rfc822Tokenizer::Tokenize(std::string_view text, + LanguageSegmenter::AccessType) const { return std::make_unique<Rfc822TokenIterator>(text); } libtextclassifier3::StatusOr<std::vector<Token>> Rfc822Tokenizer::TokenizeAll( std::string_view text) const { - ICING_ASSIGN_OR_RETURN(std::unique_ptr<Tokenizer::Iterator> iterator, - Tokenize(text)); + std::unique_ptr<Tokenizer::Iterator> iterator = + std::make_unique<Rfc822TokenIterator>(text); std::vector<Token> tokens; while (iterator->Advance()) { std::vector<Token> batch_tokens = iterator->GetTokens(); diff --git a/icing/tokenization/rfc822-tokenizer.h b/icing/tokenization/rfc822-tokenizer.h index 09e4624..094f1cf 100644 --- 
a/icing/tokenization/rfc822-tokenizer.h +++ b/icing/tokenization/rfc822-tokenizer.h @@ -17,6 +17,7 @@ #include <vector> +#include "icing/tokenization/language-segmenter.h" #include "icing/tokenization/tokenizer.h" namespace icing { @@ -25,7 +26,7 @@ namespace lib { class Rfc822Tokenizer : public Tokenizer { public: libtextclassifier3::StatusOr<std::unique_ptr<Tokenizer::Iterator>> Tokenize( - std::string_view text) const override; + std::string_view text, LanguageSegmenter::AccessType) const override; libtextclassifier3::StatusOr<std::vector<Token>> TokenizeAll( std::string_view text) const override; diff --git a/icing/tokenization/rfc822-tokenizer_test.cc b/icing/tokenization/rfc822-tokenizer_test.cc index f114943..6b95a07 100644 --- a/icing/tokenization/rfc822-tokenizer_test.cc +++ b/icing/tokenization/rfc822-tokenizer_test.cc @@ -23,6 +23,7 @@ #include "icing/testing/common-matchers.h" #include "icing/testing/jni-test-helpers.h" #include "icing/tokenization/language-segmenter-factory.h" +#include "icing/tokenization/language-segmenter.h" #include "unicode/uloc.h" namespace icing { @@ -48,7 +49,10 @@ class Rfc822TokenizerTest : public testing::Test { TEST_F(Rfc822TokenizerTest, StartingState) { Rfc822Tokenizer rfc822_tokenizer = Rfc822Tokenizer(); std::string text = "a@g.c"; - auto token_iterator = rfc822_tokenizer.Tokenize(text).ValueOrDie(); + auto token_iterator = + rfc822_tokenizer + .Tokenize(text, LanguageSegmenter::AccessType::kForwardIterator) + .ValueOrDie(); ASSERT_THAT(token_iterator->GetTokens(), IsEmpty()); ASSERT_TRUE(token_iterator->Advance()); @@ -979,7 +983,10 @@ TEST_F(Rfc822TokenizerTest, Commas) { TEST_F(Rfc822TokenizerTest, ResetToTokenStartingAfter) { Rfc822Tokenizer rfc822_tokenizer = Rfc822Tokenizer(); std::string text = "a@g.c,b@g.c"; - auto token_iterator = rfc822_tokenizer.Tokenize(text).ValueOrDie(); + auto token_iterator = + rfc822_tokenizer + .Tokenize(text, LanguageSegmenter::AccessType::kBidirectionalIterator) + .ValueOrDie(); 
ASSERT_TRUE(token_iterator->Advance()); ASSERT_TRUE(token_iterator->Advance()); @@ -995,7 +1002,10 @@ TEST_F(Rfc822TokenizerTest, ResetToTokenStartingAfter) { TEST_F(Rfc822TokenizerTest, ResetToTokenEndingBefore) { Rfc822Tokenizer rfc822_tokenizer = Rfc822Tokenizer(); std::string text = "a@g.c,b@g.c"; - auto token_iterator = rfc822_tokenizer.Tokenize(text).ValueOrDie(); + auto token_iterator = + rfc822_tokenizer + .Tokenize(text, LanguageSegmenter::AccessType::kBidirectionalIterator) + .ValueOrDie(); token_iterator->Advance(); ASSERT_TRUE(token_iterator->ResetToTokenEndingBefore(5)); diff --git a/icing/tokenization/tokenizer.h b/icing/tokenization/tokenizer.h index fb7613f..3336266 100644 --- a/icing/tokenization/tokenizer.h +++ b/icing/tokenization/tokenizer.h @@ -22,6 +22,7 @@ #include "icing/text_classifier/lib3/utils/base/statusor.h" #include "icing/absl_ports/canonical_errors.h" +#include "icing/tokenization/language-segmenter.h" #include "icing/tokenization/token.h" #include "icing/util/character-iterator.h" @@ -33,8 +34,10 @@ namespace lib { // iterator or a list of tokens. Example usage: // // std::unique_ptr<Tokenizer> tokenizer = GetTokenizer(); -// ICING_ASSIGN_OR_RETURN(std::unique_ptr<Tokenizer::Iterator> iter, -// tokenizer->Tokenize(text)); +// ICING_ASSIGN_OR_RETURN( +// std::unique_ptr<Tokenizer::Iterator> iter, +// tokenizer->Tokenize(text, +// LanguageSegmenter::AccessType::kForwardIterator)); // ICING_ASSIGN_OR_RETURN(std::vector<Token> tokens, // tokenizer->TokenizeAll(text)); class Tokenizer { @@ -76,7 +79,10 @@ class Tokenizer { // offset. Returns false if there are no valid tokens starting after // offset. // Ex. - // auto iterator = tokenizer.Tokenize("foo bar baz").ValueOrDie(); + // auto iterator = + // tokenizer.Tokenize("foo bar baz", + // LanguageSegmenter::AccessType::kForwardIterator) + // .ValueOrDie(); // iterator.ResetToTokenStartingAfter(4); // // The first full token starting after position 4 (the 'b' in "bar") is // // "baz". 
@@ -89,8 +95,10 @@ class Tokenizer { // offset. Returns false if there are no valid tokens ending // before offset. // Ex. - // auto iterator = tokenizer.Tokenize("foo bar baz").ValueOrDie(); - // iterator.ResetToTokenEndingBefore(4); + // auto iterator = + // tokenizer.Tokenize("foo bar baz", + // LanguageSegmenter::AccessType::kForwardIterator) + // .ValueOrDie(); // iterator.ResetToTokenEndingBefore(4); // // The first full token ending before position 4 (the 'b' in "bar") is // // "foo". // PrintToken(iterator.GetToken()); // prints "foo" @@ -111,7 +119,8 @@ class Tokenizer { // types. // INTERNAL_ERROR if any other errors occur virtual libtextclassifier3::StatusOr<std::unique_ptr<Iterator>> Tokenize( - std::string_view text) const = 0; + std::string_view text, + LanguageSegmenter::AccessType access_type) const = 0; // Tokenizes and returns all tokens in the input text. The input text should // outlive the returned vector. diff --git a/icing/tokenization/verbatim-tokenizer.cc b/icing/tokenization/verbatim-tokenizer.cc index 9ca611d..cf6d5e3 100644 --- a/icing/tokenization/verbatim-tokenizer.cc +++ b/icing/tokenization/verbatim-tokenizer.cc @@ -124,14 +124,15 @@ class VerbatimTokenIterator : public Tokenizer::Iterator { }; libtextclassifier3::StatusOr<std::unique_ptr<Tokenizer::Iterator>> -VerbatimTokenizer::Tokenize(std::string_view text) const { +VerbatimTokenizer::Tokenize(std::string_view text, + LanguageSegmenter::AccessType) const { return std::make_unique<VerbatimTokenIterator>(text); } libtextclassifier3::StatusOr<std::vector<Token>> VerbatimTokenizer::TokenizeAll( std::string_view text) const { - ICING_ASSIGN_OR_RETURN(std::unique_ptr<Tokenizer::Iterator> iterator, - Tokenize(text)); + std::unique_ptr<Tokenizer::Iterator> iterator = + std::make_unique<VerbatimTokenIterator>(text); std::vector<Token> tokens; while (iterator->Advance()) { std::vector<Token> batch = iterator->GetTokens(); diff --git a/icing/tokenization/verbatim-tokenizer.h 
b/icing/tokenization/verbatim-tokenizer.h index 8404cf1..c3746af 100644 --- a/icing/tokenization/verbatim-tokenizer.h +++ b/icing/tokenization/verbatim-tokenizer.h @@ -20,6 +20,7 @@ #include <vector> #include "icing/text_classifier/lib3/utils/base/statusor.h" +#include "icing/tokenization/language-segmenter.h" #include "icing/tokenization/tokenizer.h" namespace icing { @@ -29,7 +30,7 @@ namespace lib { class VerbatimTokenizer : public Tokenizer { public: libtextclassifier3::StatusOr<std::unique_ptr<Tokenizer::Iterator>> Tokenize( - std::string_view text) const override; + std::string_view text, LanguageSegmenter::AccessType) const override; libtextclassifier3::StatusOr<std::vector<Token>> TokenizeAll( std::string_view text) const override; diff --git a/icing/tokenization/verbatim-tokenizer_test.cc b/icing/tokenization/verbatim-tokenizer_test.cc index bae69ff..5aeb343 100644 --- a/icing/tokenization/verbatim-tokenizer_test.cc +++ b/icing/tokenization/verbatim-tokenizer_test.cc @@ -22,6 +22,7 @@ #include "icing/testing/jni-test-helpers.h" #include "icing/testing/test-data.h" #include "icing/tokenization/language-segmenter-factory.h" +#include "icing/tokenization/language-segmenter.h" #include "icing/tokenization/token.h" #include "icing/tokenization/tokenizer-factory.h" #include "icing/util/character-iterator.h" @@ -94,7 +95,10 @@ TEST_F(VerbatimTokenizerTest, NoTokensBeforeAdvancing) { language_segmenter_.get())); constexpr std::string_view kText = "Hello, world!"; - auto token_iterator = verbatim_tokenizer->Tokenize(kText).ValueOrDie(); + auto token_iterator = + verbatim_tokenizer + ->Tokenize(kText, LanguageSegmenter::AccessType::kForwardIterator) + .ValueOrDie(); // We should get no tokens if we get the token before advancing. 
EXPECT_THAT(token_iterator->GetTokens(), IsEmpty()); @@ -107,7 +111,10 @@ TEST_F(VerbatimTokenizerTest, ResetToTokenEndingBefore) { language_segmenter_.get())); constexpr std::string_view kText = "Hello, world!"; - auto token_iterator = verbatim_tokenizer->Tokenize(kText).ValueOrDie(); + auto token_iterator = + verbatim_tokenizer + ->Tokenize(kText, LanguageSegmenter::AccessType::kBidirectionalIterator) + .ValueOrDie(); // Reset to beginning of verbatim of token. We provide an offset of 13 as it // is larger than the final index (12) of the verbatim token. @@ -134,7 +141,10 @@ TEST_F(VerbatimTokenizerTest, ResetToTokenStartingAfter) { language_segmenter_.get())); constexpr std::string_view kText = "Hello, world!"; - auto token_iterator = verbatim_tokenizer->Tokenize(kText).ValueOrDie(); + auto token_iterator = + verbatim_tokenizer + ->Tokenize(kText, LanguageSegmenter::AccessType::kBidirectionalIterator) + .ValueOrDie(); // Get token without resetting EXPECT_TRUE(token_iterator->Advance()); @@ -159,7 +169,10 @@ TEST_F(VerbatimTokenizerTest, ResetToStart) { language_segmenter_.get())); constexpr std::string_view kText = "Hello, world!"; - auto token_iterator = verbatim_tokenizer->Tokenize(kText).ValueOrDie(); + auto token_iterator = + verbatim_tokenizer + ->Tokenize(kText, LanguageSegmenter::AccessType::kBidirectionalIterator) + .ValueOrDie(); // Get token without resetting EXPECT_TRUE(token_iterator->Advance()); @@ -179,7 +192,10 @@ TEST_F(VerbatimTokenizerTest, CalculateTokenStart) { language_segmenter_.get())); constexpr std::string_view kText = "Hello, world!"; - auto token_iterator = verbatim_tokenizer->Tokenize(kText).ValueOrDie(); + auto token_iterator = + verbatim_tokenizer + ->Tokenize(kText, LanguageSegmenter::AccessType::kForwardIterator) + .ValueOrDie(); ICING_ASSERT_OK_AND_ASSIGN(CharacterIterator start_character_iterator, token_iterator->CalculateTokenStart()); @@ -195,7 +211,10 @@ TEST_F(VerbatimTokenizerTest, CalculateTokenEnd) { 
language_segmenter_.get())); constexpr std::string_view kText = "Hello, world!"; - auto token_iterator = verbatim_tokenizer->Tokenize(kText).ValueOrDie(); + auto token_iterator = + verbatim_tokenizer + ->Tokenize(kText, LanguageSegmenter::AccessType::kForwardIterator) + .ValueOrDie(); ICING_ASSERT_OK_AND_ASSIGN(CharacterIterator end_character_iterator, token_iterator->CalculateTokenEndExclusive()); diff --git a/icing/util/tokenized-document.cc b/icing/util/tokenized-document.cc index facb267..1c11c3c 100644 --- a/icing/util/tokenized-document.cc +++ b/icing/util/tokenized-document.cc @@ -44,8 +44,10 @@ libtextclassifier3::StatusOr<std::vector<TokenizedSection>> Tokenize( section.metadata.tokenizer, language_segmenter)); std::vector<std::string_view> token_sequence; for (std::string_view subcontent : section.content) { - ICING_ASSIGN_OR_RETURN(std::unique_ptr<Tokenizer::Iterator> itr, - tokenizer->Tokenize(subcontent)); + ICING_ASSIGN_OR_RETURN( + std::unique_ptr<Tokenizer::Iterator> itr, + tokenizer->Tokenize(subcontent, + LanguageSegmenter::AccessType::kForwardIterator)); while (itr->Advance()) { std::vector<Token> batch_tokens = itr->GetTokens(); for (const Token& token : batch_tokens) { diff --git a/synced_AOSP_CL_number.txt b/synced_AOSP_CL_number.txt index 5838a7b..232fbe0 100644 --- a/synced_AOSP_CL_number.txt +++ b/synced_AOSP_CL_number.txt @@ -1 +1 @@ -set(synced_AOSP_CL_number=513864120) +set(synced_AOSP_CL_number=514555603) |