diff options
author | Tim Barron <tjbarron@google.com> | 2022-05-12 19:44:56 +0000 |
---|---|---|
committer | Automerger Merge Worker <android-build-automerger-merge-worker@system.gserviceaccount.com> | 2022-05-12 19:44:56 +0000 |
commit | 8c72386649cb1f0950a4cb24cbaca2ccf16abd91 (patch) | |
tree | 7642107143cc381a8442c42b110d685ee0fa1b6d | |
parent | fcdd04433975fc015fe971432a5e9f77240ce2f5 (diff) | |
parent | 34f7ab9226072548dae9f0eaa5ec65010657ee23 (diff) | |
download | icing-8c72386649cb1f0950a4cb24cbaca2ccf16abd91.tar.gz |
Merge "Stop dropping nonascii/nonalnum segments." into tm-dev am: 34f7ab9226
Original change: https://googleplex-android-review.googlesource.com/c/platform/external/icing/+/18328740
Change-Id: Iaf68a7b903f52f049a53bfffb4128952b6d57c7b
Signed-off-by: Automerger Merge Worker <android-build-automerger-merge-worker@system.gserviceaccount.com>
-rw-r--r-- | icing/icing-search-engine_test.cc | 135 | ||||
-rw-r--r-- | icing/tokenization/combined-tokenizer_test.cc | 232 | ||||
-rw-r--r-- | icing/tokenization/icu/icu-language-segmenter.cc | 59 | ||||
-rw-r--r-- | icing/tokenization/icu/icu-language-segmenter_test.cc | 48 | ||||
-rw-r--r-- | icing/tokenization/raw-query-tokenizer.cc | 84 | ||||
-rw-r--r-- | icing/tokenization/raw-query-tokenizer_test.cc | 89 | ||||
-rw-r--r-- | icing/tokenization/reverse_jni/reverse-jni-language-segmenter.cc | 80 | ||||
-rw-r--r-- | icing/tokenization/reverse_jni/reverse-jni-language-segmenter_test.cc | 49 | ||||
-rw-r--r-- | icing/tokenization/tokenizer.h | 9 |
9 files changed, 586 insertions, 199 deletions
diff --git a/icing/icing-search-engine_test.cc b/icing/icing-search-engine_test.cc index 7ed8885..5244f4c 100644 --- a/icing/icing-search-engine_test.cc +++ b/icing/icing-search-engine_test.cc @@ -8082,6 +8082,141 @@ TEST_F(IcingSearchEngineTest, CJKSnippetTest) { EXPECT_THAT(match_proto.exact_match_utf16_length(), Eq(2)); } +TEST_F(IcingSearchEngineTest, InvalidToEmptyQueryTest) { + IcingSearchEngine icing(GetDefaultIcingOptions(), GetTestJniCache()); + ASSERT_THAT(icing.Initialize().status(), ProtoIsOk()); + ASSERT_THAT(icing.SetSchema(CreateMessageSchema()).status(), ProtoIsOk()); + + // String: "Luca Brasi sleeps with the πππ." + // ^ ^ ^ ^ ^ ^ ^ ^ ^ + // UTF8 idx: 0 5 11 18 23 27 3135 39 + // UTF16 idx: 0 5 11 18 23 27 2931 33 + // Breaks into segments: "Luca", "Brasi", "sleeps", "with", "the", "π", "π" + // and "π". + constexpr std::string_view kSicilianMessage = + "Luca Brasi sleeps with the πππ."; + DocumentProto document = DocumentBuilder() + .SetKey("namespace", "uri1") + .SetSchema("Message") + .AddStringProperty("body", kSicilianMessage) + .Build(); + ASSERT_THAT(icing.Put(document).status(), ProtoIsOk()); + DocumentProto document_two = + DocumentBuilder() + .SetKey("namespace", "uri2") + .SetSchema("Message") + .AddStringProperty("body", "Some other content.") + .Build(); + ASSERT_THAT(icing.Put(document_two).status(), ProtoIsOk()); + + // Search and request snippet matching but no windowing. 
+ SearchSpecProto search_spec; + search_spec.set_query("?"); + search_spec.set_term_match_type(MATCH_PREFIX); + ScoringSpecProto scoring_spec; + ResultSpecProto result_spec; + + // Search and make sure that we got a single successful result + SearchResultProto search_results = + icing.Search(search_spec, scoring_spec, result_spec); + EXPECT_THAT(search_results.status(), ProtoIsOk()); + EXPECT_THAT(search_results.results(), SizeIs(2)); + + search_spec.set_query("γ"); + search_results = icing.Search(search_spec, scoring_spec, result_spec); + EXPECT_THAT(search_results.status(), ProtoIsOk()); + EXPECT_THAT(search_results.results(), SizeIs(2)); + + search_spec.set_query("-"); + search_results = icing.Search(search_spec, scoring_spec, result_spec); + EXPECT_THAT(search_results.status(), ProtoIsOk()); + EXPECT_THAT(search_results.results(), SizeIs(2)); + + search_spec.set_query(":"); + search_results = icing.Search(search_spec, scoring_spec, result_spec); + EXPECT_THAT(search_results.status(), ProtoIsOk()); + EXPECT_THAT(search_results.results(), SizeIs(2)); + + search_spec.set_query("OR"); + search_results = icing.Search(search_spec, scoring_spec, result_spec); + EXPECT_THAT(search_results.status(), ProtoIsOk()); + EXPECT_THAT(search_results.results(), SizeIs(2)); + + search_spec.set_query(" "); + search_results = icing.Search(search_spec, scoring_spec, result_spec); + EXPECT_THAT(search_results.status(), ProtoIsOk()); + EXPECT_THAT(search_results.results(), SizeIs(2)); +} + +TEST_F(IcingSearchEngineTest, EmojiSnippetTest) { + IcingSearchEngine icing(GetDefaultIcingOptions(), GetTestJniCache()); + ASSERT_THAT(icing.Initialize().status(), ProtoIsOk()); + ASSERT_THAT(icing.SetSchema(CreateMessageSchema()).status(), ProtoIsOk()); + + // String: "Luca Brasi sleeps with the πππ." + // ^ ^ ^ ^ ^ ^ ^ ^ ^ + // UTF8 idx: 0 5 11 18 23 27 3135 39 + // UTF16 idx: 0 5 11 18 23 27 2931 33 + // Breaks into segments: "Luca", "Brasi", "sleeps", "with", "the", "π", "π" + // and "π". 
+ constexpr std::string_view kSicilianMessage = + "Luca Brasi sleeps with the πππ."; + DocumentProto document = DocumentBuilder() + .SetKey("namespace", "uri1") + .SetSchema("Message") + .AddStringProperty("body", kSicilianMessage) + .Build(); + ASSERT_THAT(icing.Put(document).status(), ProtoIsOk()); + DocumentProto document_two = + DocumentBuilder() + .SetKey("namespace", "uri2") + .SetSchema("Message") + .AddStringProperty("body", "Some other content.") + .Build(); + ASSERT_THAT(icing.Put(document_two).status(), ProtoIsOk()); + + // Search and request snippet matching but no windowing. + SearchSpecProto search_spec; + search_spec.set_query("π"); + search_spec.set_term_match_type(MATCH_PREFIX); + + ResultSpecProto result_spec; + result_spec.mutable_snippet_spec()->set_num_to_snippet(1); + result_spec.mutable_snippet_spec()->set_num_matches_per_property(1); + + // Search and make sure that we got a single successful result + SearchResultProto search_results = icing.Search( + search_spec, ScoringSpecProto::default_instance(), result_spec); + ASSERT_THAT(search_results.status(), ProtoIsOk()); + ASSERT_THAT(search_results.results(), SizeIs(1)); + const SearchResultProto::ResultProto* result = &search_results.results(0); + EXPECT_THAT(result->document().uri(), Eq("uri1")); + + // Ensure that one and only one property was matched and it was "body" + ASSERT_THAT(result->snippet().entries(), SizeIs(1)); + const SnippetProto::EntryProto* entry = &result->snippet().entries(0); + EXPECT_THAT(entry->property_name(), Eq("body")); + + // Get the content for "subject" and see what the match is. 
+ std::string_view content = GetString(&result->document(), "body"); + ASSERT_THAT(content, Eq(kSicilianMessage)); + + // Ensure that there is one and only one match within "subject" + ASSERT_THAT(entry->snippet_matches(), SizeIs(1)); + const SnippetMatchProto& match_proto = entry->snippet_matches(0); + + EXPECT_THAT(match_proto.exact_match_byte_position(), Eq(27)); + EXPECT_THAT(match_proto.exact_match_byte_length(), Eq(4)); + std::string_view match = + content.substr(match_proto.exact_match_byte_position(), + match_proto.exact_match_byte_length()); + ASSERT_THAT(match, Eq("π")); + + // Ensure that the utf-16 values are also as expected + EXPECT_THAT(match_proto.exact_match_utf16_position(), Eq(27)); + EXPECT_THAT(match_proto.exact_match_utf16_length(), Eq(2)); +} + TEST_F(IcingSearchEngineTest, PutDocumentIndexFailureDeletion) { IcingSearchEngine icing(GetDefaultIcingOptions(), GetTestJniCache()); ASSERT_THAT(icing.Initialize().status(), ProtoIsOk()); diff --git a/icing/tokenization/combined-tokenizer_test.cc b/icing/tokenization/combined-tokenizer_test.cc new file mode 100644 index 0000000..0212e4f --- /dev/null +++ b/icing/tokenization/combined-tokenizer_test.cc @@ -0,0 +1,232 @@ +// Copyright (C) 2022 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include <string_view> +#include <vector> + +#include "testing/base/public/gmock.h" +#include "testing/base/public/gunit.h" +#include "third_party/icing/portable/platform.h" +#include "third_party/icing/proto/schema_proto_portable.pb.h" +#include "third_party/icing/testing/common-matchers.h" +#include "third_party/icing/testing/icu-data-file-helper.h" +#include "third_party/icing/testing/jni-test-helpers.h" +#include "third_party/icing/testing/test-data.h" +#include "third_party/icing/tokenization/language-segmenter-factory.h" +#include "third_party/icing/tokenization/language-segmenter.h" +#include "third_party/icing/tokenization/tokenizer-factory.h" +#include "third_party/icing/tokenization/tokenizer.h" +#include "third_party/icu/include/unicode/uloc.h" + +namespace icing { +namespace lib { + +namespace { + +using ::testing::ElementsAre; + +// This test exists to ensure that the different tokenizers treat different +// segments of text in the same manner. +class CombinedTokenizerTest : public ::testing::Test { + protected: + void SetUp() override { + if (!IsCfStringTokenization() && !IsReverseJniTokenization()) { + ICING_ASSERT_OK( + // File generated via icu_data_file rule in //third_party/icing/BUILD. 
+ icu_data_file_helper::SetUpICUDataFile( + GetTestFilePath("third_party/icing/icu.dat"))); + } + jni_cache_ = GetTestJniCache(); + + language_segmenter_factory::SegmenterOptions options(ULOC_US, + jni_cache_.get()); + ICING_ASSERT_OK_AND_ASSIGN( + lang_segmenter_, + language_segmenter_factory::Create(std::move(options))); + } + + std::unique_ptr<const JniCache> jni_cache_; + std::unique_ptr<LanguageSegmenter> lang_segmenter_; +}; + +std::vector<std::string> GetTokenTerms(const std::vector<Token>& tokens) { + std::vector<std::string> terms; + terms.reserve(tokens.size()); + for (const Token& token : tokens) { + if (token.type == Token::Type::REGULAR) { + terms.push_back(std::string(token.text)); + } + } + return terms; +} + +} // namespace + +TEST_F(CombinedTokenizerTest, SpecialCharacters) { + const std::string_view kText = "π Hello! Goodbye?"; + ICING_ASSERT_OK_AND_ASSIGN( + std::unique_ptr<Tokenizer> indexing_tokenizer, + tokenizer_factory::CreateIndexingTokenizer( + StringIndexingConfig::TokenizerType::PLAIN, lang_segmenter_.get())); + + ICING_ASSERT_OK_AND_ASSIGN( + std::unique_ptr<Tokenizer> query_tokenizer, + CreateQueryTokenizer(tokenizer_factory::QueryTokenizerType::RAW_QUERY, + lang_segmenter_.get())); + + ICING_ASSERT_OK_AND_ASSIGN(std::vector<Token> indexing_tokens, + indexing_tokenizer->TokenizeAll(kText)); + std::vector<std::string> indexing_terms = GetTokenTerms(indexing_tokens); + EXPECT_THAT(indexing_terms, ElementsAre("π", "Hello", "Goodbye")); + + ICING_ASSERT_OK_AND_ASSIGN(std::vector<Token> query_tokens, + query_tokenizer->TokenizeAll(kText)); + std::vector<std::string> query_terms = GetTokenTerms(query_tokens); + EXPECT_THAT(query_terms, ElementsAre("π", "Hello", "Goodbye")); +} + +TEST_F(CombinedTokenizerTest, Parentheses) { + const std::string_view kText = "((paren1)(paren2) (last paren))"; + ICING_ASSERT_OK_AND_ASSIGN( + std::unique_ptr<Tokenizer> indexing_tokenizer, + tokenizer_factory::CreateIndexingTokenizer( + 
StringIndexingConfig::TokenizerType::PLAIN, lang_segmenter_.get())); + + ICING_ASSERT_OK_AND_ASSIGN( + std::unique_ptr<Tokenizer> query_tokenizer, + CreateQueryTokenizer(tokenizer_factory::QueryTokenizerType::RAW_QUERY, + lang_segmenter_.get())); + + ICING_ASSERT_OK_AND_ASSIGN(std::vector<Token> indexing_tokens, + indexing_tokenizer->TokenizeAll(kText)); + std::vector<std::string> indexing_terms = GetTokenTerms(indexing_tokens); + EXPECT_THAT(indexing_terms, ElementsAre("paren1", "paren2", "last", "paren")); + + ICING_ASSERT_OK_AND_ASSIGN(std::vector<Token> query_tokens, + query_tokenizer->TokenizeAll(kText)); + std::vector<std::string> query_terms = GetTokenTerms(query_tokens); + EXPECT_THAT(query_terms, ElementsAre("paren1", "paren2", "last", "paren")); +} + +TEST_F(CombinedTokenizerTest, Negation) { + const std::string_view kText = "-foo -bar -baz"; + ICING_ASSERT_OK_AND_ASSIGN( + std::unique_ptr<Tokenizer> indexing_tokenizer, + tokenizer_factory::CreateIndexingTokenizer( + StringIndexingConfig::TokenizerType::PLAIN, lang_segmenter_.get())); + + ICING_ASSERT_OK_AND_ASSIGN( + std::unique_ptr<Tokenizer> query_tokenizer, + CreateQueryTokenizer(tokenizer_factory::QueryTokenizerType::RAW_QUERY, + lang_segmenter_.get())); + + ICING_ASSERT_OK_AND_ASSIGN(std::vector<Token> indexing_tokens, + indexing_tokenizer->TokenizeAll(kText)); + std::vector<std::string> indexing_terms = GetTokenTerms(indexing_tokens); + EXPECT_THAT(indexing_terms, ElementsAre("foo", "bar", "baz")); + + ICING_ASSERT_OK_AND_ASSIGN(std::vector<Token> query_tokens, + query_tokenizer->TokenizeAll(kText)); + std::vector<std::string> query_terms = GetTokenTerms(query_tokens); + EXPECT_THAT(query_terms, ElementsAre("foo", "bar", "baz")); +} + +TEST_F(CombinedTokenizerTest, Colons) { + const std::string_view kText = ":foo: :bar baz:"; + ICING_ASSERT_OK_AND_ASSIGN( + std::unique_ptr<Tokenizer> indexing_tokenizer, + tokenizer_factory::CreateIndexingTokenizer( + StringIndexingConfig::TokenizerType::PLAIN, 
lang_segmenter_.get())); + + ICING_ASSERT_OK_AND_ASSIGN( + std::unique_ptr<Tokenizer> query_tokenizer, + CreateQueryTokenizer(tokenizer_factory::QueryTokenizerType::RAW_QUERY, + lang_segmenter_.get())); + + ICING_ASSERT_OK_AND_ASSIGN(std::vector<Token> indexing_tokens, + indexing_tokenizer->TokenizeAll(kText)); + std::vector<std::string> indexing_terms = GetTokenTerms(indexing_tokens); + EXPECT_THAT(indexing_terms, ElementsAre("foo", "bar", "baz")); + + ICING_ASSERT_OK_AND_ASSIGN(std::vector<Token> query_tokens, + query_tokenizer->TokenizeAll(kText)); + std::vector<std::string> query_terms = GetTokenTerms(query_tokens); + EXPECT_THAT(query_terms, ElementsAre("foo", "bar", "baz")); +} + +TEST_F(CombinedTokenizerTest, ColonsPropertyRestricts) { + ICING_ASSERT_OK_AND_ASSIGN( + std::unique_ptr<Tokenizer> indexing_tokenizer, + tokenizer_factory::CreateIndexingTokenizer( + StringIndexingConfig::TokenizerType::PLAIN, lang_segmenter_.get())); + + ICING_ASSERT_OK_AND_ASSIGN( + std::unique_ptr<Tokenizer> query_tokenizer, + CreateQueryTokenizer(tokenizer_factory::QueryTokenizerType::RAW_QUERY, + lang_segmenter_.get())); + + // This is a difference between the two tokenizers. "foo:bar" is a single + // token to the plain tokenizer because ':' is a word connector. But "foo:bar" + // is a property restrict to the query tokenizer - so "foo" is the property + // and "bar" is the only text term. + constexpr std::string_view kText = "foo:bar"; + ICING_ASSERT_OK_AND_ASSIGN(std::vector<Token> indexing_tokens, + indexing_tokenizer->TokenizeAll(kText)); + std::vector<std::string> indexing_terms = GetTokenTerms(indexing_tokens); + EXPECT_THAT(indexing_terms, ElementsAre("foo:bar")); + + ICING_ASSERT_OK_AND_ASSIGN(std::vector<Token> query_tokens, + query_tokenizer->TokenizeAll(kText)); + std::vector<std::string> query_terms = GetTokenTerms(query_tokens); + EXPECT_THAT(query_terms, ElementsAre("bar")); + + // This difference, however, should only apply to the first ':'. 
A + // second ':' should be treated by both tokenizers as a word connector. + constexpr std::string_view kText2 = "foo:bar:baz"; + ICING_ASSERT_OK_AND_ASSIGN(indexing_tokens, + indexing_tokenizer->TokenizeAll(kText2)); + indexing_terms = GetTokenTerms(indexing_tokens); + EXPECT_THAT(indexing_terms, ElementsAre("foo:bar:baz")); + + ICING_ASSERT_OK_AND_ASSIGN(query_tokens, + query_tokenizer->TokenizeAll(kText2)); + query_terms = GetTokenTerms(query_tokens); + EXPECT_THAT(query_terms, ElementsAre("bar:baz")); +} + +TEST_F(CombinedTokenizerTest, Punctuation) { + const std::string_view kText = "Who? What!? Why & How."; + ICING_ASSERT_OK_AND_ASSIGN( + std::unique_ptr<Tokenizer> indexing_tokenizer, + tokenizer_factory::CreateIndexingTokenizer( + StringIndexingConfig::TokenizerType::PLAIN, lang_segmenter_.get())); + + ICING_ASSERT_OK_AND_ASSIGN( + std::unique_ptr<Tokenizer> query_tokenizer, + CreateQueryTokenizer(tokenizer_factory::QueryTokenizerType::RAW_QUERY, + lang_segmenter_.get())); + + ICING_ASSERT_OK_AND_ASSIGN(std::vector<Token> indexing_tokens, + indexing_tokenizer->TokenizeAll(kText)); + std::vector<std::string> indexing_terms = GetTokenTerms(indexing_tokens); + EXPECT_THAT(indexing_terms, ElementsAre("Who", "What", "Why", "How")); + + ICING_ASSERT_OK_AND_ASSIGN(std::vector<Token> query_tokens, + query_tokenizer->TokenizeAll(kText)); + std::vector<std::string> query_terms = GetTokenTerms(query_tokens); + EXPECT_THAT(query_terms, ElementsAre("Who", "What", "Why", "How")); +} + +} // namespace lib +} // namespace icing diff --git a/icing/tokenization/icu/icu-language-segmenter.cc b/icing/tokenization/icu/icu-language-segmenter.cc index 8e0f789..dc7b0a4 100644 --- a/icing/tokenization/icu/icu-language-segmenter.cc +++ b/icing/tokenization/icu/icu-language-segmenter.cc @@ -64,30 +64,26 @@ class IcuLanguageSegmenterIterator : public LanguageSegmenter::Iterator { // Advances to the next term. Returns false if it has reached the end. 
bool Advance() override { - while (true) { - // Prerequisite check - if (term_end_index_exclusive_ == UBRK_DONE) { - return false; - } - - if (term_end_index_exclusive_ == 0) { - // First Advance() call - term_start_index_ = ubrk_first(break_iterator_); - } else { - term_start_index_ = term_end_index_exclusive_; - } - term_end_index_exclusive_ = ubrk_next(break_iterator_); + // Prerequisite check + if (term_end_index_exclusive_ == UBRK_DONE) { + return false; + } - // Reached the end - if (term_end_index_exclusive_ == UBRK_DONE) { - MarkAsDone(); - return false; - } + if (term_end_index_exclusive_ == 0) { + // First Advance() call + term_start_index_ = ubrk_first(break_iterator_); + } else { + term_start_index_ = term_end_index_exclusive_; + } + term_end_index_exclusive_ = ubrk_next(break_iterator_); - if (IsValidSegment()) { - return true; - } + // Reached the end + if (term_end_index_exclusive_ == UBRK_DONE) { + MarkAsDone(); + return false; } + + return true; } // Returns the current term. It can be called only when Advance() returns @@ -227,8 +223,7 @@ class IcuLanguageSegmenterIterator : public LanguageSegmenter::Iterator { return absl_ports::AbortedError( "Could not retrieve valid utf8 character!"); } - if (term_end_index_exclusive_ > offset_iterator_.utf8_index() || - !IsValidSegment()) { + if (term_end_index_exclusive_ > offset_iterator_.utf8_index()) { return ResetToTermEndingBeforeUtf32(term_start_iterator.utf32_index()); } return term_start_iterator.utf32_index(); @@ -295,24 +290,6 @@ class IcuLanguageSegmenterIterator : public LanguageSegmenter::Iterator { term_start_index_ = 0; } - bool IsValidSegment() const { - // Rule 1: all ASCII terms will be returned. - // We know it's a ASCII term by checking the first char. 
- if (i18n_utils::IsAscii(text_[term_start_index_])) { - return true; - } - - UChar32 uchar32 = i18n_utils::GetUChar32At(text_.data(), text_.length(), - term_start_index_); - // Rule 2: for non-ASCII terms, only the alphanumeric terms are returned. - // We know it's an alphanumeric term by checking the first unicode - // character. - if (i18n_utils::IsAlphaNumeric(uchar32)) { - return true; - } - return false; - } - // The underlying class that does the segmentation, ubrk_close() must be // called after using. UBreakIterator* break_iterator_; diff --git a/icing/tokenization/icu/icu-language-segmenter_test.cc b/icing/tokenization/icu/icu-language-segmenter_test.cc index fe0b96e..4098be5 100644 --- a/icing/tokenization/icu/icu-language-segmenter_test.cc +++ b/icing/tokenization/icu/icu-language-segmenter_test.cc @@ -191,7 +191,7 @@ TEST_P(IcuLanguageSegmenterAllLocalesTest, Non_ASCII_Non_Alphabetic) { // Full-width (non-ASCII) punctuation marks and special characters are left // out. EXPECT_THAT(language_segmenter->GetAllTerms("γοΌΒ·HelloοΌΓ"), - IsOkAndHolds(ElementsAre("Hello"))); + IsOkAndHolds(ElementsAre("γ", "οΌ", "Β·", "Hello", "οΌ", "Γ"))); } TEST_P(IcuLanguageSegmenterAllLocalesTest, Acronym) { @@ -252,9 +252,9 @@ TEST_P(IcuLanguageSegmenterAllLocalesTest, WordConnector) { // Connectors don't connect if one side is an invalid term (οΌ) EXPECT_THAT(language_segmenter->GetAllTerms("bar:baz:οΌ"), - IsOkAndHolds(ElementsAre("bar:baz", ":"))); + IsOkAndHolds(ElementsAre("bar:baz", ":", "οΌ"))); EXPECT_THAT(language_segmenter->GetAllTerms("οΌ:bar:baz"), - IsOkAndHolds(ElementsAre(":", "bar:baz"))); + IsOkAndHolds(ElementsAre("οΌ", ":", "bar:baz"))); EXPECT_THAT(language_segmenter->GetAllTerms("3:14"), IsOkAndHolds(ElementsAre("3", ":", "14"))); EXPECT_THAT(language_segmenter->GetAllTerms("η§:γ―"), @@ -417,15 +417,16 @@ TEST_P(IcuLanguageSegmenterAllLocalesTest, CJKT) { // have whitespaces as word delimiter. 
// Chinese - EXPECT_THAT(language_segmenter->GetAllTerms("ζζ―倩衰路ε»δΈηγ"), - IsOkAndHolds(ElementsAre("ζ", "ζ―倩", "θ΅°θ·―", "ε»", "δΈη"))); + EXPECT_THAT( + language_segmenter->GetAllTerms("ζζ―倩衰路ε»δΈηγ"), + IsOkAndHolds(ElementsAre("ζ", "ζ―倩", "θ΅°θ·―", "ε»", "δΈη", "γ"))); // Japanese EXPECT_THAT(language_segmenter->GetAllTerms("η§γ―ζ―ζ₯δ»δΊγ«ζ©γγ¦γγΎγγ"), IsOkAndHolds(ElementsAre("η§", "γ―", "ζ―ζ₯", "δ»δΊ", "γ«", "ζ©", - "γ", "γ¦γ", "γΎγ"))); + "γ", "γ¦γ", "γΎγ", "γ"))); // Khmer EXPECT_THAT(language_segmenter->GetAllTerms("αα»αααΎααα
ααααΎααΆαααΆααααααα"), - IsOkAndHolds(ElementsAre("αα»α", "ααΎααα
", "ααααΎααΆα", "ααΆαααααα"))); + IsOkAndHolds(ElementsAre("αα»α", "ααΎααα
", "ααααΎααΆα", "ααΆαααααα", "α"))); // Thai EXPECT_THAT( language_segmenter->GetAllTerms("ΰΈΰΈ±ΰΈΰΉΰΈΰΈ΄ΰΈΰΉΰΈΰΈΰΈ³ΰΈΰΈ²ΰΈΰΈΰΈΈΰΈΰΈ§ΰΈ±ΰΈ"), @@ -858,16 +859,19 @@ TEST_P(IcuLanguageSegmenterAllLocalesTest, ChineseResetToTermAfterUtf32) { ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<LanguageSegmenter::Iterator> itr, language_segmenter->Segment(kChinese)); // String: "ζζ―倩衰路ε»δΈηγ" - // ^ ^ ^ ^^ - // UTF-8 idx: 0 3 9 15 18 - // UTF-832 idx: 0 1 3 5 6 + // ^ ^ ^ ^^ ^ + // UTF-8 idx: 0 3 9 15 18 24 + // UTF-832 idx: 0 1 3 5 6 8 EXPECT_THAT(itr->ResetToTermStartingAfterUtf32(0), IsOkAndHolds(Eq(1))); EXPECT_THAT(itr->GetTerm(), Eq("ζ―倩")); EXPECT_THAT(itr->ResetToTermStartingAfterUtf32(2), IsOkAndHolds(Eq(3))); EXPECT_THAT(itr->GetTerm(), Eq("θ΅°θ·―")); - EXPECT_THAT(itr->ResetToTermStartingAfterUtf32(7), + EXPECT_THAT(itr->ResetToTermStartingAfterUtf32(7), IsOkAndHolds(Eq(8))); + EXPECT_THAT(itr->GetTerm(), Eq("γ")); + + EXPECT_THAT(itr->ResetToTermStartingAfterUtf32(8), StatusIs(libtextclassifier3::StatusCode::NOT_FOUND)); EXPECT_THAT(itr->GetTerm(), IsEmpty()); } @@ -882,18 +886,21 @@ TEST_P(IcuLanguageSegmenterAllLocalesTest, JapaneseResetToTermAfterUtf32) { ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<LanguageSegmenter::Iterator> itr, language_segmenter->Segment(kJapanese)); // String: "η§γ―ζ―ζ₯δ»δΊγ«ζ©γγ¦γγΎγγ" - // ^ ^ ^ ^ ^ ^ ^ ^ ^ - // UTF-8 idx: 0 3 6 12 18212427 33 - // UTF-32 idx: 0 1 2 4 6 7 8 9 11 + // ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ + // UTF-8 idx: 0 3 6 12 18212427 33 39 + // UTF-32 idx: 0 1 2 4 6 7 8 9 11 13 EXPECT_THAT(itr->ResetToTermStartingAfterUtf32(0), IsOkAndHolds(Eq(1))); EXPECT_THAT(itr->GetTerm(), Eq("γ―")); - EXPECT_THAT(itr->ResetToTermStartingAfterUtf32(11), + EXPECT_THAT(itr->ResetToTermStartingAfterUtf32(13), StatusIs(libtextclassifier3::StatusCode::NOT_FOUND)); EXPECT_THAT(itr->GetTerm(), IsEmpty()); EXPECT_THAT(itr->ResetToTermStartingAfterUtf32(3), IsOkAndHolds(Eq(4))); EXPECT_THAT(itr->GetTerm(), Eq("δ»δΊ")); + + 
EXPECT_THAT(itr->ResetToTermStartingAfterUtf32(12), IsOkAndHolds(Eq(13))); + EXPECT_THAT(itr->GetTerm(), Eq("γ")); } TEST_P(IcuLanguageSegmenterAllLocalesTest, KhmerResetToTermAfterUtf32) { @@ -905,13 +912,16 @@ TEST_P(IcuLanguageSegmenterAllLocalesTest, KhmerResetToTermAfterUtf32) { ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<LanguageSegmenter::Iterator> itr, language_segmenter->Segment(kKhmer)); // String: "αα»αααΎααα
ααααΎααΆαααΆααααααα" - // ^ ^ ^ ^ - // UTF-8 idx: 0 9 24 45 - // UTF-32 idx: 0 3 8 15 + // ^ ^ ^ ^ ^ + // UTF-8 idx: 0 9 24 45 69 + // UTF-32 idx: 0 3 8 15 23 EXPECT_THAT(itr->ResetToTermStartingAfterUtf32(0), IsOkAndHolds(Eq(3))); EXPECT_THAT(itr->GetTerm(), Eq("ααΎααα
")); - EXPECT_THAT(itr->ResetToTermStartingAfterUtf32(15), + EXPECT_THAT(itr->ResetToTermStartingAfterUtf32(15), IsOkAndHolds(Eq(23))); + EXPECT_THAT(itr->GetTerm(), Eq("α")); + + EXPECT_THAT(itr->ResetToTermStartingAfterUtf32(23), StatusIs(libtextclassifier3::StatusCode::NOT_FOUND)); EXPECT_THAT(itr->GetTerm(), IsEmpty()); diff --git a/icing/tokenization/raw-query-tokenizer.cc b/icing/tokenization/raw-query-tokenizer.cc index 8a27103..ff449a7 100644 --- a/icing/tokenization/raw-query-tokenizer.cc +++ b/icing/tokenization/raw-query-tokenizer.cc @@ -102,7 +102,7 @@ enum State { // When seeing right parentheses CLOSING_PARENTHESES = 8, - PROCESSING_NON_ASCII_ALPHABETIC_TERM = 9, + PROCESSING_NON_ASCII_ALPHANUMERIC_TERM = 9, PROCESSING_PROPERTY_TERM_APPENDING = 10, @@ -119,7 +119,7 @@ enum TermType { // A term that consists of unicode alphabetic and numeric characters ASCII_ALPHANUMERIC_TERM = 1, - NON_ASCII_ALPHABETIC_TERM = 2, + NON_ASCII_ALPHANUMERIC_TERM = 2, // "(" LEFT_PARENTHESES = 3, @@ -208,7 +208,7 @@ std::string_view GetErrorMessage(ActionOrError maybe_error) { // PROCESSING_OR = 6 // OPENING_PARENTHESES = 7 // CLOSING_PARENTHESES = 8 -// PROCESSING_NON_ASCII_ALPHABETIC_TERM = 9 +// PROCESSING_NON_ASCII_ALPHANUMERIC_TERM = 9 // PROCESSING_PROPERTY_TERM_APPENDING = 10 // // Actions: @@ -252,40 +252,40 @@ std::string_view GetErrorMessage(ActionOrError maybe_error) { // like "+", "&", "@", "#" in indexing and query tokenizers. 
constexpr State state_transition_rules[STATE_COUNT][TYPE_COUNT] = { /*State: Ready*/ - {READY, PROCESSING_ALPHANUMERIC_TERM, PROCESSING_NON_ASCII_ALPHABETIC_TERM, + {READY, PROCESSING_ALPHANUMERIC_TERM, PROCESSING_NON_ASCII_ALPHANUMERIC_TERM, OPENING_PARENTHESES, CLOSING_PARENTHESES, PROCESSING_EXCLUSION, PROCESSING_OR, READY, READY}, /*State: PROCESSING_ALPHANUMERIC_TERM*/ - {READY, PROCESSING_ALPHANUMERIC_TERM, PROCESSING_NON_ASCII_ALPHABETIC_TERM, + {READY, PROCESSING_ALPHANUMERIC_TERM, PROCESSING_NON_ASCII_ALPHANUMERIC_TERM, OPENING_PARENTHESES, CLOSING_PARENTHESES, READY, INVALID, PROCESSING_PROPERTY_RESTRICT, READY}, /*State: PROCESSING_EXCLUSION*/ {READY, PROCESSING_EXCLUSION_TERM, PROCESSING_EXCLUSION_TERM, INVALID, CLOSING_PARENTHESES, PROCESSING_EXCLUSION, INVALID, INVALID, READY}, /*State: PROCESSING_EXCLUSION_TERM*/ - {READY, PROCESSING_ALPHANUMERIC_TERM, PROCESSING_NON_ASCII_ALPHABETIC_TERM, + {READY, PROCESSING_ALPHANUMERIC_TERM, PROCESSING_NON_ASCII_ALPHANUMERIC_TERM, OPENING_PARENTHESES, CLOSING_PARENTHESES, READY, INVALID, INVALID, READY}, /*State: PROCESSING_PROPERTY_RESTRICT*/ {READY, PROCESSING_PROPERTY_TERM, PROCESSING_PROPERTY_TERM, INVALID, CLOSING_PARENTHESES, INVALID, INVALID, PROCESSING_PROPERTY_RESTRICT, READY}, /*State: PROCESSING_PROPERTY_TERM*/ - {READY, PROCESSING_ALPHANUMERIC_TERM, PROCESSING_NON_ASCII_ALPHABETIC_TERM, + {READY, PROCESSING_ALPHANUMERIC_TERM, PROCESSING_NON_ASCII_ALPHANUMERIC_TERM, OPENING_PARENTHESES, CLOSING_PARENTHESES, READY, INVALID, PROCESSING_PROPERTY_TERM_APPENDING, READY}, /*State: PROCESSING_OR*/ {READY, INVALID, INVALID, OPENING_PARENTHESES, CLOSING_PARENTHESES, INVALID, INVALID, INVALID, READY}, /*State: OPENING_PARENTHESES*/ - {READY, PROCESSING_ALPHANUMERIC_TERM, PROCESSING_NON_ASCII_ALPHABETIC_TERM, + {READY, PROCESSING_ALPHANUMERIC_TERM, PROCESSING_NON_ASCII_ALPHANUMERIC_TERM, OPENING_PARENTHESES, CLOSING_PARENTHESES, PROCESSING_EXCLUSION, OPENING_PARENTHESES, READY, READY}, /*State: 
CLOSING_PARENTHESES*/ - {READY, PROCESSING_ALPHANUMERIC_TERM, PROCESSING_NON_ASCII_ALPHABETIC_TERM, + {READY, PROCESSING_ALPHANUMERIC_TERM, PROCESSING_NON_ASCII_ALPHANUMERIC_TERM, OPENING_PARENTHESES, CLOSING_PARENTHESES, PROCESSING_EXCLUSION, PROCESSING_OR, INVALID, READY}, - /*State: PROCESSING_NON_ASCII_ALPHABETIC_TERM*/ - {READY, PROCESSING_ALPHANUMERIC_TERM, PROCESSING_NON_ASCII_ALPHABETIC_TERM, + /*State: PROCESSING_NON_ASCII_ALPHANUMERIC_TERM*/ + {READY, PROCESSING_ALPHANUMERIC_TERM, PROCESSING_NON_ASCII_ALPHANUMERIC_TERM, OPENING_PARENTHESES, CLOSING_PARENTHESES, READY, INVALID, INVALID, READY}, /*State: PROCESSING_PROPERTY_TERM_APPENDING*/ {READY, PROCESSING_PROPERTY_TERM_APPENDING, @@ -326,7 +326,7 @@ constexpr ActionOrError action_rules[STATE_COUNT][TYPE_COUNT] = { /*State: CLOSING_PARENTHESES*/ {OUTPUT, OUTPUT, OUTPUT, OUTPUT, OUTPUT, OUTPUT, OUTPUT, ERROR_GROUP_AS_PROPERTY_NAME, OUTPUT}, - /*State: PROCESSING_NON_ASCII_ALPHABETIC_TERM*/ + /*State: PROCESSING_NON_ASCII_ALPHANUMERIC_TERM*/ {OUTPUT, OUTPUT, OUTPUT, OUTPUT, OUTPUT, OUTPUT, ERROR_NO_WHITESPACE_AROUND_OR, ERROR_NON_ASCII_AS_PROPERTY_NAME, OUTPUT}, /*State: PROCESSING_PROPERTY_TERM_APPENDING*/ @@ -345,6 +345,40 @@ std::pair<TermType, std::string_view> GetWhitespaceTerm(std::string_view text, return std::make_pair(WHITESPACE, text.substr(pos, cur - pos)); } +TermType GetContentTermType(std::string_view text, size_t pos) { + if (i18n_utils::IsPunctuationAt(text, pos)) { + return OTHER; + } else if (i18n_utils::IsAscii(text[pos])) { + return ASCII_ALPHANUMERIC_TERM; + } + return NON_ASCII_ALPHANUMERIC_TERM; +} + +bool IsContentTermType(TermType term_type) { + switch (term_type) { + case ASCII_ALPHANUMERIC_TERM: + [[fallthrough]]; + case NON_ASCII_ALPHANUMERIC_TERM: + [[fallthrough]]; + case OTHER: + return true; + case WHITESPACE: + [[fallthrough]]; + case LEFT_PARENTHESES: + [[fallthrough]]; + case RIGHT_PARENTHESES: + [[fallthrough]]; + case EXCLUSION_OPERATOR: + [[fallthrough]]; + case 
OR_OPERATOR: + [[fallthrough]]; + case COLON: + [[fallthrough]]; + case TYPE_COUNT: + return false; + } +} + // Determines the length of the potential content term beginning at text[pos] // and returns a pair with the appropriate TermType and a string_view of the // content term. @@ -357,12 +391,7 @@ std::pair<TermType, std::string_view> GetContentTerm(std::string_view text, size_t pos) { size_t len = 0; // Checks the first char to see if it's an ASCII term - TermType type = ASCII_ALPHANUMERIC_TERM; - if (!i18n_utils::IsAscii(text[pos])) { - type = NON_ASCII_ALPHABETIC_TERM; - } else if (std::isalnum(text[pos])) { - type = OTHER; - } + TermType type = GetContentTermType(text, pos); for (size_t cur = pos; cur < text.length() && len == 0; ++cur) { switch (text[cur]) { case kLeftParentheses: @@ -470,7 +499,7 @@ libtextclassifier3::Status OutputToken(State new_state, switch (current_term_type) { case ASCII_ALPHANUMERIC_TERM: [[fallthrough]]; - case NON_ASCII_ALPHABETIC_TERM: + case NON_ASCII_ALPHANUMERIC_TERM: if (new_state == PROCESSING_PROPERTY_TERM) { // Asserts extra rule 1: each property name in the property path is a // valid term. @@ -540,10 +569,8 @@ libtextclassifier3::Status ProcessTerm( ICING_ASSIGN_OR_RETURN(std::vector<std::string_view> content_terms, language_segmenter->GetAllTerms(*current_term)); for (std::string_view term : content_terms) { - TermType type = ASCII_ALPHANUMERIC_TERM; - if (!i18n_utils::IsAscii(term[0])) { - type = NON_ASCII_ALPHABETIC_TERM; - } else if (!std::isalnum(term[0])) { + TermType type = GetContentTermType(term, 0); + if (type == OTHER) { // Skip OTHER tokens here. 
continue; } @@ -589,9 +616,7 @@ libtextclassifier3::StatusOr<std::vector<Token>> ProcessTerms( for (int i = 0; i < prescanned_terms.size(); ++i) { const std::pair<TermType, std::string_view>& prescanned_term = prescanned_terms.at(i); - if (prescanned_term.first != ASCII_ALPHANUMERIC_TERM && - prescanned_term.first != NON_ASCII_ALPHABETIC_TERM && - prescanned_term.first != OTHER) { + if (!IsContentTermType(prescanned_term.first)) { // This can't be a property restrict. Just pass it in. ICING_RETURN_IF_ERROR( ProcessTerm(&current_state, &current_term, &current_term_type, @@ -603,18 +628,15 @@ libtextclassifier3::StatusOr<std::vector<Token>> ProcessTerms( std::vector<std::string_view> content_terms, language_segmenter->GetAllTerms(prescanned_term.second)); for (std::string_view term : content_terms) { - TermType type = ASCII_ALPHANUMERIC_TERM; + TermType type = GetContentTermType(term, 0); if (term == kOrOperator) { // TODO(tjbarron) Decide whether we should revise this and other // handled syntax. This is used to allow queries like "term1,OR,term2" // to succeed. It's not clear if we should allow this or require // clients to ensure that OR operators are always surrounded by // whitespace. + // Override the type if this is actually an OR operator. 
type = OR_OPERATOR; - } else if (!i18n_utils::IsAscii(term[0])) { - type = NON_ASCII_ALPHABETIC_TERM; - } else if (!std::isalnum(term[0])) { - type = OTHER; } ICING_RETURN_IF_ERROR(ProcessTerm(¤t_state, ¤t_term, ¤t_term_type, diff --git a/icing/tokenization/raw-query-tokenizer_test.cc b/icing/tokenization/raw-query-tokenizer_test.cc index c6d981d..b1dcc73 100644 --- a/icing/tokenization/raw-query-tokenizer_test.cc +++ b/icing/tokenization/raw-query-tokenizer_test.cc @@ -70,6 +70,29 @@ TEST_F(RawQueryTokenizerTest, Simple) { EqualsToken(Token::Type::REGULAR, "WORLD")))); } +TEST_F(RawQueryTokenizerTest, Emoji) { + language_segmenter_factory::SegmenterOptions options(ULOC_US); + ICING_ASSERT_OK_AND_ASSIGN( + auto language_segmenter, + language_segmenter_factory::Create(std::move(options))); + ICING_ASSERT_OK_AND_ASSIGN( + std::unique_ptr<Tokenizer> raw_query_tokenizer, + tokenizer_factory::CreateQueryTokenizer(tokenizer_factory::RAW_QUERY, + language_segmenter.get())); + + EXPECT_THAT( + raw_query_tokenizer->TokenizeAll("π Hello! Goodbye?"), + IsOkAndHolds(ElementsAre(EqualsToken(Token::Type::REGULAR, "π"), + EqualsToken(Token::Type::REGULAR, "Hello"), + EqualsToken(Token::Type::REGULAR, "Goodbye")))); + + EXPECT_THAT( + raw_query_tokenizer->TokenizeAll("Helloπ ! 
Goodbye?"), + IsOkAndHolds(ElementsAre(EqualsToken(Token::Type::REGULAR, "Hello"), + EqualsToken(Token::Type::REGULAR, "π"), + EqualsToken(Token::Type::REGULAR, "Goodbye")))); +} + TEST_F(RawQueryTokenizerTest, Parentheses) { language_segmenter_factory::SegmenterOptions options(ULOC_US); ICING_ASSERT_OK_AND_ASSIGN( @@ -80,26 +103,35 @@ TEST_F(RawQueryTokenizerTest, Parentheses) { tokenizer_factory::CreateQueryTokenizer(tokenizer_factory::RAW_QUERY, language_segmenter.get())); - EXPECT_THAT(raw_query_tokenizer->TokenizeAll("()"), - IsOkAndHolds(ElementsAre( - EqualsToken(Token::Type::QUERY_LEFT_PARENTHESES, ""), - EqualsToken(Token::Type::QUERY_RIGHT_PARENTHESES, "")))); + ICING_ASSERT_OK_AND_ASSIGN(std::vector<Token> query_tokens, + raw_query_tokenizer->TokenizeAll("()")); + EXPECT_THAT( + query_tokens, + ElementsAre(EqualsToken(Token::Type::QUERY_LEFT_PARENTHESES, ""), + EqualsToken(Token::Type::QUERY_RIGHT_PARENTHESES, ""))); - EXPECT_THAT(raw_query_tokenizer->TokenizeAll("( )"), - IsOkAndHolds(ElementsAre( - EqualsToken(Token::Type::QUERY_LEFT_PARENTHESES, ""), - EqualsToken(Token::Type::QUERY_RIGHT_PARENTHESES, "")))); + ICING_ASSERT_OK_AND_ASSIGN(query_tokens, + raw_query_tokenizer->TokenizeAll("( )")); + EXPECT_THAT( + query_tokens, + ElementsAre(EqualsToken(Token::Type::QUERY_LEFT_PARENTHESES, ""), + EqualsToken(Token::Type::QUERY_RIGHT_PARENTHESES, ""))); - EXPECT_THAT(raw_query_tokenizer->TokenizeAll("(term1 term2)"), - IsOkAndHolds(ElementsAre( - EqualsToken(Token::Type::QUERY_LEFT_PARENTHESES, ""), + ICING_ASSERT_OK_AND_ASSIGN(query_tokens, + raw_query_tokenizer->TokenizeAll("(term1 term2)")); + EXPECT_THAT( + query_tokens, + ElementsAre(EqualsToken(Token::Type::QUERY_LEFT_PARENTHESES, ""), EqualsToken(Token::Type::REGULAR, "term1"), EqualsToken(Token::Type::REGULAR, "term2"), - EqualsToken(Token::Type::QUERY_RIGHT_PARENTHESES, "")))); + EqualsToken(Token::Type::QUERY_RIGHT_PARENTHESES, ""))); - EXPECT_THAT(raw_query_tokenizer->TokenizeAll("((term1 
term2) (term3 term4))"), - IsOkAndHolds(ElementsAre( - EqualsToken(Token::Type::QUERY_LEFT_PARENTHESES, ""), + ICING_ASSERT_OK_AND_ASSIGN( + query_tokens, + raw_query_tokenizer->TokenizeAll("((term1 term2) (term3 term4))")); + EXPECT_THAT( + query_tokens, + ElementsAre(EqualsToken(Token::Type::QUERY_LEFT_PARENTHESES, ""), EqualsToken(Token::Type::QUERY_LEFT_PARENTHESES, ""), EqualsToken(Token::Type::REGULAR, "term1"), EqualsToken(Token::Type::REGULAR, "term2"), @@ -108,21 +140,24 @@ TEST_F(RawQueryTokenizerTest, Parentheses) { EqualsToken(Token::Type::REGULAR, "term3"), EqualsToken(Token::Type::REGULAR, "term4"), EqualsToken(Token::Type::QUERY_RIGHT_PARENTHESES, ""), - EqualsToken(Token::Type::QUERY_RIGHT_PARENTHESES, "")))); + EqualsToken(Token::Type::QUERY_RIGHT_PARENTHESES, ""))); - EXPECT_THAT(raw_query_tokenizer->TokenizeAll("term1(term2)"), - IsOkAndHolds(ElementsAre( - EqualsToken(Token::Type::REGULAR, "term1"), + ICING_ASSERT_OK_AND_ASSIGN(query_tokens, + raw_query_tokenizer->TokenizeAll("term1(term2)")); + EXPECT_THAT( + query_tokens, + ElementsAre(EqualsToken(Token::Type::REGULAR, "term1"), EqualsToken(Token::Type::QUERY_LEFT_PARENTHESES, ""), EqualsToken(Token::Type::REGULAR, "term2"), - EqualsToken(Token::Type::QUERY_RIGHT_PARENTHESES, "")))); - - EXPECT_THAT(raw_query_tokenizer->TokenizeAll("(term1)term2"), - IsOkAndHolds(ElementsAre( - EqualsToken(Token::Type::QUERY_LEFT_PARENTHESES, ""), - EqualsToken(Token::Type::REGULAR, "term1"), - EqualsToken(Token::Type::QUERY_RIGHT_PARENTHESES, ""), - EqualsToken(Token::Type::REGULAR, "term2")))); + EqualsToken(Token::Type::QUERY_RIGHT_PARENTHESES, ""))); + + ICING_ASSERT_OK_AND_ASSIGN(query_tokens, + raw_query_tokenizer->TokenizeAll("(term1)term2")); + EXPECT_THAT(query_tokens, + ElementsAre(EqualsToken(Token::Type::QUERY_LEFT_PARENTHESES, ""), + EqualsToken(Token::Type::REGULAR, "term1"), + EqualsToken(Token::Type::QUERY_RIGHT_PARENTHESES, ""), + EqualsToken(Token::Type::REGULAR, "term2"))); 
EXPECT_THAT(raw_query_tokenizer->TokenizeAll("(term1)(term2)"), IsOkAndHolds(ElementsAre( diff --git a/icing/tokenization/reverse_jni/reverse-jni-language-segmenter.cc b/icing/tokenization/reverse_jni/reverse-jni-language-segmenter.cc index cb474c6..e5de6e6 100644 --- a/icing/tokenization/reverse_jni/reverse-jni-language-segmenter.cc +++ b/icing/tokenization/reverse_jni/reverse-jni-language-segmenter.cc @@ -43,46 +43,38 @@ class ReverseJniLanguageSegmenterIterator : public LanguageSegmenter::Iterator { // Advances to the next term. Returns false if it has reached the end. bool Advance() override { - while (true) { - // Prerequisite check - if (IsDone()) { - return false; - } - - if (term_end_exclusive_.utf16_index() == 0) { - int first = break_iterator_->First(); - if (!term_start_.MoveToUtf16(first)) { - // First is guaranteed to succeed and return a position within bonds. - // So the only possible failure could be an invalid sequence. Mark as - // DONE and return. - MarkAsDone(); - return false; - } - } else { - term_start_ = term_end_exclusive_; - } + // Prerequisite check + if (IsDone()) { + return false; + } - int next_utf16_index_exclusive = break_iterator_->Next(); - // Reached the end - if (next_utf16_index_exclusive == ReverseJniBreakIterator::kDone) { - MarkAsDone(); - return false; - } - if (!term_end_exclusive_.MoveToUtf16(next_utf16_index_exclusive)) { - // next_utf16_index_exclusive is guaranteed to be within bonds thanks to - // the check for kDone above. So the only possible failure could be an - // invalid sequence. Mark as DONE and return. + if (term_end_exclusive_.utf16_index() == 0) { + int first = break_iterator_->First(); + if (!term_start_.MoveToUtf16(first)) { + // First is guaranteed to succeed and return a position within bonds. + // So the only possible failure could be an invalid sequence. Mark as + // DONE and return. 
MarkAsDone(); return false; } + } else { + term_start_ = term_end_exclusive_; + } - // Check if the current term is valid. We consider any term valid if its - // first character is valid. If it's not valid, then we need to advance to - // the next term. - if (IsValidTerm()) { - return true; - } + int next_utf16_index_exclusive = break_iterator_->Next(); + // Reached the end + if (next_utf16_index_exclusive == ReverseJniBreakIterator::kDone) { + MarkAsDone(); + return false; } + if (!term_end_exclusive_.MoveToUtf16(next_utf16_index_exclusive)) { + // next_utf16_index_exclusive is guaranteed to be within bonds thanks to + // the check for kDone above. So the only possible failure could be an + // invalid sequence. Mark as DONE and return. + MarkAsDone(); + return false; + } + return true; } // Returns the current term. It can be called only when Advance() returns @@ -245,7 +237,7 @@ class ReverseJniLanguageSegmenterIterator : public LanguageSegmenter::Iterator { // 4. The start and end indices point to a segment, but we need to ensure // that this segment is 1) valid and 2) ends before offset. Otherwise, we'll // need a segment prior to this one. - if (term_end_exclusive_.utf32_index() > offset || !IsValidTerm()) { + if (term_end_exclusive_.utf32_index() > offset) { return ResetToTermEndingBeforeUtf32(term_start_.utf32_index()); } return term_start_.utf32_index(); @@ -285,24 +277,6 @@ class ReverseJniLanguageSegmenterIterator : public LanguageSegmenter::Iterator { return term_end_exclusive_.utf16_index() == ReverseJniBreakIterator::kDone; } - bool IsValidTerm() const { - // Rule 1: all ASCII terms will be returned. - // We know it's a ASCII term by checking the first char. - if (i18n_utils::IsAscii(text_[term_start_.utf8_index()])) { - return true; - } - - UChar32 uchar32 = i18n_utils::GetUChar32At(text_.data(), text_.length(), - term_start_.utf8_index()); - // Rule 2: for non-ASCII terms, only the alphanumeric terms are returned. 
- // We know it's an alphanumeric term by checking the first unicode - // character. - if (i18n_utils::IsAlphaNumeric(uchar32)) { - return true; - } - return false; - } - // All of ReverseJniBreakIterator's functions return UTF-16 boundaries. So // this class needs to maintain state to convert between UTF-16 and UTF-8. std::unique_ptr<ReverseJniBreakIterator> break_iterator_; diff --git a/icing/tokenization/reverse_jni/reverse-jni-language-segmenter_test.cc b/icing/tokenization/reverse_jni/reverse-jni-language-segmenter_test.cc index 45d6475..277ece6 100644 --- a/icing/tokenization/reverse_jni/reverse-jni-language-segmenter_test.cc +++ b/icing/tokenization/reverse_jni/reverse-jni-language-segmenter_test.cc @@ -185,7 +185,7 @@ TEST_P(ReverseJniLanguageSegmenterTest, Non_ASCII_Non_Alphabetic) { // Full-width (non-ASCII) punctuation marks and special characters are left // out. EXPECT_THAT(language_segmenter->GetAllTerms("γοΌΒ·HelloοΌΓ"), - IsOkAndHolds(ElementsAre("Hello"))); + IsOkAndHolds(ElementsAre("γ", "οΌ", "Β·", "Hello", "οΌ", "Γ"))); } TEST_P(ReverseJniLanguageSegmenterTest, Acronym) { @@ -246,9 +246,9 @@ TEST_P(ReverseJniLanguageSegmenterTest, WordConnector) { // Connectors don't connect if one side is an invalid term (οΌ) EXPECT_THAT(language_segmenter->GetAllTerms("bar:baz:οΌ"), - IsOkAndHolds(ElementsAre("bar:baz", ":"))); + IsOkAndHolds(ElementsAre("bar:baz", ":", "οΌ"))); EXPECT_THAT(language_segmenter->GetAllTerms("οΌ:bar:baz"), - IsOkAndHolds(ElementsAre(":", "bar:baz"))); + IsOkAndHolds(ElementsAre("οΌ", ":", "bar:baz"))); EXPECT_THAT(language_segmenter->GetAllTerms("3:14"), IsOkAndHolds(ElementsAre("3", ":", "14"))); EXPECT_THAT(language_segmenter->GetAllTerms("η§:γ―"), @@ -413,15 +413,17 @@ TEST_P(ReverseJniLanguageSegmenterTest, CJKT) { // have whitespaces as word delimiter. 
// Chinese - EXPECT_THAT(language_segmenter->GetAllTerms("ζζ―倩衰路ε»δΈηγ"), - IsOkAndHolds(ElementsAre("ζ", "ζ―倩", "θ΅°θ·―", "ε»", "δΈη"))); + EXPECT_THAT( + language_segmenter->GetAllTerms("ζζ―倩衰路ε»δΈηγ"), + IsOkAndHolds(ElementsAre("ζ", "ζ―倩", "θ΅°θ·―", "ε»", "δΈη", "γ"))); // Japanese EXPECT_THAT(language_segmenter->GetAllTerms("η§γ―ζ―ζ₯δ»δΊγ«ζ©γγ¦γγΎγγ"), IsOkAndHolds(ElementsAre("η§", "γ―", "ζ―ζ₯", "δ»δΊ", "γ«", "ζ©", - "γ", "γ¦γ", "γΎγ"))); + "γ", "γ¦γ", "γΎγ", "γ"))); // Khmer EXPECT_THAT(language_segmenter->GetAllTerms("αα»αααΎααα
ααααΎααΆαααΆααααααα"), - IsOkAndHolds(ElementsAre("αα»α", "ααΎααα
", "ααααΎααΆα", "ααΆαααααα"))); + IsOkAndHolds(ElementsAre("αα»α", "ααΎααα
", "ααααΎααΆα", "ααΆαααααα", "α"))); + // Thai EXPECT_THAT( language_segmenter->GetAllTerms("ΰΈΰΈ±ΰΈΰΉΰΈΰΈ΄ΰΈΰΉΰΈΰΈΰΈ³ΰΈΰΈ²ΰΈΰΈΰΈΈΰΈΰΈ§ΰΈ±ΰΈ"), @@ -852,16 +854,19 @@ TEST_P(ReverseJniLanguageSegmenterTest, ChineseResetToTermAfterUtf32) { ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<LanguageSegmenter::Iterator> itr, language_segmenter->Segment(kChinese)); // String: "ζζ―倩衰路ε»δΈηγ" - // ^ ^ ^ ^^ - // UTF-8 idx: 0 3 9 15 18 - // UTF-832 idx: 0 1 3 5 6 + // ^ ^ ^ ^^ ^ + // UTF-8 idx: 0 3 9 15 18 24 + // UTF-832 idx: 0 1 3 5 6 8 EXPECT_THAT(itr->ResetToTermStartingAfterUtf32(0), IsOkAndHolds(Eq(1))); EXPECT_THAT(itr->GetTerm(), Eq("ζ―倩")); EXPECT_THAT(itr->ResetToTermStartingAfterUtf32(2), IsOkAndHolds(Eq(3))); EXPECT_THAT(itr->GetTerm(), Eq("θ΅°θ·―")); - EXPECT_THAT(itr->ResetToTermStartingAfterUtf32(7), + EXPECT_THAT(itr->ResetToTermStartingAfterUtf32(7), IsOkAndHolds(Eq(8))); + EXPECT_THAT(itr->GetTerm(), Eq("γ")); + + EXPECT_THAT(itr->ResetToTermStartingAfterUtf32(8), StatusIs(libtextclassifier3::StatusCode::NOT_FOUND)); EXPECT_THAT(itr->GetTerm(), IsEmpty()); } @@ -876,18 +881,21 @@ TEST_P(ReverseJniLanguageSegmenterTest, JapaneseResetToTermAfterUtf32) { ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<LanguageSegmenter::Iterator> itr, language_segmenter->Segment(kJapanese)); // String: "η§γ―ζ―ζ₯δ»δΊγ«ζ©γγ¦γγΎγγ" - // ^ ^ ^ ^ ^ ^ ^ ^ ^ - // UTF-8 idx: 0 3 6 12 18212427 33 - // UTF-32 idx: 0 1 2 4 6 7 8 9 11 + // ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ + // UTF-8 idx: 0 3 6 12 18212427 33 39 + // UTF-32 idx: 0 1 2 4 6 7 8 9 11 13 EXPECT_THAT(itr->ResetToTermStartingAfterUtf32(0), IsOkAndHolds(Eq(1))); EXPECT_THAT(itr->GetTerm(), Eq("γ―")); - EXPECT_THAT(itr->ResetToTermStartingAfterUtf32(11), + EXPECT_THAT(itr->ResetToTermStartingAfterUtf32(13), StatusIs(libtextclassifier3::StatusCode::NOT_FOUND)); EXPECT_THAT(itr->GetTerm(), IsEmpty()); EXPECT_THAT(itr->ResetToTermStartingAfterUtf32(3), IsOkAndHolds(Eq(4))); EXPECT_THAT(itr->GetTerm(), Eq("δ»δΊ")); + + 
EXPECT_THAT(itr->ResetToTermStartingAfterUtf32(12), IsOkAndHolds(Eq(13))); + EXPECT_THAT(itr->GetTerm(), Eq("γ")); } TEST_P(ReverseJniLanguageSegmenterTest, KhmerResetToTermAfterUtf32) { @@ -899,13 +907,16 @@ TEST_P(ReverseJniLanguageSegmenterTest, KhmerResetToTermAfterUtf32) { ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<LanguageSegmenter::Iterator> itr, language_segmenter->Segment(kKhmer)); // String: "αα»αααΎααα
ααααΎααΆαααΆααααααα" - // ^ ^ ^ ^ - // UTF-8 idx: 0 9 24 45 - // UTF-32 idx: 0 3 8 15 + // ^ ^ ^ ^ ^ + // UTF-8 idx: 0 9 24 45 69 + // UTF-32 idx: 0 3 8 15 23 EXPECT_THAT(itr->ResetToTermStartingAfterUtf32(0), IsOkAndHolds(Eq(3))); EXPECT_THAT(itr->GetTerm(), Eq("ααΎααα
")); - EXPECT_THAT(itr->ResetToTermStartingAfterUtf32(15), + EXPECT_THAT(itr->ResetToTermStartingAfterUtf32(15), IsOkAndHolds(Eq(23))); + EXPECT_THAT(itr->GetTerm(), Eq("α")); + + EXPECT_THAT(itr->ResetToTermStartingAfterUtf32(23), StatusIs(libtextclassifier3::StatusCode::NOT_FOUND)); EXPECT_THAT(itr->GetTerm(), IsEmpty()); diff --git a/icing/tokenization/tokenizer.h b/icing/tokenization/tokenizer.h index 2bc18cc..24f8269 100644 --- a/icing/tokenization/tokenizer.h +++ b/icing/tokenization/tokenizer.h @@ -40,15 +40,6 @@ class Tokenizer { public: virtual ~Tokenizer() = default; - enum Type { - // Index tokenizers - PLAIN, // Used to tokenize plain text input - VERBATIM, // Used to tokenize the input text in verbatim - - // Query tokenizers - RAW_QUERY, // Used to tokenize raw queries - }; - // An iterator helping to get tokens. // Example usage: // |