author     Tim Barron <tjbarron@google.com>  2022-05-12 20:16:54 +0000
committer  Automerger Merge Worker <android-build-automerger-merge-worker@system.gserviceaccount.com>  2022-05-12 20:16:54 +0000
commit     e8614c08278e7779449636fd9627ddec63fe8263 (patch)
tree       7642107143cc381a8442c42b110d685ee0fa1b6d
parent     eeda28f34322d469f05595ad5255888b52d5e5de (diff)
parent     8c72386649cb1f0950a4cb24cbaca2ccf16abd91 (diff)
download   icing-e8614c08278e7779449636fd9627ddec63fe8263.tar.gz
Merge "Stop dropping nonascii/nonalnum segments." into tm-dev am: 34f7ab9226 am: 8c72386649
Original change: https://googleplex-android-review.googlesource.com/c/platform/external/icing/+/18328740

Change-Id: I41a3f5bd90f63cf98493b45ae6894f80c2e21569
Signed-off-by: Automerger Merge Worker <android-build-automerger-merge-worker@system.gserviceaccount.com>
-rw-r--r--  icing/icing-search-engine_test.cc                                     | 135
-rw-r--r--  icing/tokenization/combined-tokenizer_test.cc                         | 232
-rw-r--r--  icing/tokenization/icu/icu-language-segmenter.cc                      |  59
-rw-r--r--  icing/tokenization/icu/icu-language-segmenter_test.cc                 |  48
-rw-r--r--  icing/tokenization/raw-query-tokenizer.cc                             |  84
-rw-r--r--  icing/tokenization/raw-query-tokenizer_test.cc                        |  89
-rw-r--r--  icing/tokenization/reverse_jni/reverse-jni-language-segmenter.cc      |  80
-rw-r--r--  icing/tokenization/reverse_jni/reverse-jni-language-segmenter_test.cc |  49
-rw-r--r--  icing/tokenization/tokenizer.h                                        |   9
9 files changed, 586 insertions(+), 199 deletions(-)
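
This merge removes the segment-validity filter from both language segmenters (ICU and reverse-JNI). Previously, any non-ASCII segment whose first code point was not alphanumeric was silently dropped during segmentation; segmenters now return every segment, and the tokenizers decide what to keep (the plain tokenizer surfaces only REGULAR tokens, and the raw-query tokenizer skips OTHER tokens). A minimal sketch of the new segmenter contract, using the GetAllTerms API exercised in the tests below, with values taken directly from the updated expectations:

    // Before: non-ASCII segments starting with a non-alphanumeric code point
    // were dropped, so only "Hello" survived.
    // After: every segment is returned; filtering is the tokenizers' job.
    EXPECT_THAT(language_segmenter->GetAllTerms("γ€‚οΌŸΒ·Hello!×"),
                IsOkAndHolds(ElementsAre("。", "?", "Β·", "Hello", "!", "Γ—")));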
diff --git a/icing/icing-search-engine_test.cc b/icing/icing-search-engine_test.cc
index 7ed8885..5244f4c 100644
--- a/icing/icing-search-engine_test.cc
+++ b/icing/icing-search-engine_test.cc
@@ -8082,6 +8082,141 @@ TEST_F(IcingSearchEngineTest, CJKSnippetTest) {
EXPECT_THAT(match_proto.exact_match_utf16_length(), Eq(2));
}
+TEST_F(IcingSearchEngineTest, InvalidToEmptyQueryTest) {
+ IcingSearchEngine icing(GetDefaultIcingOptions(), GetTestJniCache());
+ ASSERT_THAT(icing.Initialize().status(), ProtoIsOk());
+ ASSERT_THAT(icing.SetSchema(CreateMessageSchema()).status(), ProtoIsOk());
+
+ // String: "Luca Brasi sleeps with the 🐟🐟🐟."
+ // ^ ^ ^ ^ ^ ^ ^ ^ ^
+ // UTF8 idx:  0 5    11     18   23  27 31 35 39
+ // UTF16 idx: 0 5    11     18   23  27 29 31 33
+ // Breaks into segments: "Luca", "Brasi", "sleeps", "with", "the", "🐟", "🐟"
+ // and "🐟".
+ constexpr std::string_view kSicilianMessage =
+ "Luca Brasi sleeps with the 🐟🐟🐟.";
+ DocumentProto document = DocumentBuilder()
+ .SetKey("namespace", "uri1")
+ .SetSchema("Message")
+ .AddStringProperty("body", kSicilianMessage)
+ .Build();
+ ASSERT_THAT(icing.Put(document).status(), ProtoIsOk());
+ DocumentProto document_two =
+ DocumentBuilder()
+ .SetKey("namespace", "uri2")
+ .SetSchema("Message")
+ .AddStringProperty("body", "Some other content.")
+ .Build();
+ ASSERT_THAT(icing.Put(document_two).status(), ProtoIsOk());
+
+ // Each query below consists only of characters that produce no content
+ // terms, so it should be treated as an empty query and match all documents.
+ SearchSpecProto search_spec;
+ search_spec.set_query("?");
+ search_spec.set_term_match_type(MATCH_PREFIX);
+ ScoringSpecProto scoring_spec;
+ ResultSpecProto result_spec;
+
+ // Search and make sure that both documents are returned
+ SearchResultProto search_results =
+ icing.Search(search_spec, scoring_spec, result_spec);
+ EXPECT_THAT(search_results.status(), ProtoIsOk());
+ EXPECT_THAT(search_results.results(), SizeIs(2));
+
+ search_spec.set_query("。");
+ search_results = icing.Search(search_spec, scoring_spec, result_spec);
+ EXPECT_THAT(search_results.status(), ProtoIsOk());
+ EXPECT_THAT(search_results.results(), SizeIs(2));
+
+ search_spec.set_query("-");
+ search_results = icing.Search(search_spec, scoring_spec, result_spec);
+ EXPECT_THAT(search_results.status(), ProtoIsOk());
+ EXPECT_THAT(search_results.results(), SizeIs(2));
+
+ search_spec.set_query(":");
+ search_results = icing.Search(search_spec, scoring_spec, result_spec);
+ EXPECT_THAT(search_results.status(), ProtoIsOk());
+ EXPECT_THAT(search_results.results(), SizeIs(2));
+
+ search_spec.set_query("OR");
+ search_results = icing.Search(search_spec, scoring_spec, result_spec);
+ EXPECT_THAT(search_results.status(), ProtoIsOk());
+ EXPECT_THAT(search_results.results(), SizeIs(2));
+
+ search_spec.set_query(" ");
+ search_results = icing.Search(search_spec, scoring_spec, result_spec);
+ EXPECT_THAT(search_results.status(), ProtoIsOk());
+ EXPECT_THAT(search_results.results(), SizeIs(2));
+}
+
+TEST_F(IcingSearchEngineTest, EmojiSnippetTest) {
+ IcingSearchEngine icing(GetDefaultIcingOptions(), GetTestJniCache());
+ ASSERT_THAT(icing.Initialize().status(), ProtoIsOk());
+ ASSERT_THAT(icing.SetSchema(CreateMessageSchema()).status(), ProtoIsOk());
+
+ // String: "Luca Brasi sleeps with the 🐟🐟🐟."
+ // ^ ^ ^ ^ ^ ^ ^ ^ ^
+ // UTF8 idx:  0 5    11     18   23  27 31 35 39
+ // UTF16 idx: 0 5    11     18   23  27 29 31 33
+ // Breaks into segments: "Luca", "Brasi", "sleeps", "with", "the", "🐟", "🐟"
+ // and "🐟".
+ constexpr std::string_view kSicilianMessage =
+ "Luca Brasi sleeps with the 🐟🐟🐟.";
+ DocumentProto document = DocumentBuilder()
+ .SetKey("namespace", "uri1")
+ .SetSchema("Message")
+ .AddStringProperty("body", kSicilianMessage)
+ .Build();
+ ASSERT_THAT(icing.Put(document).status(), ProtoIsOk());
+ DocumentProto document_two =
+ DocumentBuilder()
+ .SetKey("namespace", "uri2")
+ .SetSchema("Message")
+ .AddStringProperty("body", "Some other content.")
+ .Build();
+ ASSERT_THAT(icing.Put(document_two).status(), ProtoIsOk());
+
+ // Search and request snippet matching but no windowing.
+ SearchSpecProto search_spec;
+ search_spec.set_query("🐟");
+ search_spec.set_term_match_type(MATCH_PREFIX);
+
+ ResultSpecProto result_spec;
+ result_spec.mutable_snippet_spec()->set_num_to_snippet(1);
+ result_spec.mutable_snippet_spec()->set_num_matches_per_property(1);
+
+ // Search and make sure that we got a single successful result
+ SearchResultProto search_results = icing.Search(
+ search_spec, ScoringSpecProto::default_instance(), result_spec);
+ ASSERT_THAT(search_results.status(), ProtoIsOk());
+ ASSERT_THAT(search_results.results(), SizeIs(1));
+ const SearchResultProto::ResultProto* result = &search_results.results(0);
+ EXPECT_THAT(result->document().uri(), Eq("uri1"));
+
+ // Ensure that one and only one property was matched and it was "body"
+ ASSERT_THAT(result->snippet().entries(), SizeIs(1));
+ const SnippetProto::EntryProto* entry = &result->snippet().entries(0);
+ EXPECT_THAT(entry->property_name(), Eq("body"));
+
+ // Get the content for "body" and see what the match is.
+ std::string_view content = GetString(&result->document(), "body");
+ ASSERT_THAT(content, Eq(kSicilianMessage));
+
+ // Ensure that there is one and only one match within "body"
+ ASSERT_THAT(entry->snippet_matches(), SizeIs(1));
+ const SnippetMatchProto& match_proto = entry->snippet_matches(0);
+
+ EXPECT_THAT(match_proto.exact_match_byte_position(), Eq(27));
+ EXPECT_THAT(match_proto.exact_match_byte_length(), Eq(4));
+ std::string_view match =
+ content.substr(match_proto.exact_match_byte_position(),
+ match_proto.exact_match_byte_length());
+ ASSERT_THAT(match, Eq("🐟"));
+
+ // Ensure that the utf-16 values are also as expected
+ EXPECT_THAT(match_proto.exact_match_utf16_position(), Eq(27));
+ EXPECT_THAT(match_proto.exact_match_utf16_length(), Eq(2));
+}
+
TEST_F(IcingSearchEngineTest, PutDocumentIndexFailureDeletion) {
IcingSearchEngine icing(GetDefaultIcingOptions(), GetTestJniCache());
ASSERT_THAT(icing.Initialize().status(), ProtoIsOk());
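
A note on the arithmetic in EmojiSnippetTest above: 🐟 (U+1F41F) occupies four UTF-8 bytes but only two UTF-16 code units (a surrogate pair), which is why the match is expected to have exact_match_byte_length 4 and exact_match_utf16_length 2, both starting at offset 27. A standalone sketch (plain C++, no icing dependencies) that reproduces the numbers:

    #include <iostream>
    #include <string>

    int main() {
      // U+1F41F encoded the two ways the SnippetMatchProto reports offsets.
      std::string utf8 = "\xF0\x9F\x90\x9F";  // UTF-8: 4 bytes
      std::u16string utf16 = u"\U0001F41F";   // UTF-16: surrogates D83D DC1F
      std::cout << "UTF-8 bytes: " << utf8.size() << '\n';    // prints 4
      std::cout << "UTF-16 units: " << utf16.size() << '\n';  // prints 2
      return 0;
    }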
diff --git a/icing/tokenization/combined-tokenizer_test.cc b/icing/tokenization/combined-tokenizer_test.cc
new file mode 100644
index 0000000..0212e4f
--- /dev/null
+++ b/icing/tokenization/combined-tokenizer_test.cc
@@ -0,0 +1,232 @@
+// Copyright (C) 2022 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <string_view>
+#include <vector>
+
+#include "testing/base/public/gmock.h"
+#include "testing/base/public/gunit.h"
+#include "third_party/icing/portable/platform.h"
+#include "third_party/icing/proto/schema_proto_portable.pb.h"
+#include "third_party/icing/testing/common-matchers.h"
+#include "third_party/icing/testing/icu-data-file-helper.h"
+#include "third_party/icing/testing/jni-test-helpers.h"
+#include "third_party/icing/testing/test-data.h"
+#include "third_party/icing/tokenization/language-segmenter-factory.h"
+#include "third_party/icing/tokenization/language-segmenter.h"
+#include "third_party/icing/tokenization/tokenizer-factory.h"
+#include "third_party/icing/tokenization/tokenizer.h"
+#include "third_party/icu/include/unicode/uloc.h"
+
+namespace icing {
+namespace lib {
+
+namespace {
+
+using ::testing::ElementsAre;
+
+// This test exists to ensure that the different tokenizers treat different
+// segments of text in the same manner.
+class CombinedTokenizerTest : public ::testing::Test {
+ protected:
+ void SetUp() override {
+ if (!IsCfStringTokenization() && !IsReverseJniTokenization()) {
+ ICING_ASSERT_OK(
+ // File generated via icu_data_file rule in //third_party/icing/BUILD.
+ icu_data_file_helper::SetUpICUDataFile(
+ GetTestFilePath("third_party/icing/icu.dat")));
+ }
+ jni_cache_ = GetTestJniCache();
+
+ language_segmenter_factory::SegmenterOptions options(ULOC_US,
+ jni_cache_.get());
+ ICING_ASSERT_OK_AND_ASSIGN(
+ lang_segmenter_,
+ language_segmenter_factory::Create(std::move(options)));
+ }
+
+ std::unique_ptr<const JniCache> jni_cache_;
+ std::unique_ptr<LanguageSegmenter> lang_segmenter_;
+};
+
+std::vector<std::string> GetTokenTerms(const std::vector<Token>& tokens) {
+ std::vector<std::string> terms;
+ terms.reserve(tokens.size());
+ for (const Token& token : tokens) {
+ if (token.type == Token::Type::REGULAR) {
+ terms.push_back(std::string(token.text));
+ }
+ }
+ return terms;
+}
+
+} // namespace
+
+TEST_F(CombinedTokenizerTest, SpecialCharacters) {
+ const std::string_view kText = "😊 Hello! Goodbye?";
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<Tokenizer> indexing_tokenizer,
+ tokenizer_factory::CreateIndexingTokenizer(
+ StringIndexingConfig::TokenizerType::PLAIN, lang_segmenter_.get()));
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<Tokenizer> query_tokenizer,
+ CreateQueryTokenizer(tokenizer_factory::QueryTokenizerType::RAW_QUERY,
+ lang_segmenter_.get()));
+
+ ICING_ASSERT_OK_AND_ASSIGN(std::vector<Token> indexing_tokens,
+ indexing_tokenizer->TokenizeAll(kText));
+ std::vector<std::string> indexing_terms = GetTokenTerms(indexing_tokens);
+ EXPECT_THAT(indexing_terms, ElementsAre("😊", "Hello", "Goodbye"));
+
+ ICING_ASSERT_OK_AND_ASSIGN(std::vector<Token> query_tokens,
+ query_tokenizer->TokenizeAll(kText));
+ std::vector<std::string> query_terms = GetTokenTerms(query_tokens);
+ EXPECT_THAT(query_terms, ElementsAre("😊", "Hello", "Goodbye"));
+}
+
+TEST_F(CombinedTokenizerTest, Parentheses) {
+ const std::string_view kText = "((paren1)(paren2) (last paren))";
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<Tokenizer> indexing_tokenizer,
+ tokenizer_factory::CreateIndexingTokenizer(
+ StringIndexingConfig::TokenizerType::PLAIN, lang_segmenter_.get()));
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<Tokenizer> query_tokenizer,
+ CreateQueryTokenizer(tokenizer_factory::QueryTokenizerType::RAW_QUERY,
+ lang_segmenter_.get()));
+
+ ICING_ASSERT_OK_AND_ASSIGN(std::vector<Token> indexing_tokens,
+ indexing_tokenizer->TokenizeAll(kText));
+ std::vector<std::string> indexing_terms = GetTokenTerms(indexing_tokens);
+ EXPECT_THAT(indexing_terms, ElementsAre("paren1", "paren2", "last", "paren"));
+
+ ICING_ASSERT_OK_AND_ASSIGN(std::vector<Token> query_tokens,
+ query_tokenizer->TokenizeAll(kText));
+ std::vector<std::string> query_terms = GetTokenTerms(query_tokens);
+ EXPECT_THAT(query_terms, ElementsAre("paren1", "paren2", "last", "paren"));
+}
+
+TEST_F(CombinedTokenizerTest, Negation) {
+ const std::string_view kText = "-foo -bar -baz";
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<Tokenizer> indexing_tokenizer,
+ tokenizer_factory::CreateIndexingTokenizer(
+ StringIndexingConfig::TokenizerType::PLAIN, lang_segmenter_.get()));
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<Tokenizer> query_tokenizer,
+ CreateQueryTokenizer(tokenizer_factory::QueryTokenizerType::RAW_QUERY,
+ lang_segmenter_.get()));
+
+ ICING_ASSERT_OK_AND_ASSIGN(std::vector<Token> indexing_tokens,
+ indexing_tokenizer->TokenizeAll(kText));
+ std::vector<std::string> indexing_terms = GetTokenTerms(indexing_tokens);
+ EXPECT_THAT(indexing_terms, ElementsAre("foo", "bar", "baz"));
+
+ ICING_ASSERT_OK_AND_ASSIGN(std::vector<Token> query_tokens,
+ query_tokenizer->TokenizeAll(kText));
+ std::vector<std::string> query_terms = GetTokenTerms(query_tokens);
+ EXPECT_THAT(query_terms, ElementsAre("foo", "bar", "baz"));
+}
+
+TEST_F(CombinedTokenizerTest, Colons) {
+ const std::string_view kText = ":foo: :bar baz:";
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<Tokenizer> indexing_tokenizer,
+ tokenizer_factory::CreateIndexingTokenizer(
+ StringIndexingConfig::TokenizerType::PLAIN, lang_segmenter_.get()));
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<Tokenizer> query_tokenizer,
+ CreateQueryTokenizer(tokenizer_factory::QueryTokenizerType::RAW_QUERY,
+ lang_segmenter_.get()));
+
+ ICING_ASSERT_OK_AND_ASSIGN(std::vector<Token> indexing_tokens,
+ indexing_tokenizer->TokenizeAll(kText));
+ std::vector<std::string> indexing_terms = GetTokenTerms(indexing_tokens);
+ EXPECT_THAT(indexing_terms, ElementsAre("foo", "bar", "baz"));
+
+ ICING_ASSERT_OK_AND_ASSIGN(std::vector<Token> query_tokens,
+ query_tokenizer->TokenizeAll(kText));
+ std::vector<std::string> query_terms = GetTokenTerms(query_tokens);
+ EXPECT_THAT(query_terms, ElementsAre("foo", "bar", "baz"));
+}
+
+TEST_F(CombinedTokenizerTest, ColonsPropertyRestricts) {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<Tokenizer> indexing_tokenizer,
+ tokenizer_factory::CreateIndexingTokenizer(
+ StringIndexingConfig::TokenizerType::PLAIN, lang_segmenter_.get()));
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<Tokenizer> query_tokenizer,
+ CreateQueryTokenizer(tokenizer_factory::QueryTokenizerType::RAW_QUERY,
+ lang_segmenter_.get()));
+
+ // This is a difference between the two tokenizers. "foo:bar" is a single
+ // token to the plain tokenizer because ':' is a word connector. But "foo:bar"
+ // is a property restrict to the query tokenizer - so "foo" is the property
+ // and "bar" is the only text term.
+ constexpr std::string_view kText = "foo:bar";
+ ICING_ASSERT_OK_AND_ASSIGN(std::vector<Token> indexing_tokens,
+ indexing_tokenizer->TokenizeAll(kText));
+ std::vector<std::string> indexing_terms = GetTokenTerms(indexing_tokens);
+ EXPECT_THAT(indexing_terms, ElementsAre("foo:bar"));
+
+ ICING_ASSERT_OK_AND_ASSIGN(std::vector<Token> query_tokens,
+ query_tokenizer->TokenizeAll(kText));
+ std::vector<std::string> query_terms = GetTokenTerms(query_tokens);
+ EXPECT_THAT(query_terms, ElementsAre("bar"));
+
+ // This difference, however, should only apply to the first ':'. A
+ // second ':' should be treated by both tokenizers as a word connector.
+ constexpr std::string_view kText2 = "foo:bar:baz";
+ ICING_ASSERT_OK_AND_ASSIGN(indexing_tokens,
+ indexing_tokenizer->TokenizeAll(kText2));
+ indexing_terms = GetTokenTerms(indexing_tokens);
+ EXPECT_THAT(indexing_terms, ElementsAre("foo:bar:baz"));
+
+ ICING_ASSERT_OK_AND_ASSIGN(query_tokens,
+ query_tokenizer->TokenizeAll(kText2));
+ query_terms = GetTokenTerms(query_tokens);
+ EXPECT_THAT(query_terms, ElementsAre("bar:baz"));
+}
+
+TEST_F(CombinedTokenizerTest, Punctuation) {
+ const std::string_view kText = "Who? What!? Why & How.";
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<Tokenizer> indexing_tokenizer,
+ tokenizer_factory::CreateIndexingTokenizer(
+ StringIndexingConfig::TokenizerType::PLAIN, lang_segmenter_.get()));
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<Tokenizer> query_tokenizer,
+ CreateQueryTokenizer(tokenizer_factory::QueryTokenizerType::RAW_QUERY,
+ lang_segmenter_.get()));
+
+ ICING_ASSERT_OK_AND_ASSIGN(std::vector<Token> indexing_tokens,
+ indexing_tokenizer->TokenizeAll(kText));
+ std::vector<std::string> indexing_terms = GetTokenTerms(indexing_tokens);
+ EXPECT_THAT(indexing_terms, ElementsAre("Who", "What", "Why", "How"));
+
+ ICING_ASSERT_OK_AND_ASSIGN(std::vector<Token> query_tokens,
+ query_tokenizer->TokenizeAll(kText));
+ std::vector<std::string> query_terms = GetTokenTerms(query_tokens);
+ EXPECT_THAT(query_terms, ElementsAre("Who", "What", "Why", "How"));
+}
+
+} // namespace lib
+} // namespace icing
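
Taken together, the combined tests pin down a single contract: the indexing and query tokenizers must agree on emoji, parentheses, negation markers, colons, and punctuation, and the only sanctioned divergence is the leading property-restrict colon. A quick reference (every value copied from the expectations above):

    // Input            indexing (PLAIN)    query (RAW_QUERY)
    // "😊 Hello!"      😊, Hello           😊, Hello
    // "-foo -bar"      foo, bar            foo, bar
    // "foo:bar"        foo:bar             bar        (first ':' restricts)
    // "foo:bar:baz"    foo:bar:baz         bar:baz    (later ':' connects)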
diff --git a/icing/tokenization/icu/icu-language-segmenter.cc b/icing/tokenization/icu/icu-language-segmenter.cc
index 8e0f789..dc7b0a4 100644
--- a/icing/tokenization/icu/icu-language-segmenter.cc
+++ b/icing/tokenization/icu/icu-language-segmenter.cc
@@ -64,30 +64,26 @@ class IcuLanguageSegmenterIterator : public LanguageSegmenter::Iterator {
// Advances to the next term. Returns false if it has reached the end.
bool Advance() override {
- while (true) {
- // Prerequisite check
- if (term_end_index_exclusive_ == UBRK_DONE) {
- return false;
- }
-
- if (term_end_index_exclusive_ == 0) {
- // First Advance() call
- term_start_index_ = ubrk_first(break_iterator_);
- } else {
- term_start_index_ = term_end_index_exclusive_;
- }
- term_end_index_exclusive_ = ubrk_next(break_iterator_);
+ // Prerequisite check
+ if (term_end_index_exclusive_ == UBRK_DONE) {
+ return false;
+ }
- // Reached the end
- if (term_end_index_exclusive_ == UBRK_DONE) {
- MarkAsDone();
- return false;
- }
+ if (term_end_index_exclusive_ == 0) {
+ // First Advance() call
+ term_start_index_ = ubrk_first(break_iterator_);
+ } else {
+ term_start_index_ = term_end_index_exclusive_;
+ }
+ term_end_index_exclusive_ = ubrk_next(break_iterator_);
- if (IsValidSegment()) {
- return true;
- }
+ // Reached the end
+ if (term_end_index_exclusive_ == UBRK_DONE) {
+ MarkAsDone();
+ return false;
}
+
+ return true;
}
// Returns the current term. It can be called only when Advance() returns
@@ -227,8 +223,7 @@ class IcuLanguageSegmenterIterator : public LanguageSegmenter::Iterator {
return absl_ports::AbortedError(
"Could not retrieve valid utf8 character!");
}
- if (term_end_index_exclusive_ > offset_iterator_.utf8_index() ||
- !IsValidSegment()) {
+ if (term_end_index_exclusive_ > offset_iterator_.utf8_index()) {
return ResetToTermEndingBeforeUtf32(term_start_iterator.utf32_index());
}
return term_start_iterator.utf32_index();
@@ -295,24 +290,6 @@ class IcuLanguageSegmenterIterator : public LanguageSegmenter::Iterator {
term_start_index_ = 0;
}
- bool IsValidSegment() const {
- // Rule 1: all ASCII terms will be returned.
- // We know it's a ASCII term by checking the first char.
- if (i18n_utils::IsAscii(text_[term_start_index_])) {
- return true;
- }
-
- UChar32 uchar32 = i18n_utils::GetUChar32At(text_.data(), text_.length(),
- term_start_index_);
- // Rule 2: for non-ASCII terms, only the alphanumeric terms are returned.
- // We know it's an alphanumeric term by checking the first unicode
- // character.
- if (i18n_utils::IsAlphaNumeric(uchar32)) {
- return true;
- }
- return false;
- }
-
// The underlying class that does the segmentation, ubrk_close() must be
// called after using.
UBreakIterator* break_iterator_;
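
Because interleaved +/- hunks are hard to read, here is the post-change Advance(), reassembled from the hunk above: the old retry loop existed only to skip segments that failed IsValidSegment(), so with that filter gone a single break-iterator step per call suffices. (The reverse-JNI iterator below gets the same loop-to-straight-line simplification.)

    bool Advance() override {
      // Prerequisite check
      if (term_end_index_exclusive_ == UBRK_DONE) {
        return false;
      }

      if (term_end_index_exclusive_ == 0) {
        // First Advance() call
        term_start_index_ = ubrk_first(break_iterator_);
      } else {
        term_start_index_ = term_end_index_exclusive_;
      }
      term_end_index_exclusive_ = ubrk_next(break_iterator_);

      // Reached the end
      if (term_end_index_exclusive_ == UBRK_DONE) {
        MarkAsDone();
        return false;
      }

      return true;
    }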
diff --git a/icing/tokenization/icu/icu-language-segmenter_test.cc b/icing/tokenization/icu/icu-language-segmenter_test.cc
index fe0b96e..4098be5 100644
--- a/icing/tokenization/icu/icu-language-segmenter_test.cc
+++ b/icing/tokenization/icu/icu-language-segmenter_test.cc
@@ -191,7 +191,7 @@ TEST_P(IcuLanguageSegmenterAllLocalesTest, Non_ASCII_Non_Alphabetic) {
- // Full-width (non-ASCII) punctuation marks and special characters are left
- // out.
+ // Full-width (non-ASCII) punctuation marks and special characters are now
+ // kept as segments.
EXPECT_THAT(language_segmenter->GetAllTerms("γ€‚οΌŸΒ·Hello!×"),
- IsOkAndHolds(ElementsAre("Hello")));
+ IsOkAndHolds(ElementsAre("。", "?", "Β·", "Hello", "!", "Γ—")));
}
TEST_P(IcuLanguageSegmenterAllLocalesTest, Acronym) {
@@ -252,9 +252,9 @@ TEST_P(IcuLanguageSegmenterAllLocalesTest, WordConnector) {
// Connectors don't connect if one side is an invalid term (?)
EXPECT_THAT(language_segmenter->GetAllTerms("bar:baz:?"),
- IsOkAndHolds(ElementsAre("bar:baz", ":")));
+ IsOkAndHolds(ElementsAre("bar:baz", ":", "?")));
EXPECT_THAT(language_segmenter->GetAllTerms("?:bar:baz"),
- IsOkAndHolds(ElementsAre(":", "bar:baz")));
+ IsOkAndHolds(ElementsAre("?", ":", "bar:baz")));
EXPECT_THAT(language_segmenter->GetAllTerms("3:14"),
IsOkAndHolds(ElementsAre("3", ":", "14")));
EXPECT_THAT(language_segmenter->GetAllTerms("私:は"),
@@ -417,15 +417,16 @@ TEST_P(IcuLanguageSegmenterAllLocalesTest, CJKT) {
// have whitespaces as word delimiter.
// Chinese
- EXPECT_THAT(language_segmenter->GetAllTerms("ζˆ‘ζ―ε€©θ΅°θ·―εŽ»δΈŠη­γ€‚"),
- IsOkAndHolds(ElementsAre("ζˆ‘", "每倩", "θ΅°θ·―", "去", "上班")));
+ EXPECT_THAT(
+ language_segmenter->GetAllTerms("ζˆ‘ζ―ε€©θ΅°θ·―εŽ»δΈŠη­γ€‚"),
+ IsOkAndHolds(ElementsAre("ζˆ‘", "每倩", "θ΅°θ·―", "去", "上班", "。")));
// Japanese
EXPECT_THAT(language_segmenter->GetAllTerms("私は毎ζ—₯仕事に歩いています。"),
IsOkAndHolds(ElementsAre("私", "は", "毎ζ—₯", "δ»•δΊ‹", "に", "ζ­©",
- "い", "てい", "ます")));
+ "い", "てい", "ます", "。")));
// Khmer
EXPECT_THAT(language_segmenter->GetAllTerms("αž‰αž»αŸ†αžŠαžΎαžšαž‘αŸ…αž’αŸ’αžœαžΎαž€αžΆαžšαžšαžΆαž›αŸ‹αžαŸ’αž„αŸƒαŸ”"),
- IsOkAndHolds(ElementsAre("αž‰αž»αŸ†", "αžŠαžΎαžšαž‘αŸ…", "αž’αŸ’αžœαžΎαž€αžΆαžš", "αžšαžΆαž›αŸ‹αžαŸ’αž„αŸƒ")));
+ IsOkAndHolds(ElementsAre("αž‰αž»αŸ†", "αžŠαžΎαžšαž‘αŸ…", "αž’αŸ’αžœαžΎαž€αžΆαžš", "αžšαžΆαž›αŸ‹αžαŸ’αž„αŸƒ", "αŸ”")));
// Thai
EXPECT_THAT(
language_segmenter->GetAllTerms("ฉันเดินไปทำงานทุกวัน"),
@@ -858,16 +859,19 @@ TEST_P(IcuLanguageSegmenterAllLocalesTest, ChineseResetToTermAfterUtf32) {
ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<LanguageSegmenter::Iterator> itr,
language_segmenter->Segment(kChinese));
// String: "ζˆ‘ζ―ε€©θ΅°θ·―εŽ»δΈŠη­γ€‚"
- // ^ ^ ^ ^^
- // UTF-8 idx: 0 3 9 15 18
- // UTF-32 idx: 0 1  3  5 6
+ // ^ ^ ^ ^^ ^
+ // UTF-8 idx: 0 3 9 15 18 24
+ // UTF-32 idx: 0 1  3  5 6  8
EXPECT_THAT(itr->ResetToTermStartingAfterUtf32(0), IsOkAndHolds(Eq(1)));
EXPECT_THAT(itr->GetTerm(), Eq("每倩"));
EXPECT_THAT(itr->ResetToTermStartingAfterUtf32(2), IsOkAndHolds(Eq(3)));
EXPECT_THAT(itr->GetTerm(), Eq("θ΅°θ·―"));
- EXPECT_THAT(itr->ResetToTermStartingAfterUtf32(7),
+ EXPECT_THAT(itr->ResetToTermStartingAfterUtf32(7), IsOkAndHolds(Eq(8)));
+ EXPECT_THAT(itr->GetTerm(), Eq("。"));
+
+ EXPECT_THAT(itr->ResetToTermStartingAfterUtf32(8),
StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
EXPECT_THAT(itr->GetTerm(), IsEmpty());
}
@@ -882,18 +886,21 @@ TEST_P(IcuLanguageSegmenterAllLocalesTest, JapaneseResetToTermAfterUtf32) {
ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<LanguageSegmenter::Iterator> itr,
language_segmenter->Segment(kJapanese));
// String: "私は毎ζ—₯仕事に歩いています。"
- // ^ ^ ^ ^ ^ ^ ^ ^ ^
- // UTF-8 idx: 0 3 6 12 18 21 24 27 33
- // UTF-32 idx: 0 1 2 4 6 7 8 9 11
+ // ^ ^ ^ ^ ^ ^ ^ ^ ^ ^
+ // UTF-8 idx: 0 3 6 12 18 21 24 27 33 39
+ // UTF-32 idx: 0 1 2 4 6 7 8 9 11 13
EXPECT_THAT(itr->ResetToTermStartingAfterUtf32(0), IsOkAndHolds(Eq(1)));
EXPECT_THAT(itr->GetTerm(), Eq("は"));
- EXPECT_THAT(itr->ResetToTermStartingAfterUtf32(11),
+ EXPECT_THAT(itr->ResetToTermStartingAfterUtf32(13),
StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
EXPECT_THAT(itr->GetTerm(), IsEmpty());
EXPECT_THAT(itr->ResetToTermStartingAfterUtf32(3), IsOkAndHolds(Eq(4)));
EXPECT_THAT(itr->GetTerm(), Eq("δ»•δΊ‹"));
+
+ EXPECT_THAT(itr->ResetToTermStartingAfterUtf32(12), IsOkAndHolds(Eq(13)));
+ EXPECT_THAT(itr->GetTerm(), Eq("。"));
}
TEST_P(IcuLanguageSegmenterAllLocalesTest, KhmerResetToTermAfterUtf32) {
@@ -905,13 +912,16 @@ TEST_P(IcuLanguageSegmenterAllLocalesTest, KhmerResetToTermAfterUtf32) {
ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<LanguageSegmenter::Iterator> itr,
language_segmenter->Segment(kKhmer));
// String: "αž‰αž»αŸ†αžŠαžΎαžšαž‘αŸ…αž’αŸ’αžœαžΎαž€αžΆαžšαžšαžΆαž›αŸ‹αžαŸ’αž„αŸƒαŸ”"
- // ^ ^ ^ ^
- // UTF-8 idx: 0 9 24 45
- // UTF-32 idx: 0 3 8 15
+ // ^ ^ ^ ^ ^
+ // UTF-8 idx: 0 9 24 45 69
+ // UTF-32 idx: 0 3 8 15 23
EXPECT_THAT(itr->ResetToTermStartingAfterUtf32(0), IsOkAndHolds(Eq(3)));
EXPECT_THAT(itr->GetTerm(), Eq("αžŠαžΎαžšαž‘αŸ…"));
- EXPECT_THAT(itr->ResetToTermStartingAfterUtf32(15),
+ EXPECT_THAT(itr->ResetToTermStartingAfterUtf32(15), IsOkAndHolds(Eq(23)));
+ EXPECT_THAT(itr->GetTerm(), Eq("αŸ”"));
+
+ EXPECT_THAT(itr->ResetToTermStartingAfterUtf32(23),
StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
EXPECT_THAT(itr->GetTerm(), IsEmpty());
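
The ResetToTermStartingAfterUtf32 tests above lean on hand-computed UTF-8/UTF-32 index tables. A hypothetical helper (not an icing API) that reproduces those tables by counting UTF-8 lead bytes, assuming well-formed input:

    #include <cassert>
    #include <cstddef>
    #include <string_view>

    // UTF-8 continuation bytes look like 0b10xxxxxx; every other byte starts
    // a new code point, so counting lead bytes yields the UTF-32 index.
    size_t Utf8ToUtf32Index(std::string_view text, size_t utf8_idx) {
      size_t utf32_idx = 0;
      for (size_t i = 0; i < utf8_idx && i < text.size(); ++i) {
        if ((static_cast<unsigned char>(text[i]) & 0xC0) != 0x80) ++utf32_idx;
      }
      return utf32_idx;
    }

    int main() {
      constexpr std::string_view kChinese = "ζˆ‘ζ―ε€©θ΅°θ·―εŽ»δΈŠη­γ€‚";
      assert(Utf8ToUtf32Index(kChinese, 24) == 8);  // "。": UTF-8 24 -> UTF-32 8
      assert(Utf8ToUtf32Index(kChinese, 15) == 5);  // "去": UTF-8 15 -> UTF-32 5
      return 0;
    }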
diff --git a/icing/tokenization/raw-query-tokenizer.cc b/icing/tokenization/raw-query-tokenizer.cc
index 8a27103..ff449a7 100644
--- a/icing/tokenization/raw-query-tokenizer.cc
+++ b/icing/tokenization/raw-query-tokenizer.cc
@@ -102,7 +102,7 @@ enum State {
// When seeing right parentheses
CLOSING_PARENTHESES = 8,
- PROCESSING_NON_ASCII_ALPHABETIC_TERM = 9,
+ PROCESSING_NON_ASCII_ALPHANUMERIC_TERM = 9,
PROCESSING_PROPERTY_TERM_APPENDING = 10,
@@ -119,7 +119,7 @@ enum TermType {
// A term that consists of unicode alphabetic and numeric characters
ASCII_ALPHANUMERIC_TERM = 1,
- NON_ASCII_ALPHABETIC_TERM = 2,
+ NON_ASCII_ALPHANUMERIC_TERM = 2,
// "("
LEFT_PARENTHESES = 3,
@@ -208,7 +208,7 @@ std::string_view GetErrorMessage(ActionOrError maybe_error) {
// PROCESSING_OR = 6
// OPENING_PARENTHESES = 7
// CLOSING_PARENTHESES = 8
-// PROCESSING_NON_ASCII_ALPHABETIC_TERM = 9
+// PROCESSING_NON_ASCII_ALPHANUMERIC_TERM = 9
// PROCESSING_PROPERTY_TERM_APPENDING = 10
//
// Actions:
@@ -252,40 +252,40 @@ std::string_view GetErrorMessage(ActionOrError maybe_error) {
// like "+", "&", "@", "#" in indexing and query tokenizers.
constexpr State state_transition_rules[STATE_COUNT][TYPE_COUNT] = {
/*State: Ready*/
- {READY, PROCESSING_ALPHANUMERIC_TERM, PROCESSING_NON_ASCII_ALPHABETIC_TERM,
+ {READY, PROCESSING_ALPHANUMERIC_TERM, PROCESSING_NON_ASCII_ALPHANUMERIC_TERM,
OPENING_PARENTHESES, CLOSING_PARENTHESES, PROCESSING_EXCLUSION,
PROCESSING_OR, READY, READY},
/*State: PROCESSING_ALPHANUMERIC_TERM*/
- {READY, PROCESSING_ALPHANUMERIC_TERM, PROCESSING_NON_ASCII_ALPHABETIC_TERM,
+ {READY, PROCESSING_ALPHANUMERIC_TERM, PROCESSING_NON_ASCII_ALPHANUMERIC_TERM,
OPENING_PARENTHESES, CLOSING_PARENTHESES, READY, INVALID,
PROCESSING_PROPERTY_RESTRICT, READY},
/*State: PROCESSING_EXCLUSION*/
{READY, PROCESSING_EXCLUSION_TERM, PROCESSING_EXCLUSION_TERM, INVALID,
CLOSING_PARENTHESES, PROCESSING_EXCLUSION, INVALID, INVALID, READY},
/*State: PROCESSING_EXCLUSION_TERM*/
- {READY, PROCESSING_ALPHANUMERIC_TERM, PROCESSING_NON_ASCII_ALPHABETIC_TERM,
+ {READY, PROCESSING_ALPHANUMERIC_TERM, PROCESSING_NON_ASCII_ALPHANUMERIC_TERM,
OPENING_PARENTHESES, CLOSING_PARENTHESES, READY, INVALID, INVALID, READY},
/*State: PROCESSING_PROPERTY_RESTRICT*/
{READY, PROCESSING_PROPERTY_TERM, PROCESSING_PROPERTY_TERM, INVALID,
CLOSING_PARENTHESES, INVALID, INVALID, PROCESSING_PROPERTY_RESTRICT,
READY},
/*State: PROCESSING_PROPERTY_TERM*/
- {READY, PROCESSING_ALPHANUMERIC_TERM, PROCESSING_NON_ASCII_ALPHABETIC_TERM,
+ {READY, PROCESSING_ALPHANUMERIC_TERM, PROCESSING_NON_ASCII_ALPHANUMERIC_TERM,
OPENING_PARENTHESES, CLOSING_PARENTHESES, READY, INVALID,
PROCESSING_PROPERTY_TERM_APPENDING, READY},
/*State: PROCESSING_OR*/
{READY, INVALID, INVALID, OPENING_PARENTHESES, CLOSING_PARENTHESES, INVALID,
INVALID, INVALID, READY},
/*State: OPENING_PARENTHESES*/
- {READY, PROCESSING_ALPHANUMERIC_TERM, PROCESSING_NON_ASCII_ALPHABETIC_TERM,
+ {READY, PROCESSING_ALPHANUMERIC_TERM, PROCESSING_NON_ASCII_ALPHANUMERIC_TERM,
OPENING_PARENTHESES, CLOSING_PARENTHESES, PROCESSING_EXCLUSION,
OPENING_PARENTHESES, READY, READY},
/*State: CLOSING_PARENTHESES*/
- {READY, PROCESSING_ALPHANUMERIC_TERM, PROCESSING_NON_ASCII_ALPHABETIC_TERM,
+ {READY, PROCESSING_ALPHANUMERIC_TERM, PROCESSING_NON_ASCII_ALPHANUMERIC_TERM,
OPENING_PARENTHESES, CLOSING_PARENTHESES, PROCESSING_EXCLUSION,
PROCESSING_OR, INVALID, READY},
- /*State: PROCESSING_NON_ASCII_ALPHABETIC_TERM*/
- {READY, PROCESSING_ALPHANUMERIC_TERM, PROCESSING_NON_ASCII_ALPHABETIC_TERM,
+ /*State: PROCESSING_NON_ASCII_ALPHANUMERIC_TERM*/
+ {READY, PROCESSING_ALPHANUMERIC_TERM, PROCESSING_NON_ASCII_ALPHANUMERIC_TERM,
OPENING_PARENTHESES, CLOSING_PARENTHESES, READY, INVALID, INVALID, READY},
/*State: PROCESSING_PROPERTY_TERM_APPENDING*/
{READY, PROCESSING_PROPERTY_TERM_APPENDING,
@@ -326,7 +326,7 @@ constexpr ActionOrError action_rules[STATE_COUNT][TYPE_COUNT] = {
/*State: CLOSING_PARENTHESES*/
{OUTPUT, OUTPUT, OUTPUT, OUTPUT, OUTPUT, OUTPUT, OUTPUT,
ERROR_GROUP_AS_PROPERTY_NAME, OUTPUT},
- /*State: PROCESSING_NON_ASCII_ALPHABETIC_TERM*/
+ /*State: PROCESSING_NON_ASCII_ALPHANUMERIC_TERM*/
{OUTPUT, OUTPUT, OUTPUT, OUTPUT, OUTPUT, OUTPUT,
ERROR_NO_WHITESPACE_AROUND_OR, ERROR_NON_ASCII_AS_PROPERTY_NAME, OUTPUT},
/*State: PROCESSING_PROPERTY_TERM_APPENDING*/
@@ -345,6 +345,40 @@ std::pair<TermType, std::string_view> GetWhitespaceTerm(std::string_view text,
return std::make_pair(WHITESPACE, text.substr(pos, cur - pos));
}
+TermType GetContentTermType(std::string_view text, size_t pos) {
+ if (i18n_utils::IsPunctuationAt(text, pos)) {
+ return OTHER;
+ } else if (i18n_utils::IsAscii(text[pos])) {
+ return ASCII_ALPHANUMERIC_TERM;
+ }
+ return NON_ASCII_ALPHANUMERIC_TERM;
+}
+
+bool IsContentTermType(TermType term_type) {
+ switch (term_type) {
+ case ASCII_ALPHANUMERIC_TERM:
+ [[fallthrough]];
+ case NON_ASCII_ALPHANUMERIC_TERM:
+ [[fallthrough]];
+ case OTHER:
+ return true;
+ case WHITESPACE:
+ [[fallthrough]];
+ case LEFT_PARENTHESES:
+ [[fallthrough]];
+ case RIGHT_PARENTHESES:
+ [[fallthrough]];
+ case EXCLUSION_OPERATOR:
+ [[fallthrough]];
+ case OR_OPERATOR:
+ [[fallthrough]];
+ case COLON:
+ [[fallthrough]];
+ case TYPE_COUNT:
+ return false;
+ }
+}
+
// Determines the length of the potential content term beginning at text[pos]
// and returns a pair with the appropriate TermType and a string_view of the
// content term.
@@ -357,12 +391,7 @@ std::pair<TermType, std::string_view> GetContentTerm(std::string_view text,
size_t pos) {
size_t len = 0;
// Checks the first char to see if it's an ASCII term
- TermType type = ASCII_ALPHANUMERIC_TERM;
- if (!i18n_utils::IsAscii(text[pos])) {
- type = NON_ASCII_ALPHABETIC_TERM;
- } else if (!std::isalnum(text[pos])) {
- type = OTHER;
- }
+ TermType type = GetContentTermType(text, pos);
for (size_t cur = pos; cur < text.length() && len == 0; ++cur) {
switch (text[cur]) {
case kLeftParentheses:
@@ -470,7 +499,7 @@ libtextclassifier3::Status OutputToken(State new_state,
switch (current_term_type) {
case ASCII_ALPHANUMERIC_TERM:
[[fallthrough]];
- case NON_ASCII_ALPHABETIC_TERM:
+ case NON_ASCII_ALPHANUMERIC_TERM:
if (new_state == PROCESSING_PROPERTY_TERM) {
// Asserts extra rule 1: each property name in the property path is a
// valid term.
@@ -540,10 +569,8 @@ libtextclassifier3::Status ProcessTerm(
ICING_ASSIGN_OR_RETURN(std::vector<std::string_view> content_terms,
language_segmenter->GetAllTerms(*current_term));
for (std::string_view term : content_terms) {
- TermType type = ASCII_ALPHANUMERIC_TERM;
- if (!i18n_utils::IsAscii(term[0])) {
- type = NON_ASCII_ALPHABETIC_TERM;
- } else if (!std::isalnum(term[0])) {
+ TermType type = GetContentTermType(term, 0);
+ if (type == OTHER) {
// Skip OTHER tokens here.
continue;
}
@@ -589,9 +616,7 @@ libtextclassifier3::StatusOr<std::vector<Token>> ProcessTerms(
for (int i = 0; i < prescanned_terms.size(); ++i) {
const std::pair<TermType, std::string_view>& prescanned_term =
prescanned_terms.at(i);
- if (prescanned_term.first != ASCII_ALPHANUMERIC_TERM &&
- prescanned_term.first != NON_ASCII_ALPHABETIC_TERM &&
- prescanned_term.first != OTHER) {
+ if (!IsContentTermType(prescanned_term.first)) {
// This can't be a property restrict. Just pass it in.
ICING_RETURN_IF_ERROR(
ProcessTerm(&current_state, &current_term, &current_term_type,
@@ -603,18 +628,15 @@ libtextclassifier3::StatusOr<std::vector<Token>> ProcessTerms(
std::vector<std::string_view> content_terms,
language_segmenter->GetAllTerms(prescanned_term.second));
for (std::string_view term : content_terms) {
- TermType type = ASCII_ALPHANUMERIC_TERM;
+ TermType type = GetContentTermType(term, 0);
if (term == kOrOperator) {
// TODO(tjbarron) Decide whether we should revise this and other
// handled syntax. This is used to allow queries like "term1,OR,term2"
// to succeed. It's not clear if we should allow this or require
// clients to ensure that OR operators are always surrounded by
// whitespace.
+ // Override the type if this is actually an OR operator.
type = OR_OPERATOR;
- } else if (!i18n_utils::IsAscii(term[0])) {
- type = NON_ASCII_ALPHABETIC_TERM;
- } else if (!std::isalnum(term[0])) {
- type = OTHER;
}
ICING_RETURN_IF_ERROR(ProcessTerm(&current_state, &current_term,
&current_term_type,
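
GetContentTermType centralizes the first-character classification that was previously duplicated (and inconsistently negated) across call sites: punctuation maps to OTHER, remaining ASCII to ASCII_ALPHANUMERIC_TERM, everything else to NON_ASCII_ALPHANUMERIC_TERM. A standalone sketch of that decision order; the real code uses i18n_utils::IsPunctuationAt, which also recognizes multi-byte punctuation such as "。", while this simplification handles ASCII punctuation only:

    #include <cassert>
    #include <cctype>
    #include <string_view>

    enum TermType { ASCII_ALPHANUMERIC_TERM, NON_ASCII_ALPHANUMERIC_TERM, OTHER };

    TermType Classify(std::string_view text, size_t pos) {
      unsigned char c = static_cast<unsigned char>(text[pos]);
      if (c < 0x80) {  // ASCII: check punctuation first, like the real ordering
        return std::ispunct(c) ? OTHER : ASCII_ALPHANUMERIC_TERM;
      }
      return NON_ASCII_ALPHANUMERIC_TERM;
    }

    int main() {
      assert(Classify("foo", 0) == ASCII_ALPHANUMERIC_TERM);
      assert(Classify("-foo", 0) == OTHER);  // OTHER tokens are skipped in queries
      assert(Classify("\xF0\x9F\x90\x9F", 0) == NON_ASCII_ALPHANUMERIC_TERM);  // 🐟
      return 0;
    }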
diff --git a/icing/tokenization/raw-query-tokenizer_test.cc b/icing/tokenization/raw-query-tokenizer_test.cc
index c6d981d..b1dcc73 100644
--- a/icing/tokenization/raw-query-tokenizer_test.cc
+++ b/icing/tokenization/raw-query-tokenizer_test.cc
@@ -70,6 +70,29 @@ TEST_F(RawQueryTokenizerTest, Simple) {
EqualsToken(Token::Type::REGULAR, "WORLD"))));
}
+TEST_F(RawQueryTokenizerTest, Emoji) {
+ language_segmenter_factory::SegmenterOptions options(ULOC_US);
+ ICING_ASSERT_OK_AND_ASSIGN(
+ auto language_segmenter,
+ language_segmenter_factory::Create(std::move(options)));
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<Tokenizer> raw_query_tokenizer,
+ tokenizer_factory::CreateQueryTokenizer(tokenizer_factory::RAW_QUERY,
+ language_segmenter.get()));
+
+ EXPECT_THAT(
+ raw_query_tokenizer->TokenizeAll("😊 Hello! Goodbye?"),
+ IsOkAndHolds(ElementsAre(EqualsToken(Token::Type::REGULAR, "😊"),
+ EqualsToken(Token::Type::REGULAR, "Hello"),
+ EqualsToken(Token::Type::REGULAR, "Goodbye"))));
+
+ EXPECT_THAT(
+ raw_query_tokenizer->TokenizeAll("Hello😊 ! Goodbye?"),
+ IsOkAndHolds(ElementsAre(EqualsToken(Token::Type::REGULAR, "Hello"),
+ EqualsToken(Token::Type::REGULAR, "😊"),
+ EqualsToken(Token::Type::REGULAR, "Goodbye"))));
+}
+
TEST_F(RawQueryTokenizerTest, Parentheses) {
language_segmenter_factory::SegmenterOptions options(ULOC_US);
ICING_ASSERT_OK_AND_ASSIGN(
@@ -80,26 +103,35 @@ TEST_F(RawQueryTokenizerTest, Parentheses) {
tokenizer_factory::CreateQueryTokenizer(tokenizer_factory::RAW_QUERY,
language_segmenter.get()));
- EXPECT_THAT(raw_query_tokenizer->TokenizeAll("()"),
- IsOkAndHolds(ElementsAre(
- EqualsToken(Token::Type::QUERY_LEFT_PARENTHESES, ""),
- EqualsToken(Token::Type::QUERY_RIGHT_PARENTHESES, ""))));
+ ICING_ASSERT_OK_AND_ASSIGN(std::vector<Token> query_tokens,
+ raw_query_tokenizer->TokenizeAll("()"));
+ EXPECT_THAT(
+ query_tokens,
+ ElementsAre(EqualsToken(Token::Type::QUERY_LEFT_PARENTHESES, ""),
+ EqualsToken(Token::Type::QUERY_RIGHT_PARENTHESES, "")));
- EXPECT_THAT(raw_query_tokenizer->TokenizeAll("( )"),
- IsOkAndHolds(ElementsAre(
- EqualsToken(Token::Type::QUERY_LEFT_PARENTHESES, ""),
- EqualsToken(Token::Type::QUERY_RIGHT_PARENTHESES, ""))));
+ ICING_ASSERT_OK_AND_ASSIGN(query_tokens,
+ raw_query_tokenizer->TokenizeAll("( )"));
+ EXPECT_THAT(
+ query_tokens,
+ ElementsAre(EqualsToken(Token::Type::QUERY_LEFT_PARENTHESES, ""),
+ EqualsToken(Token::Type::QUERY_RIGHT_PARENTHESES, "")));
- EXPECT_THAT(raw_query_tokenizer->TokenizeAll("(term1 term2)"),
- IsOkAndHolds(ElementsAre(
- EqualsToken(Token::Type::QUERY_LEFT_PARENTHESES, ""),
+ ICING_ASSERT_OK_AND_ASSIGN(query_tokens,
+ raw_query_tokenizer->TokenizeAll("(term1 term2)"));
+ EXPECT_THAT(
+ query_tokens,
+ ElementsAre(EqualsToken(Token::Type::QUERY_LEFT_PARENTHESES, ""),
EqualsToken(Token::Type::REGULAR, "term1"),
EqualsToken(Token::Type::REGULAR, "term2"),
- EqualsToken(Token::Type::QUERY_RIGHT_PARENTHESES, ""))));
+ EqualsToken(Token::Type::QUERY_RIGHT_PARENTHESES, "")));
- EXPECT_THAT(raw_query_tokenizer->TokenizeAll("((term1 term2) (term3 term4))"),
- IsOkAndHolds(ElementsAre(
- EqualsToken(Token::Type::QUERY_LEFT_PARENTHESES, ""),
+ ICING_ASSERT_OK_AND_ASSIGN(
+ query_tokens,
+ raw_query_tokenizer->TokenizeAll("((term1 term2) (term3 term4))"));
+ EXPECT_THAT(
+ query_tokens,
+ ElementsAre(EqualsToken(Token::Type::QUERY_LEFT_PARENTHESES, ""),
EqualsToken(Token::Type::QUERY_LEFT_PARENTHESES, ""),
EqualsToken(Token::Type::REGULAR, "term1"),
EqualsToken(Token::Type::REGULAR, "term2"),
@@ -108,21 +140,24 @@ TEST_F(RawQueryTokenizerTest, Parentheses) {
EqualsToken(Token::Type::REGULAR, "term3"),
EqualsToken(Token::Type::REGULAR, "term4"),
EqualsToken(Token::Type::QUERY_RIGHT_PARENTHESES, ""),
- EqualsToken(Token::Type::QUERY_RIGHT_PARENTHESES, ""))));
+ EqualsToken(Token::Type::QUERY_RIGHT_PARENTHESES, "")));
- EXPECT_THAT(raw_query_tokenizer->TokenizeAll("term1(term2)"),
- IsOkAndHolds(ElementsAre(
- EqualsToken(Token::Type::REGULAR, "term1"),
+ ICING_ASSERT_OK_AND_ASSIGN(query_tokens,
+ raw_query_tokenizer->TokenizeAll("term1(term2)"));
+ EXPECT_THAT(
+ query_tokens,
+ ElementsAre(EqualsToken(Token::Type::REGULAR, "term1"),
EqualsToken(Token::Type::QUERY_LEFT_PARENTHESES, ""),
EqualsToken(Token::Type::REGULAR, "term2"),
- EqualsToken(Token::Type::QUERY_RIGHT_PARENTHESES, ""))));
-
- EXPECT_THAT(raw_query_tokenizer->TokenizeAll("(term1)term2"),
- IsOkAndHolds(ElementsAre(
- EqualsToken(Token::Type::QUERY_LEFT_PARENTHESES, ""),
- EqualsToken(Token::Type::REGULAR, "term1"),
- EqualsToken(Token::Type::QUERY_RIGHT_PARENTHESES, ""),
- EqualsToken(Token::Type::REGULAR, "term2"))));
+ EqualsToken(Token::Type::QUERY_RIGHT_PARENTHESES, "")));
+
+ ICING_ASSERT_OK_AND_ASSIGN(query_tokens,
+ raw_query_tokenizer->TokenizeAll("(term1)term2"));
+ EXPECT_THAT(query_tokens,
+ ElementsAre(EqualsToken(Token::Type::QUERY_LEFT_PARENTHESES, ""),
+ EqualsToken(Token::Type::REGULAR, "term1"),
+ EqualsToken(Token::Type::QUERY_RIGHT_PARENTHESES, ""),
+ EqualsToken(Token::Type::REGULAR, "term2")));
EXPECT_THAT(raw_query_tokenizer->TokenizeAll("(term1)(term2)"),
IsOkAndHolds(ElementsAre(
diff --git a/icing/tokenization/reverse_jni/reverse-jni-language-segmenter.cc b/icing/tokenization/reverse_jni/reverse-jni-language-segmenter.cc
index cb474c6..e5de6e6 100644
--- a/icing/tokenization/reverse_jni/reverse-jni-language-segmenter.cc
+++ b/icing/tokenization/reverse_jni/reverse-jni-language-segmenter.cc
@@ -43,46 +43,38 @@ class ReverseJniLanguageSegmenterIterator : public LanguageSegmenter::Iterator {
// Advances to the next term. Returns false if it has reached the end.
bool Advance() override {
- while (true) {
- // Prerequisite check
- if (IsDone()) {
- return false;
- }
-
- if (term_end_exclusive_.utf16_index() == 0) {
- int first = break_iterator_->First();
- if (!term_start_.MoveToUtf16(first)) {
- // First is guaranteed to succeed and return a position within bounds.
- // So the only possible failure could be an invalid sequence. Mark as
- // DONE and return.
- MarkAsDone();
- return false;
- }
- } else {
- term_start_ = term_end_exclusive_;
- }
+ // Prerequisite check
+ if (IsDone()) {
+ return false;
+ }
- int next_utf16_index_exclusive = break_iterator_->Next();
- // Reached the end
- if (next_utf16_index_exclusive == ReverseJniBreakIterator::kDone) {
- MarkAsDone();
- return false;
- }
- if (!term_end_exclusive_.MoveToUtf16(next_utf16_index_exclusive)) {
- // next_utf16_index_exclusive is guaranteed to be within bounds thanks to
- // the check for kDone above. So the only possible failure could be an
- // invalid sequence. Mark as DONE and return.
+ if (term_end_exclusive_.utf16_index() == 0) {
+ int first = break_iterator_->First();
+ if (!term_start_.MoveToUtf16(first)) {
+ // First is guaranteed to succeed and return a position within bounds.
+ // So the only possible failure could be an invalid sequence. Mark as
+ // DONE and return.
MarkAsDone();
return false;
}
+ } else {
+ term_start_ = term_end_exclusive_;
+ }
- // Check if the current term is valid. We consider any term valid if its
- // first character is valid. If it's not valid, then we need to advance to
- // the next term.
- if (IsValidTerm()) {
- return true;
- }
+ int next_utf16_index_exclusive = break_iterator_->Next();
+ // Reached the end
+ if (next_utf16_index_exclusive == ReverseJniBreakIterator::kDone) {
+ MarkAsDone();
+ return false;
}
+ if (!term_end_exclusive_.MoveToUtf16(next_utf16_index_exclusive)) {
+ // next_utf16_index_exclusive is guaranteed to be within bounds thanks to
+ // the check for kDone above. So the only possible failure could be an
+ // invalid sequence. Mark as DONE and return.
+ MarkAsDone();
+ return false;
+ }
+ return true;
}
// Returns the current term. It can be called only when Advance() returns
@@ -245,7 +237,7 @@ class ReverseJniLanguageSegmenterIterator : public LanguageSegmenter::Iterator {
// 4. The start and end indices point to a segment, but we need to ensure
// that this segment is 1) valid and 2) ends before offset. Otherwise, we'll
// need a segment prior to this one.
- if (term_end_exclusive_.utf32_index() > offset || !IsValidTerm()) {
+ if (term_end_exclusive_.utf32_index() > offset) {
return ResetToTermEndingBeforeUtf32(term_start_.utf32_index());
}
return term_start_.utf32_index();
@@ -285,24 +277,6 @@ class ReverseJniLanguageSegmenterIterator : public LanguageSegmenter::Iterator {
return term_end_exclusive_.utf16_index() == ReverseJniBreakIterator::kDone;
}
- bool IsValidTerm() const {
- // Rule 1: all ASCII terms will be returned.
- // We know it's a ASCII term by checking the first char.
- if (i18n_utils::IsAscii(text_[term_start_.utf8_index()])) {
- return true;
- }
-
- UChar32 uchar32 = i18n_utils::GetUChar32At(text_.data(), text_.length(),
- term_start_.utf8_index());
- // Rule 2: for non-ASCII terms, only the alphanumeric terms are returned.
- // We know it's an alphanumeric term by checking the first unicode
- // character.
- if (i18n_utils::IsAlphaNumeric(uchar32)) {
- return true;
- }
- return false;
- }
-
// All of ReverseJniBreakIterator's functions return UTF-16 boundaries. So
// this class needs to maintain state to convert between UTF-16 and UTF-8.
std::unique_ptr<ReverseJniBreakIterator> break_iterator_;
diff --git a/icing/tokenization/reverse_jni/reverse-jni-language-segmenter_test.cc b/icing/tokenization/reverse_jni/reverse-jni-language-segmenter_test.cc
index 45d6475..277ece6 100644
--- a/icing/tokenization/reverse_jni/reverse-jni-language-segmenter_test.cc
+++ b/icing/tokenization/reverse_jni/reverse-jni-language-segmenter_test.cc
@@ -185,7 +185,7 @@ TEST_P(ReverseJniLanguageSegmenterTest, Non_ASCII_Non_Alphabetic) {
- // Full-width (non-ASCII) punctuation marks and special characters are left
- // out.
+ // Full-width (non-ASCII) punctuation marks and special characters are now
+ // kept as segments.
EXPECT_THAT(language_segmenter->GetAllTerms("γ€‚οΌŸΒ·Hello!×"),
- IsOkAndHolds(ElementsAre("Hello")));
+ IsOkAndHolds(ElementsAre("。", "?", "Β·", "Hello", "!", "Γ—")));
}
TEST_P(ReverseJniLanguageSegmenterTest, Acronym) {
@@ -246,9 +246,9 @@ TEST_P(ReverseJniLanguageSegmenterTest, WordConnector) {
// Connectors don't connect if one side is an invalid term (?)
EXPECT_THAT(language_segmenter->GetAllTerms("bar:baz:?"),
- IsOkAndHolds(ElementsAre("bar:baz", ":")));
+ IsOkAndHolds(ElementsAre("bar:baz", ":", "?")));
EXPECT_THAT(language_segmenter->GetAllTerms("?:bar:baz"),
- IsOkAndHolds(ElementsAre(":", "bar:baz")));
+ IsOkAndHolds(ElementsAre("?", ":", "bar:baz")));
EXPECT_THAT(language_segmenter->GetAllTerms("3:14"),
IsOkAndHolds(ElementsAre("3", ":", "14")));
EXPECT_THAT(language_segmenter->GetAllTerms("私:は"),
@@ -413,15 +413,17 @@ TEST_P(ReverseJniLanguageSegmenterTest, CJKT) {
// have whitespaces as word delimiter.
// Chinese
- EXPECT_THAT(language_segmenter->GetAllTerms("ζˆ‘ζ―ε€©θ΅°θ·―εŽ»δΈŠη­γ€‚"),
- IsOkAndHolds(ElementsAre("ζˆ‘", "每倩", "θ΅°θ·―", "去", "上班")));
+ EXPECT_THAT(
+ language_segmenter->GetAllTerms("ζˆ‘ζ―ε€©θ΅°θ·―εŽ»δΈŠη­γ€‚"),
+ IsOkAndHolds(ElementsAre("ζˆ‘", "每倩", "θ΅°θ·―", "去", "上班", "。")));
// Japanese
EXPECT_THAT(language_segmenter->GetAllTerms("私は毎ζ—₯仕事に歩いています。"),
IsOkAndHolds(ElementsAre("私", "は", "毎ζ—₯", "δ»•δΊ‹", "に", "ζ­©",
- "い", "てい", "ます")));
+ "い", "てい", "ます", "。")));
// Khmer
EXPECT_THAT(language_segmenter->GetAllTerms("αž‰αž»αŸ†αžŠαžΎαžšαž‘αŸ…αž’αŸ’αžœαžΎαž€αžΆαžšαžšαžΆαž›αŸ‹αžαŸ’αž„αŸƒαŸ”"),
- IsOkAndHolds(ElementsAre("αž‰αž»αŸ†", "αžŠαžΎαžšαž‘αŸ…", "αž’αŸ’αžœαžΎαž€αžΆαžš", "αžšαžΆαž›αŸ‹αžαŸ’αž„αŸƒ")));
+ IsOkAndHolds(ElementsAre("αž‰αž»αŸ†", "αžŠαžΎαžšαž‘αŸ…", "αž’αŸ’αžœαžΎαž€αžΆαžš", "αžšαžΆαž›αŸ‹αžαŸ’αž„αŸƒ", "αŸ”")));
+
// Thai
EXPECT_THAT(
language_segmenter->GetAllTerms("ฉันเดินไปทำงานทุกวัน"),
@@ -852,16 +854,19 @@ TEST_P(ReverseJniLanguageSegmenterTest, ChineseResetToTermAfterUtf32) {
ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<LanguageSegmenter::Iterator> itr,
language_segmenter->Segment(kChinese));
// String: "ζˆ‘ζ―ε€©θ΅°θ·―εŽ»δΈŠη­γ€‚"
- // ^ ^ ^ ^^
- // UTF-8 idx: 0 3 9 15 18
- // UTF-32 idx: 0 1  3  5 6
+ // ^ ^ ^ ^^ ^
+ // UTF-8 idx: 0 3 9 15 18 24
+ // UTF-32 idx: 0 1  3  5 6  8
EXPECT_THAT(itr->ResetToTermStartingAfterUtf32(0), IsOkAndHolds(Eq(1)));
EXPECT_THAT(itr->GetTerm(), Eq("每倩"));
EXPECT_THAT(itr->ResetToTermStartingAfterUtf32(2), IsOkAndHolds(Eq(3)));
EXPECT_THAT(itr->GetTerm(), Eq("θ΅°θ·―"));
- EXPECT_THAT(itr->ResetToTermStartingAfterUtf32(7),
+ EXPECT_THAT(itr->ResetToTermStartingAfterUtf32(7), IsOkAndHolds(Eq(8)));
+ EXPECT_THAT(itr->GetTerm(), Eq("。"));
+
+ EXPECT_THAT(itr->ResetToTermStartingAfterUtf32(8),
StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
EXPECT_THAT(itr->GetTerm(), IsEmpty());
}
@@ -876,18 +881,21 @@ TEST_P(ReverseJniLanguageSegmenterTest, JapaneseResetToTermAfterUtf32) {
ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<LanguageSegmenter::Iterator> itr,
language_segmenter->Segment(kJapanese));
// String: "私は毎ζ—₯仕事に歩いています。"
- // ^ ^ ^ ^ ^ ^ ^ ^ ^
- // UTF-8 idx: 0 3 6 12 18 21 24 27 33
- // UTF-32 idx: 0 1 2 4 6 7 8 9 11
+ // ^ ^ ^ ^ ^ ^ ^ ^ ^ ^
+ // UTF-8 idx: 0 3 6 12 18 21 24 27 33 39
+ // UTF-32 idx: 0 1 2 4 6 7 8 9 11 13
EXPECT_THAT(itr->ResetToTermStartingAfterUtf32(0), IsOkAndHolds(Eq(1)));
EXPECT_THAT(itr->GetTerm(), Eq("は"));
- EXPECT_THAT(itr->ResetToTermStartingAfterUtf32(11),
+ EXPECT_THAT(itr->ResetToTermStartingAfterUtf32(13),
StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
EXPECT_THAT(itr->GetTerm(), IsEmpty());
EXPECT_THAT(itr->ResetToTermStartingAfterUtf32(3), IsOkAndHolds(Eq(4)));
EXPECT_THAT(itr->GetTerm(), Eq("δ»•δΊ‹"));
+
+ EXPECT_THAT(itr->ResetToTermStartingAfterUtf32(12), IsOkAndHolds(Eq(13)));
+ EXPECT_THAT(itr->GetTerm(), Eq("。"));
}
TEST_P(ReverseJniLanguageSegmenterTest, KhmerResetToTermAfterUtf32) {
@@ -899,13 +907,16 @@ TEST_P(ReverseJniLanguageSegmenterTest, KhmerResetToTermAfterUtf32) {
ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<LanguageSegmenter::Iterator> itr,
language_segmenter->Segment(kKhmer));
// String: "αž‰αž»αŸ†αžŠαžΎαžšαž‘αŸ…αž’αŸ’αžœαžΎαž€αžΆαžšαžšαžΆαž›αŸ‹αžαŸ’αž„αŸƒαŸ”"
- // ^ ^ ^ ^
- // UTF-8 idx: 0 9 24 45
- // UTF-32 idx: 0 3 8 15
+ // ^ ^ ^ ^ ^
+ // UTF-8 idx: 0 9 24 45 69
+ // UTF-32 idx: 0 3 8 15 23
EXPECT_THAT(itr->ResetToTermStartingAfterUtf32(0), IsOkAndHolds(Eq(3)));
EXPECT_THAT(itr->GetTerm(), Eq("αžŠαžΎαžšαž‘αŸ…"));
- EXPECT_THAT(itr->ResetToTermStartingAfterUtf32(15),
+ EXPECT_THAT(itr->ResetToTermStartingAfterUtf32(15), IsOkAndHolds(Eq(23)));
+ EXPECT_THAT(itr->GetTerm(), Eq("αŸ”"));
+
+ EXPECT_THAT(itr->ResetToTermStartingAfterUtf32(23),
StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
EXPECT_THAT(itr->GetTerm(), IsEmpty());
diff --git a/icing/tokenization/tokenizer.h b/icing/tokenization/tokenizer.h
index 2bc18cc..24f8269 100644
--- a/icing/tokenization/tokenizer.h
+++ b/icing/tokenization/tokenizer.h
@@ -40,15 +40,6 @@ class Tokenizer {
public:
virtual ~Tokenizer() = default;
- enum Type {
- // Index tokenizers
- PLAIN, // Used to tokenize plain text input
- VERBATIM, // Used to tokenize the input text in verbatim
-
- // Query tokenizers
- RAW_QUERY, // Used to tokenize raw queries
- };
-
// An iterator helping to get tokens.
// Example usage:
//
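
Finally, removing Tokenizer::Type leaves the factory as the single place where tokenizer selection happens, split by use case. A sketch of post-change construction, mirroring the calls in combined-tokenizer_test.cc above (assumes a LanguageSegmenter* named segmenter; CreateQueryTokenizer appears unqualified there, as in that test):

    ICING_ASSERT_OK_AND_ASSIGN(
        std::unique_ptr<Tokenizer> indexing_tokenizer,
        tokenizer_factory::CreateIndexingTokenizer(
            StringIndexingConfig::TokenizerType::PLAIN, segmenter));

    ICING_ASSERT_OK_AND_ASSIGN(
        std::unique_ptr<Tokenizer> query_tokenizer,
        CreateQueryTokenizer(tokenizer_factory::QueryTokenizerType::RAW_QUERY,
                             segmenter));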