author    Android Build Coastguard Worker <android-build-coastguard-worker@google.com>  2022-05-20 00:34:33 +0000
committer Android Build Coastguard Worker <android-build-coastguard-worker@google.com>  2022-05-20 00:34:33 +0000
commit    d31dbff26dca4d83e2ed4378bab786aedff3afb9 (patch)
tree      8065624dfae73db2a4289699d5a0253b0797c7ea
parent    c4647e82f4a65ae9de994b7db2e1ca8052b88b62 (diff)
parent    bdb68ac2f660c1c8fbfea44664d4a19f829b5694 (diff)
download  icing-d31dbff26dca4d83e2ed4378bab786aedff3afb9.tar.gz
Snap for 8618812 from bdb68ac2f660c1c8fbfea44664d4a19f829b5694 to mainline-tzdata4-release
Change-Id: I6f3305e86cbf80152e703cdc01fc275b0f9dda02
-rw-r--r--  icing/file/file-backed-vector_test.cc                                  56
-rw-r--r--  icing/file/memory-mapped-file.cc                                       18
-rw-r--r--  icing/icing-search-engine_test.cc                                     135
-rw-r--r--  icing/schema/schema-store_test.cc                                     176
-rw-r--r--  icing/schema/schema-util.cc                                            64
-rw-r--r--  icing/tokenization/combined-tokenizer_test.cc                         232
-rw-r--r--  icing/tokenization/icu/icu-language-segmenter.cc                       59
-rw-r--r--  icing/tokenization/icu/icu-language-segmenter_test.cc                  48
-rw-r--r--  icing/tokenization/raw-query-tokenizer.cc                              84
-rw-r--r--  icing/tokenization/raw-query-tokenizer_test.cc                         89
-rw-r--r--  icing/tokenization/reverse_jni/reverse-jni-language-segmenter.cc       80
-rw-r--r--  icing/tokenization/reverse_jni/reverse-jni-language-segmenter_test.cc  49
-rw-r--r--  icing/tokenization/tokenizer.h                                          9
13 files changed, 856 insertions, 243 deletions
diff --git a/icing/file/file-backed-vector_test.cc b/icing/file/file-backed-vector_test.cc
index 7c02af9..54f9ef5 100644
--- a/icing/file/file-backed-vector_test.cc
+++ b/icing/file/file-backed-vector_test.cc
@@ -14,6 +14,8 @@
#include "icing/file/file-backed-vector.h"
+#include <unistd.h>
+
#include <algorithm>
#include <cerrno>
#include <cstdint>
@@ -21,18 +23,21 @@
#include <string_view>
#include <vector>
-#include "gmock/gmock.h"
-#include "gtest/gtest.h"
-#include "icing/file/filesystem.h"
-#include "icing/file/memory-mapped-file.h"
-#include "icing/testing/common-matchers.h"
-#include "icing/testing/tmp-directory.h"
-#include "icing/util/crc32.h"
-#include "icing/util/logging.h"
+#include "knowledge/cerebra/sense/text_classifier/lib3/utils/base/status.h"
+#include "testing/base/public/gmock.h"
+#include "testing/base/public/gunit.h"
+#include "third_party/icing/file/filesystem.h"
+#include "third_party/icing/file/memory-mapped-file.h"
+#include "third_party/icing/file/mock-filesystem.h"
+#include "third_party/icing/testing/common-matchers.h"
+#include "third_party/icing/testing/tmp-directory.h"
+#include "third_party/icing/util/crc32.h"
+#include "third_party/icing/util/logging.h"
using ::testing::Eq;
using ::testing::IsTrue;
using ::testing::Pointee;
+using ::testing::Return;
namespace icing {
namespace lib {
@@ -73,6 +78,8 @@ class FileBackedVectorTest : public testing::Test {
return std::string_view(vector->array() + idx, expected_len);
}
+ const Filesystem& filesystem() const { return filesystem_; }
+
Filesystem filesystem_;
std::string file_path_;
int fd_;
@@ -637,6 +644,39 @@ TEST_F(FileBackedVectorTest, InitNormalSucceeds) {
}
}
+TEST_F(FileBackedVectorTest, RemapFailureStillValidInstance) {
+ auto mock_filesystem = std::make_unique<MockFilesystem>();
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<FileBackedVector<int>> vector,
+ FileBackedVector<int>::Create(
+ *mock_filesystem, file_path_,
+ MemoryMappedFile::Strategy::READ_WRITE_AUTO_SYNC));
+
+  // 1. Write data up to just before the first block resize. Running the test
+  // locally has determined that the first resize happens when entry
+  // kResizingIndex (16378) is written.
+ constexpr int kResizingIndex = 16378;
+ for (int i = 0; i < kResizingIndex; ++i) {
+ ICING_ASSERT_OK(vector->Set(i, 7));
+ }
+
+ // 2. The next Set call should cause a resize and a remap. Make that remap
+ // fail.
+ int num_calls = 0;
+  auto open_lambda = [this, &num_calls](const char* file_name) {
+ if (++num_calls == 2) {
+ return -1;
+ }
+ return this->filesystem().OpenForWrite(file_name);
+ };
+ ON_CALL(*mock_filesystem, OpenForWrite(_)).WillByDefault(open_lambda);
+ EXPECT_THAT(vector->Set(kResizingIndex, 7),
+ StatusIs(libtextclassifier3::StatusCode::INTERNAL));
+
+  // 3. We should still be able to call Set successfully for earlier regions.
+ ICING_EXPECT_OK(vector->Set(kResizingIndex / 2, 9));
+ EXPECT_THAT(vector->Get(kResizingIndex / 2), IsOkAndHolds(Pointee(Eq(9))));
+}
+
} // namespace
} // namespace lib
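
Note: the failure injection above hinges on a gmock default action that fails only the second OpenForWrite call, which is the open backing the resize's remap. A condensed sketch of that pattern, assuming the Filesystem/MockFilesystem interfaces used in the test:

    // Delegate to a real Filesystem, but fail exactly the second open.
    int num_calls = 0;
    ON_CALL(*mock_filesystem, OpenForWrite(testing::_))
        .WillByDefault([&num_calls, &real_filesystem](const char* file_name) {
          if (++num_calls == 2) {
            return -1;  // Simulate an open failure on the remap path.
          }
          return real_filesystem.OpenForWrite(file_name);
        });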
diff --git a/icing/file/memory-mapped-file.cc b/icing/file/memory-mapped-file.cc
index bda01f2..9ff3adb 100644
--- a/icing/file/memory-mapped-file.cc
+++ b/icing/file/memory-mapped-file.cc
@@ -70,10 +70,10 @@ void MemoryMappedFile::MemoryMappedFile::Unmap() {
libtextclassifier3::Status MemoryMappedFile::Remap(size_t file_offset,
size_t mmap_size) {
- // First unmap any previously mmapped region.
- Unmap();
-
if (mmap_size == 0) {
+ // First unmap any previously mmapped region.
+ Unmap();
+
// Nothing more to do.
return libtextclassifier3::Status::OK;
}
@@ -118,15 +118,19 @@ libtextclassifier3::Status MemoryMappedFile::Remap(size_t file_offset,
"Unable to open file meant to be mmapped: ", file_path_));
}
- mmap_result_ = mmap(nullptr, adjusted_mmap_size, protection_flags, mmap_flags,
- fd.get(), aligned_offset);
+ void* mmap_result = mmap(nullptr, adjusted_mmap_size, protection_flags,
+ mmap_flags, fd.get(), aligned_offset);
- if (mmap_result_ == MAP_FAILED) {
- mmap_result_ = nullptr;
+ if (mmap_result == MAP_FAILED) {
return absl_ports::InternalError(absl_ports::StrCat(
"Failed to mmap region due to error: ", strerror(errno)));
}
+ // Now we know that we have successfully created a new mapping. We can free
+ // the old one and switch to the new one.
+ Unmap();
+
+ mmap_result_ = mmap_result;
file_offset_ = file_offset;
region_ = reinterpret_cast<char*>(mmap_result_) + alignment_adjustment;
region_size_ = mmap_size;
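
Note: the behavioral fix here is ordering. The new region is mmap()ed first and the old one is only munmap()ed on success, so a failed Remap() leaves the previous mapping (and any structure built on it, like the FileBackedVector above) usable. A minimal sketch of the pattern, with simplified names rather than the exact icing members:

    // Map the new region before touching the old one.
    void* new_map = mmap(nullptr, new_size, prot, flags, fd, aligned_offset);
    if (new_map == MAP_FAILED) {
      // The previous mapping is untouched; callers can keep using it.
      return absl_ports::InternalError(strerror(errno));
    }
    munmap(old_map, old_size);  // Now safe to release the old region.
    old_map = new_map;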
diff --git a/icing/icing-search-engine_test.cc b/icing/icing-search-engine_test.cc
index 7ed8885..5244f4c 100644
--- a/icing/icing-search-engine_test.cc
+++ b/icing/icing-search-engine_test.cc
@@ -8082,6 +8082,141 @@ TEST_F(IcingSearchEngineTest, CJKSnippetTest) {
EXPECT_THAT(match_proto.exact_match_utf16_length(), Eq(2));
}
+TEST_F(IcingSearchEngineTest, InvalidToEmptyQueryTest) {
+ IcingSearchEngine icing(GetDefaultIcingOptions(), GetTestJniCache());
+ ASSERT_THAT(icing.Initialize().status(), ProtoIsOk());
+ ASSERT_THAT(icing.SetSchema(CreateMessageSchema()).status(), ProtoIsOk());
+
+ // String: "Luca Brasi sleeps with the 🐟🐟🐟."
+ // ^ ^ ^ ^ ^ ^ ^ ^ ^
+  // UTF8 idx:   0    5     11     18   23  27  31  35  39
+  // UTF16 idx:  0    5     11     18   23  27  29  31  33
+ // Breaks into segments: "Luca", "Brasi", "sleeps", "with", "the", "🐟", "🐟"
+ // and "🐟".
+ constexpr std::string_view kSicilianMessage =
+ "Luca Brasi sleeps with the 🐟🐟🐟.";
+ DocumentProto document = DocumentBuilder()
+ .SetKey("namespace", "uri1")
+ .SetSchema("Message")
+ .AddStringProperty("body", kSicilianMessage)
+ .Build();
+ ASSERT_THAT(icing.Put(document).status(), ProtoIsOk());
+ DocumentProto document_two =
+ DocumentBuilder()
+ .SetKey("namespace", "uri2")
+ .SetSchema("Message")
+ .AddStringProperty("body", "Some other content.")
+ .Build();
+ ASSERT_THAT(icing.Put(document_two).status(), ProtoIsOk());
+
+  // Set up the search specs. No snippeting is requested in this test.
+ SearchSpecProto search_spec;
+ search_spec.set_query("?");
+ search_spec.set_term_match_type(MATCH_PREFIX);
+ ScoringSpecProto scoring_spec;
+ ResultSpecProto result_spec;
+
+  // Search and make sure that both documents are returned: each of these
+  // queries is tokenized away, leaving an empty (match-all) query.
+ SearchResultProto search_results =
+ icing.Search(search_spec, scoring_spec, result_spec);
+ EXPECT_THAT(search_results.status(), ProtoIsOk());
+ EXPECT_THAT(search_results.results(), SizeIs(2));
+
+ search_spec.set_query("。");
+ search_results = icing.Search(search_spec, scoring_spec, result_spec);
+ EXPECT_THAT(search_results.status(), ProtoIsOk());
+ EXPECT_THAT(search_results.results(), SizeIs(2));
+
+ search_spec.set_query("-");
+ search_results = icing.Search(search_spec, scoring_spec, result_spec);
+ EXPECT_THAT(search_results.status(), ProtoIsOk());
+ EXPECT_THAT(search_results.results(), SizeIs(2));
+
+ search_spec.set_query(":");
+ search_results = icing.Search(search_spec, scoring_spec, result_spec);
+ EXPECT_THAT(search_results.status(), ProtoIsOk());
+ EXPECT_THAT(search_results.results(), SizeIs(2));
+
+ search_spec.set_query("OR");
+ search_results = icing.Search(search_spec, scoring_spec, result_spec);
+ EXPECT_THAT(search_results.status(), ProtoIsOk());
+ EXPECT_THAT(search_results.results(), SizeIs(2));
+
+ search_spec.set_query(" ");
+ search_results = icing.Search(search_spec, scoring_spec, result_spec);
+ EXPECT_THAT(search_results.status(), ProtoIsOk());
+ EXPECT_THAT(search_results.results(), SizeIs(2));
+}
+
+TEST_F(IcingSearchEngineTest, EmojiSnippetTest) {
+ IcingSearchEngine icing(GetDefaultIcingOptions(), GetTestJniCache());
+ ASSERT_THAT(icing.Initialize().status(), ProtoIsOk());
+ ASSERT_THAT(icing.SetSchema(CreateMessageSchema()).status(), ProtoIsOk());
+
+ // String: "Luca Brasi sleeps with the 🐟🐟🐟."
+ // ^ ^ ^ ^ ^ ^ ^ ^ ^
+  // UTF8 idx:   0    5     11     18   23  27  31  35  39
+  // UTF16 idx:  0    5     11     18   23  27  29  31  33
+ // Breaks into segments: "Luca", "Brasi", "sleeps", "with", "the", "🐟", "🐟"
+ // and "🐟".
+ constexpr std::string_view kSicilianMessage =
+ "Luca Brasi sleeps with the 🐟🐟🐟.";
+ DocumentProto document = DocumentBuilder()
+ .SetKey("namespace", "uri1")
+ .SetSchema("Message")
+ .AddStringProperty("body", kSicilianMessage)
+ .Build();
+ ASSERT_THAT(icing.Put(document).status(), ProtoIsOk());
+ DocumentProto document_two =
+ DocumentBuilder()
+ .SetKey("namespace", "uri2")
+ .SetSchema("Message")
+ .AddStringProperty("body", "Some other content.")
+ .Build();
+ ASSERT_THAT(icing.Put(document_two).status(), ProtoIsOk());
+
+ // Search and request snippet matching but no windowing.
+ SearchSpecProto search_spec;
+ search_spec.set_query("🐟");
+ search_spec.set_term_match_type(MATCH_PREFIX);
+
+ ResultSpecProto result_spec;
+ result_spec.mutable_snippet_spec()->set_num_to_snippet(1);
+ result_spec.mutable_snippet_spec()->set_num_matches_per_property(1);
+
+ // Search and make sure that we got a single successful result
+ SearchResultProto search_results = icing.Search(
+ search_spec, ScoringSpecProto::default_instance(), result_spec);
+ ASSERT_THAT(search_results.status(), ProtoIsOk());
+ ASSERT_THAT(search_results.results(), SizeIs(1));
+ const SearchResultProto::ResultProto* result = &search_results.results(0);
+ EXPECT_THAT(result->document().uri(), Eq("uri1"));
+
+ // Ensure that one and only one property was matched and it was "body"
+ ASSERT_THAT(result->snippet().entries(), SizeIs(1));
+ const SnippetProto::EntryProto* entry = &result->snippet().entries(0);
+ EXPECT_THAT(entry->property_name(), Eq("body"));
+
+  // Get the content for "body" and see what the match is.
+ std::string_view content = GetString(&result->document(), "body");
+ ASSERT_THAT(content, Eq(kSicilianMessage));
+
+  // Ensure that there is one and only one match within "body"
+ ASSERT_THAT(entry->snippet_matches(), SizeIs(1));
+ const SnippetMatchProto& match_proto = entry->snippet_matches(0);
+
+ EXPECT_THAT(match_proto.exact_match_byte_position(), Eq(27));
+ EXPECT_THAT(match_proto.exact_match_byte_length(), Eq(4));
+ std::string_view match =
+ content.substr(match_proto.exact_match_byte_position(),
+ match_proto.exact_match_byte_length());
+ ASSERT_THAT(match, Eq("🐟"));
+
+ // Ensure that the utf-16 values are also as expected
+ EXPECT_THAT(match_proto.exact_match_utf16_position(), Eq(27));
+ EXPECT_THAT(match_proto.exact_match_utf16_length(), Eq(2));
+}
+
TEST_F(IcingSearchEngineTest, PutDocumentIndexFailureDeletion) {
IcingSearchEngine icing(GetDefaultIcingOptions(), GetTestJniCache());
ASSERT_THAT(icing.Initialize().status(), ProtoIsOk());
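
Note: the offsets asserted in EmojiSnippetTest follow from the encodings: 🐟 (U+1F41F) occupies 4 UTF-8 bytes but 2 UTF-16 code units (a surrogate pair), and "Luca Brasi sleeps with the " is 27 ASCII bytes, so UTF-8 and UTF-16 offsets agree up to the first fish. A standalone check of those assumptions:

    #include <cassert>
    #include <string>
    #include <string_view>

    int main() {
      std::string_view fish = "🐟";  // U+1F41F
      assert(fish.size() == 4);      // four UTF-8 bytes
      std::u16string fish16 = u"🐟";
      assert(fish16.size() == 2);    // one surrogate pair
      assert(std::string_view("Luca Brasi sleeps with the ").size() == 27);
      return 0;
    }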
diff --git a/icing/schema/schema-store_test.cc b/icing/schema/schema-store_test.cc
index 113084e..541918f 100644
--- a/icing/schema/schema-store_test.cc
+++ b/icing/schema/schema-store_test.cc
@@ -57,6 +57,7 @@ constexpr StringIndexingConfig::TokenizerType::Code TOKENIZER_PLAIN =
StringIndexingConfig::TokenizerType::PLAIN;
constexpr TermMatchType::Code MATCH_EXACT = TermMatchType::EXACT_ONLY;
+constexpr TermMatchType::Code MATCH_PREFIX = TermMatchType::PREFIX;
constexpr PropertyConfigProto::DataType::Code TYPE_STRING =
PropertyConfigProto::DataType::STRING;
@@ -678,6 +679,181 @@ TEST_F(SchemaStoreTest, SetSchemaWithIncompatibleTypesOk) {
EXPECT_THAT(*actual_schema, EqualsProto(schema));
}
+TEST_F(SchemaStoreTest, SetSchemaWithIncompatibleNestedTypesOk) {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<SchemaStore> schema_store,
+ SchemaStore::Create(&filesystem_, test_dir_, &fake_clock_));
+
+ // 1. Create a ContactPoint type with a repeated property and set that schema
+ SchemaTypeConfigBuilder contact_point_repeated_label =
+ SchemaTypeConfigBuilder()
+ .SetType("ContactPoint")
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("label")
+ .SetDataTypeString(MATCH_PREFIX, TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_REPEATED));
+ SchemaProto old_schema =
+ SchemaBuilder().AddType(contact_point_repeated_label).Build();
+ ICING_EXPECT_OK(schema_store->SetSchema(old_schema));
+ ICING_ASSERT_OK_AND_ASSIGN(SchemaTypeId old_contact_point_type_id,
+ schema_store->GetSchemaTypeId("ContactPoint"));
+
+ // 2. Create a type that references the ContactPoint type and make a backwards
+ // incompatible change to ContactPoint
+ SchemaTypeConfigBuilder contact_point_optional_label =
+ SchemaTypeConfigBuilder()
+ .SetType("ContactPoint")
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("label")
+ .SetDataTypeString(MATCH_PREFIX, TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_OPTIONAL));
+ SchemaTypeConfigBuilder person =
+ SchemaTypeConfigBuilder().SetType("Person").AddProperty(
+ PropertyConfigBuilder()
+ .SetName("contactPoints")
+ .SetDataTypeDocument("ContactPoint",
+ /*index_nested_properties=*/true)
+ .SetCardinality(CARDINALITY_REPEATED));
+ SchemaProto new_schema = SchemaBuilder()
+ .AddType(contact_point_optional_label)
+ .AddType(person)
+ .Build();
+
+ // 3. SetSchema should fail with ignore_errors_and_delete_documents=false and
+ // the old schema should remain
+ SchemaStore::SetSchemaResult expected_result;
+ expected_result.success = false;
+ expected_result.schema_types_incompatible_by_name.insert("ContactPoint");
+ expected_result.schema_types_incompatible_by_id.insert(
+ old_contact_point_type_id);
+ expected_result.schema_types_new_by_name.insert("Person");
+ EXPECT_THAT(
+ schema_store->SetSchema(new_schema,
+ /*ignore_errors_and_delete_documents=*/false),
+ IsOkAndHolds(EqualsSetSchemaResult(expected_result)));
+ ICING_ASSERT_OK_AND_ASSIGN(const SchemaProto* actual_schema,
+ schema_store->GetSchema());
+ EXPECT_THAT(*actual_schema, EqualsProto(old_schema));
+
+ // 4. SetSchema should succeed with ignore_errors_and_delete_documents=true
+ // and the new schema should be set
+ expected_result.success = true;
+ EXPECT_THAT(
+ schema_store->SetSchema(new_schema,
+ /*ignore_errors_and_delete_documents=*/true),
+ IsOkAndHolds(EqualsSetSchemaResult(expected_result)));
+ ICING_ASSERT_OK_AND_ASSIGN(actual_schema, schema_store->GetSchema());
+ EXPECT_THAT(*actual_schema, EqualsProto(new_schema));
+}
+
+TEST_F(SchemaStoreTest, SetSchemaWithIndexIncompatibleNestedTypesOk) {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<SchemaStore> schema_store,
+ SchemaStore::Create(&filesystem_, test_dir_, &fake_clock_));
+
+ // 1. Create a ContactPoint type with label that matches prefix and set that
+ // schema
+ SchemaTypeConfigBuilder contact_point_prefix_label =
+ SchemaTypeConfigBuilder()
+ .SetType("ContactPoint")
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("label")
+ .SetDataTypeString(MATCH_PREFIX, TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_REPEATED));
+ SchemaProto old_schema =
+ SchemaBuilder().AddType(contact_point_prefix_label).Build();
+ ICING_EXPECT_OK(schema_store->SetSchema(old_schema));
+
+  // 2. Create a type that references the ContactPoint type and make an index
+  // backwards-incompatible change to ContactPoint
+ SchemaTypeConfigBuilder contact_point_exact_label =
+ SchemaTypeConfigBuilder()
+ .SetType("ContactPoint")
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("label")
+ .SetDataTypeString(MATCH_EXACT, TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_REPEATED));
+ SchemaTypeConfigBuilder person =
+ SchemaTypeConfigBuilder().SetType("Person").AddProperty(
+ PropertyConfigBuilder()
+ .SetName("contactPoints")
+ .SetDataTypeDocument("ContactPoint",
+ /*index_nested_properties=*/true)
+ .SetCardinality(CARDINALITY_REPEATED));
+ SchemaProto new_schema = SchemaBuilder()
+ .AddType(contact_point_exact_label)
+ .AddType(person)
+ .Build();
+
+  // 3. SetSchema should succeed, and only ContactPoint should be in
+ // schema_types_index_incompatible_by_name.
+ SchemaStore::SetSchemaResult expected_result;
+ expected_result.success = true;
+ expected_result.schema_types_index_incompatible_by_name.insert(
+ "ContactPoint");
+ expected_result.schema_types_new_by_name.insert("Person");
+ EXPECT_THAT(
+ schema_store->SetSchema(new_schema,
+ /*ignore_errors_and_delete_documents=*/false),
+ IsOkAndHolds(EqualsSetSchemaResult(expected_result)));
+ ICING_ASSERT_OK_AND_ASSIGN(const SchemaProto* actual_schema,
+ schema_store->GetSchema());
+ EXPECT_THAT(*actual_schema, EqualsProto(new_schema));
+}
+
+TEST_F(SchemaStoreTest, SetSchemaWithCompatibleNestedTypesOk) {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<SchemaStore> schema_store,
+ SchemaStore::Create(&filesystem_, test_dir_, &fake_clock_));
+
+  // 1. Create a ContactPoint type with an optional property and set that schema
+ SchemaTypeConfigBuilder contact_point_optional_label =
+ SchemaTypeConfigBuilder()
+ .SetType("ContactPoint")
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("label")
+ .SetDataTypeString(MATCH_PREFIX, TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_OPTIONAL));
+ SchemaProto old_schema =
+ SchemaBuilder().AddType(contact_point_optional_label).Build();
+ ICING_EXPECT_OK(schema_store->SetSchema(old_schema));
+
+ // 2. Create a type that references the ContactPoint type and make a backwards
+ // compatible change to ContactPoint
+ SchemaTypeConfigBuilder contact_point_repeated_label =
+ SchemaTypeConfigBuilder()
+ .SetType("ContactPoint")
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("label")
+ .SetDataTypeString(MATCH_PREFIX, TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_REPEATED));
+ SchemaTypeConfigBuilder person =
+ SchemaTypeConfigBuilder().SetType("Person").AddProperty(
+ PropertyConfigBuilder()
+ .SetName("contactPoints")
+ .SetDataTypeDocument("ContactPoint",
+ /*index_nested_properties=*/true)
+ .SetCardinality(CARDINALITY_REPEATED));
+ SchemaProto new_schema = SchemaBuilder()
+ .AddType(contact_point_repeated_label)
+ .AddType(person)
+ .Build();
+
+ // 3. SetSchema should succeed, and only ContactPoint should be in
+ // schema_types_changed_fully_compatible_by_name.
+ SchemaStore::SetSchemaResult expected_result;
+ expected_result.success = true;
+ expected_result.schema_types_changed_fully_compatible_by_name.insert(
+ "ContactPoint");
+ expected_result.schema_types_new_by_name.insert("Person");
+ EXPECT_THAT(schema_store->SetSchema(
+ new_schema, /*ignore_errors_and_delete_documents=*/false),
+ IsOkAndHolds(EqualsSetSchemaResult(expected_result)));
+ ICING_ASSERT_OK_AND_ASSIGN(const SchemaProto* actual_schema,
+ schema_store->GetSchema());
+ EXPECT_THAT(*actual_schema, EqualsProto(new_schema));
+}
+
TEST_F(SchemaStoreTest, GetSchemaTypeId) {
ICING_ASSERT_OK_AND_ASSIGN(
std::unique_ptr<SchemaStore> schema_store,
diff --git a/icing/schema/schema-util.cc b/icing/schema/schema-util.cc
index 22bc3f6..88b6946 100644
--- a/icing/schema/schema-util.cc
+++ b/icing/schema/schema-util.cc
@@ -107,6 +107,33 @@ bool IsTermMatchTypeCompatible(const StringIndexingConfig& old_indexed,
old_indexed.tokenizer_type() == new_indexed.tokenizer_type();
}
+void AddIncompatibleChangeToDelta(
+ std::unordered_set<std::string>& incompatible_delta,
+ const SchemaTypeConfigProto& old_type_config,
+ const SchemaUtil::DependencyMap& new_schema_dependency_map,
+ const SchemaUtil::TypeConfigMap& old_type_config_map,
+ const SchemaUtil::TypeConfigMap& new_type_config_map) {
+ // If this type is incompatible, then every type that depends on it might
+ // also be incompatible. Use the dependency map to mark those ones as
+ // incompatible too.
+ incompatible_delta.insert(old_type_config.schema_type());
+ auto parent_types_itr =
+ new_schema_dependency_map.find(old_type_config.schema_type());
+ if (parent_types_itr != new_schema_dependency_map.end()) {
+ for (std::string_view parent_type : parent_types_itr->second) {
+      // The types from new_schema that depend on the current
+      // old_type_config may not be present in old_schema. Those types
+      // will be listed in schema_delta.schema_types_new instead.
+ std::string parent_type_str(parent_type);
+ if (old_type_config_map.find(parent_type_str) !=
+ old_type_config_map.end()) {
+ incompatible_delta.insert(std::move(parent_type_str));
+ }
+ }
+ }
+}
+
} // namespace
libtextclassifier3::Status ExpandTranstiveDependencies(
@@ -447,7 +474,8 @@ const SchemaUtil::SchemaDelta SchemaUtil::ComputeCompatibilityDelta(
const DependencyMap& new_schema_dependency_map) {
SchemaDelta schema_delta;
- TypeConfigMap new_type_config_map;
+ TypeConfigMap old_type_config_map, new_type_config_map;
+ BuildTypeConfigMap(old_schema, &old_type_config_map);
BuildTypeConfigMap(new_schema, &new_type_config_map);
// Iterate through and check each field of the old schema
@@ -566,37 +594,15 @@ const SchemaUtil::SchemaDelta SchemaUtil::ComputeCompatibilityDelta(
}
if (is_incompatible) {
- // If this type is incompatible, then every type that depends on it might
- // also be incompatible. Use the dependency map to mark those ones as
- // incompatible too.
- schema_delta.schema_types_incompatible.insert(
- old_type_config.schema_type());
- auto parent_types_itr =
- new_schema_dependency_map.find(old_type_config.schema_type());
- if (parent_types_itr != new_schema_dependency_map.end()) {
- schema_delta.schema_types_incompatible.reserve(
- schema_delta.schema_types_incompatible.size() +
- parent_types_itr->second.size());
- schema_delta.schema_types_incompatible.insert(
- parent_types_itr->second.begin(), parent_types_itr->second.end());
- }
+ AddIncompatibleChangeToDelta(schema_delta.schema_types_incompatible,
+ old_type_config, new_schema_dependency_map,
+ old_type_config_map, new_type_config_map);
}
if (is_index_incompatible) {
- // If this type is index incompatible, then every type that depends on it
- // might also be index incompatible. Use the dependency map to mark those
- // ones as index incompatible too.
- schema_delta.schema_types_index_incompatible.insert(
- old_type_config.schema_type());
- auto parent_types_itr =
- new_schema_dependency_map.find(old_type_config.schema_type());
- if (parent_types_itr != new_schema_dependency_map.end()) {
- schema_delta.schema_types_index_incompatible.reserve(
- schema_delta.schema_types_index_incompatible.size() +
- parent_types_itr->second.size());
- schema_delta.schema_types_index_incompatible.insert(
- parent_types_itr->second.begin(), parent_types_itr->second.end());
- }
+ AddIncompatibleChangeToDelta(schema_delta.schema_types_index_incompatible,
+ old_type_config, new_schema_dependency_map,
+ old_type_config_map, new_type_config_map);
}
if (!is_incompatible && !is_index_incompatible && has_property_changed) {
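
Note: besides deduplicating the two blocks above, AddIncompatibleChangeToDelta tightens the propagation rule. A dependent ("parent") type is only marked incompatible if it also exists in the old schema; dependents that are new in this schema are reported under schema_types_new instead. A standalone sketch of that rule, with simplified container types:

    #include <string>
    #include <unordered_map>
    #include <unordered_set>

    using TypeSet = std::unordered_set<std::string>;

    // Mark `type` incompatible and propagate to dependents that already
    // existed in the old schema; brand-new dependents are reported elsewhere.
    void Propagate(const std::string& type,
                   const std::unordered_map<std::string, TypeSet>& dependents,
                   const TypeSet& old_types, TypeSet& incompatible) {
      incompatible.insert(type);
      auto it = dependents.find(type);
      if (it == dependents.end()) return;
      for (const std::string& parent : it->second) {
        if (old_types.count(parent) > 0) {
          incompatible.insert(parent);
        }
      }
    }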
diff --git a/icing/tokenization/combined-tokenizer_test.cc b/icing/tokenization/combined-tokenizer_test.cc
new file mode 100644
index 0000000..0212e4f
--- /dev/null
+++ b/icing/tokenization/combined-tokenizer_test.cc
@@ -0,0 +1,232 @@
+// Copyright (C) 2022 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <string_view>
+#include <vector>
+
+#include "testing/base/public/gmock.h"
+#include "testing/base/public/gunit.h"
+#include "third_party/icing/portable/platform.h"
+#include "third_party/icing/proto/schema_proto_portable.pb.h"
+#include "third_party/icing/testing/common-matchers.h"
+#include "third_party/icing/testing/icu-data-file-helper.h"
+#include "third_party/icing/testing/jni-test-helpers.h"
+#include "third_party/icing/testing/test-data.h"
+#include "third_party/icing/tokenization/language-segmenter-factory.h"
+#include "third_party/icing/tokenization/language-segmenter.h"
+#include "third_party/icing/tokenization/tokenizer-factory.h"
+#include "third_party/icing/tokenization/tokenizer.h"
+#include "third_party/icu/include/unicode/uloc.h"
+
+namespace icing {
+namespace lib {
+
+namespace {
+
+using ::testing::ElementsAre;
+
+// This test exists to ensure that the different tokenizers treat different
+// segments of text in the same manner.
+class CombinedTokenizerTest : public ::testing::Test {
+ protected:
+ void SetUp() override {
+ if (!IsCfStringTokenization() && !IsReverseJniTokenization()) {
+ ICING_ASSERT_OK(
+ // File generated via icu_data_file rule in //third_party/icing/BUILD.
+ icu_data_file_helper::SetUpICUDataFile(
+ GetTestFilePath("third_party/icing/icu.dat")));
+ }
+ jni_cache_ = GetTestJniCache();
+
+ language_segmenter_factory::SegmenterOptions options(ULOC_US,
+ jni_cache_.get());
+ ICING_ASSERT_OK_AND_ASSIGN(
+ lang_segmenter_,
+ language_segmenter_factory::Create(std::move(options)));
+ }
+
+ std::unique_ptr<const JniCache> jni_cache_;
+ std::unique_ptr<LanguageSegmenter> lang_segmenter_;
+};
+
+std::vector<std::string> GetTokenTerms(const std::vector<Token>& tokens) {
+ std::vector<std::string> terms;
+ terms.reserve(tokens.size());
+ for (const Token& token : tokens) {
+ if (token.type == Token::Type::REGULAR) {
+ terms.push_back(std::string(token.text));
+ }
+ }
+ return terms;
+}
+
+} // namespace
+
+TEST_F(CombinedTokenizerTest, SpecialCharacters) {
+ const std::string_view kText = "😊 Hello! Goodbye?";
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<Tokenizer> indexing_tokenizer,
+ tokenizer_factory::CreateIndexingTokenizer(
+ StringIndexingConfig::TokenizerType::PLAIN, lang_segmenter_.get()));
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<Tokenizer> query_tokenizer,
+ CreateQueryTokenizer(tokenizer_factory::QueryTokenizerType::RAW_QUERY,
+ lang_segmenter_.get()));
+
+ ICING_ASSERT_OK_AND_ASSIGN(std::vector<Token> indexing_tokens,
+ indexing_tokenizer->TokenizeAll(kText));
+ std::vector<std::string> indexing_terms = GetTokenTerms(indexing_tokens);
+ EXPECT_THAT(indexing_terms, ElementsAre("😊", "Hello", "Goodbye"));
+
+ ICING_ASSERT_OK_AND_ASSIGN(std::vector<Token> query_tokens,
+ query_tokenizer->TokenizeAll(kText));
+ std::vector<std::string> query_terms = GetTokenTerms(query_tokens);
+ EXPECT_THAT(query_terms, ElementsAre("😊", "Hello", "Goodbye"));
+}
+
+TEST_F(CombinedTokenizerTest, Parentheses) {
+ const std::string_view kText = "((paren1)(paren2) (last paren))";
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<Tokenizer> indexing_tokenizer,
+ tokenizer_factory::CreateIndexingTokenizer(
+ StringIndexingConfig::TokenizerType::PLAIN, lang_segmenter_.get()));
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<Tokenizer> query_tokenizer,
+ CreateQueryTokenizer(tokenizer_factory::QueryTokenizerType::RAW_QUERY,
+ lang_segmenter_.get()));
+
+ ICING_ASSERT_OK_AND_ASSIGN(std::vector<Token> indexing_tokens,
+ indexing_tokenizer->TokenizeAll(kText));
+ std::vector<std::string> indexing_terms = GetTokenTerms(indexing_tokens);
+ EXPECT_THAT(indexing_terms, ElementsAre("paren1", "paren2", "last", "paren"));
+
+ ICING_ASSERT_OK_AND_ASSIGN(std::vector<Token> query_tokens,
+ query_tokenizer->TokenizeAll(kText));
+ std::vector<std::string> query_terms = GetTokenTerms(query_tokens);
+ EXPECT_THAT(query_terms, ElementsAre("paren1", "paren2", "last", "paren"));
+}
+
+TEST_F(CombinedTokenizerTest, Negation) {
+ const std::string_view kText = "-foo -bar -baz";
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<Tokenizer> indexing_tokenizer,
+ tokenizer_factory::CreateIndexingTokenizer(
+ StringIndexingConfig::TokenizerType::PLAIN, lang_segmenter_.get()));
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<Tokenizer> query_tokenizer,
+ CreateQueryTokenizer(tokenizer_factory::QueryTokenizerType::RAW_QUERY,
+ lang_segmenter_.get()));
+
+ ICING_ASSERT_OK_AND_ASSIGN(std::vector<Token> indexing_tokens,
+ indexing_tokenizer->TokenizeAll(kText));
+ std::vector<std::string> indexing_terms = GetTokenTerms(indexing_tokens);
+ EXPECT_THAT(indexing_terms, ElementsAre("foo", "bar", "baz"));
+
+ ICING_ASSERT_OK_AND_ASSIGN(std::vector<Token> query_tokens,
+ query_tokenizer->TokenizeAll(kText));
+ std::vector<std::string> query_terms = GetTokenTerms(query_tokens);
+ EXPECT_THAT(query_terms, ElementsAre("foo", "bar", "baz"));
+}
+
+TEST_F(CombinedTokenizerTest, Colons) {
+ const std::string_view kText = ":foo: :bar baz:";
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<Tokenizer> indexing_tokenizer,
+ tokenizer_factory::CreateIndexingTokenizer(
+ StringIndexingConfig::TokenizerType::PLAIN, lang_segmenter_.get()));
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<Tokenizer> query_tokenizer,
+ CreateQueryTokenizer(tokenizer_factory::QueryTokenizerType::RAW_QUERY,
+ lang_segmenter_.get()));
+
+ ICING_ASSERT_OK_AND_ASSIGN(std::vector<Token> indexing_tokens,
+ indexing_tokenizer->TokenizeAll(kText));
+ std::vector<std::string> indexing_terms = GetTokenTerms(indexing_tokens);
+ EXPECT_THAT(indexing_terms, ElementsAre("foo", "bar", "baz"));
+
+ ICING_ASSERT_OK_AND_ASSIGN(std::vector<Token> query_tokens,
+ query_tokenizer->TokenizeAll(kText));
+ std::vector<std::string> query_terms = GetTokenTerms(query_tokens);
+ EXPECT_THAT(query_terms, ElementsAre("foo", "bar", "baz"));
+}
+
+TEST_F(CombinedTokenizerTest, ColonsPropertyRestricts) {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<Tokenizer> indexing_tokenizer,
+ tokenizer_factory::CreateIndexingTokenizer(
+ StringIndexingConfig::TokenizerType::PLAIN, lang_segmenter_.get()));
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<Tokenizer> query_tokenizer,
+ CreateQueryTokenizer(tokenizer_factory::QueryTokenizerType::RAW_QUERY,
+ lang_segmenter_.get()));
+
+ // This is a difference between the two tokenizers. "foo:bar" is a single
+ // token to the plain tokenizer because ':' is a word connector. But "foo:bar"
+ // is a property restrict to the query tokenizer - so "foo" is the property
+ // and "bar" is the only text term.
+ constexpr std::string_view kText = "foo:bar";
+ ICING_ASSERT_OK_AND_ASSIGN(std::vector<Token> indexing_tokens,
+ indexing_tokenizer->TokenizeAll(kText));
+ std::vector<std::string> indexing_terms = GetTokenTerms(indexing_tokens);
+ EXPECT_THAT(indexing_terms, ElementsAre("foo:bar"));
+
+ ICING_ASSERT_OK_AND_ASSIGN(std::vector<Token> query_tokens,
+ query_tokenizer->TokenizeAll(kText));
+ std::vector<std::string> query_terms = GetTokenTerms(query_tokens);
+ EXPECT_THAT(query_terms, ElementsAre("bar"));
+
+ // This difference, however, should only apply to the first ':'. A
+ // second ':' should be treated by both tokenizers as a word connector.
+ constexpr std::string_view kText2 = "foo:bar:baz";
+ ICING_ASSERT_OK_AND_ASSIGN(indexing_tokens,
+ indexing_tokenizer->TokenizeAll(kText2));
+ indexing_terms = GetTokenTerms(indexing_tokens);
+ EXPECT_THAT(indexing_terms, ElementsAre("foo:bar:baz"));
+
+ ICING_ASSERT_OK_AND_ASSIGN(query_tokens,
+ query_tokenizer->TokenizeAll(kText2));
+ query_terms = GetTokenTerms(query_tokens);
+ EXPECT_THAT(query_terms, ElementsAre("bar:baz"));
+}
+
+TEST_F(CombinedTokenizerTest, Punctuation) {
+ const std::string_view kText = "Who? What!? Why & How.";
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<Tokenizer> indexing_tokenizer,
+ tokenizer_factory::CreateIndexingTokenizer(
+ StringIndexingConfig::TokenizerType::PLAIN, lang_segmenter_.get()));
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<Tokenizer> query_tokenizer,
+ CreateQueryTokenizer(tokenizer_factory::QueryTokenizerType::RAW_QUERY,
+ lang_segmenter_.get()));
+
+ ICING_ASSERT_OK_AND_ASSIGN(std::vector<Token> indexing_tokens,
+ indexing_tokenizer->TokenizeAll(kText));
+ std::vector<std::string> indexing_terms = GetTokenTerms(indexing_tokens);
+ EXPECT_THAT(indexing_terms, ElementsAre("Who", "What", "Why", "How"));
+
+ ICING_ASSERT_OK_AND_ASSIGN(std::vector<Token> query_tokens,
+ query_tokenizer->TokenizeAll(kText));
+ std::vector<std::string> query_terms = GetTokenTerms(query_tokens);
+ EXPECT_THAT(query_terms, ElementsAre("Who", "What", "Why", "How"));
+}
+
+} // namespace lib
+} // namespace icing
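
Note: each test above repeats the same tokenize-and-compare steps for the two tokenizers. The shared invariant could be captured in a helper along these lines (hypothetical, not part of this change):

    // Tokenize `text` with one tokenizer and return its REGULAR terms, so a
    // test reduces to comparing the vectors from both tokenizers.
    std::vector<std::string> TermsOf(Tokenizer* tokenizer,
                                     std::string_view text) {
      std::vector<Token> tokens = tokenizer->TokenizeAll(text).ValueOrDie();
      return GetTokenTerms(tokens);
    }
    // e.g. EXPECT_EQ(TermsOf(indexing_tokenizer.get(), kText),
    //                TermsOf(query_tokenizer.get(), kText));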
diff --git a/icing/tokenization/icu/icu-language-segmenter.cc b/icing/tokenization/icu/icu-language-segmenter.cc
index 8e0f789..dc7b0a4 100644
--- a/icing/tokenization/icu/icu-language-segmenter.cc
+++ b/icing/tokenization/icu/icu-language-segmenter.cc
@@ -64,30 +64,26 @@ class IcuLanguageSegmenterIterator : public LanguageSegmenter::Iterator {
// Advances to the next term. Returns false if it has reached the end.
bool Advance() override {
- while (true) {
- // Prerequisite check
- if (term_end_index_exclusive_ == UBRK_DONE) {
- return false;
- }
-
- if (term_end_index_exclusive_ == 0) {
- // First Advance() call
- term_start_index_ = ubrk_first(break_iterator_);
- } else {
- term_start_index_ = term_end_index_exclusive_;
- }
- term_end_index_exclusive_ = ubrk_next(break_iterator_);
+ // Prerequisite check
+ if (term_end_index_exclusive_ == UBRK_DONE) {
+ return false;
+ }
- // Reached the end
- if (term_end_index_exclusive_ == UBRK_DONE) {
- MarkAsDone();
- return false;
- }
+ if (term_end_index_exclusive_ == 0) {
+ // First Advance() call
+ term_start_index_ = ubrk_first(break_iterator_);
+ } else {
+ term_start_index_ = term_end_index_exclusive_;
+ }
+ term_end_index_exclusive_ = ubrk_next(break_iterator_);
- if (IsValidSegment()) {
- return true;
- }
+ // Reached the end
+ if (term_end_index_exclusive_ == UBRK_DONE) {
+ MarkAsDone();
+ return false;
}
+
+ return true;
}
// Returns the current term. It can be called only when Advance() returns
@@ -227,8 +223,7 @@ class IcuLanguageSegmenterIterator : public LanguageSegmenter::Iterator {
return absl_ports::AbortedError(
"Could not retrieve valid utf8 character!");
}
- if (term_end_index_exclusive_ > offset_iterator_.utf8_index() ||
- !IsValidSegment()) {
+ if (term_end_index_exclusive_ > offset_iterator_.utf8_index()) {
return ResetToTermEndingBeforeUtf32(term_start_iterator.utf32_index());
}
return term_start_iterator.utf32_index();
@@ -295,24 +290,6 @@ class IcuLanguageSegmenterIterator : public LanguageSegmenter::Iterator {
term_start_index_ = 0;
}
- bool IsValidSegment() const {
- // Rule 1: all ASCII terms will be returned.
- // We know it's a ASCII term by checking the first char.
- if (i18n_utils::IsAscii(text_[term_start_index_])) {
- return true;
- }
-
- UChar32 uchar32 = i18n_utils::GetUChar32At(text_.data(), text_.length(),
- term_start_index_);
- // Rule 2: for non-ASCII terms, only the alphanumeric terms are returned.
- // We know it's an alphanumeric term by checking the first unicode
- // character.
- if (i18n_utils::IsAlphaNumeric(uchar32)) {
- return true;
- }
- return false;
- }
-
// The underlying class that does the segmentation, ubrk_close() must be
// called after using.
UBreakIterator* break_iterator_;
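
Note: with IsValidSegment() removed, Advance() now surfaces every break ICU produces, punctuation included; filtering moves up into the tokenizers (see the raw-query tokenizer change below). A sketch of what iteration now yields, assuming the iterator interface above:

    // Segmenting "Hello!" now yields both segments instead of dropping "!".
    std::vector<std::string_view> CollectSegments(
        LanguageSegmenter::Iterator* itr) {
      std::vector<std::string_view> terms;
      while (itr->Advance()) {
        terms.push_back(itr->GetTerm());  // "Hello", then "!"
      }
      return terms;
    }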
diff --git a/icing/tokenization/icu/icu-language-segmenter_test.cc b/icing/tokenization/icu/icu-language-segmenter_test.cc
index fe0b96e..4098be5 100644
--- a/icing/tokenization/icu/icu-language-segmenter_test.cc
+++ b/icing/tokenization/icu/icu-language-segmenter_test.cc
@@ -191,7 +191,7 @@ TEST_P(IcuLanguageSegmenterAllLocalesTest, Non_ASCII_Non_Alphabetic) {
// Full-width (non-ASCII) punctuation marks and special characters are left
// out.
EXPECT_THAT(language_segmenter->GetAllTerms("γ€‚οΌŸΒ·Hello!×"),
- IsOkAndHolds(ElementsAre("Hello")));
+ IsOkAndHolds(ElementsAre("。", "?", "Β·", "Hello", "!", "Γ—")));
}
TEST_P(IcuLanguageSegmenterAllLocalesTest, Acronym) {
@@ -252,9 +252,9 @@ TEST_P(IcuLanguageSegmenterAllLocalesTest, WordConnector) {
// Connectors don't connect if one side is an invalid term (?)
EXPECT_THAT(language_segmenter->GetAllTerms("bar:baz:?"),
- IsOkAndHolds(ElementsAre("bar:baz", ":")));
+ IsOkAndHolds(ElementsAre("bar:baz", ":", "?")));
EXPECT_THAT(language_segmenter->GetAllTerms("?:bar:baz"),
- IsOkAndHolds(ElementsAre(":", "bar:baz")));
+ IsOkAndHolds(ElementsAre("?", ":", "bar:baz")));
EXPECT_THAT(language_segmenter->GetAllTerms("3:14"),
IsOkAndHolds(ElementsAre("3", ":", "14")));
EXPECT_THAT(language_segmenter->GetAllTerms("私:は"),
@@ -417,15 +417,16 @@ TEST_P(IcuLanguageSegmenterAllLocalesTest, CJKT) {
// have whitespaces as word delimiter.
// Chinese
- EXPECT_THAT(language_segmenter->GetAllTerms("ζˆ‘ζ―ε€©θ΅°θ·―εŽ»δΈŠη­γ€‚"),
- IsOkAndHolds(ElementsAre("ζˆ‘", "每倩", "θ΅°θ·―", "去", "上班")));
+ EXPECT_THAT(
+ language_segmenter->GetAllTerms("ζˆ‘ζ―ε€©θ΅°θ·―εŽ»δΈŠη­γ€‚"),
+ IsOkAndHolds(ElementsAre("ζˆ‘", "每倩", "θ΅°θ·―", "去", "上班", "。")));
// Japanese
EXPECT_THAT(language_segmenter->GetAllTerms("私は毎ζ—₯仕事に歩いています。"),
IsOkAndHolds(ElementsAre("私", "は", "毎ζ—₯", "δ»•δΊ‹", "に", "ζ­©",
- "い", "てい", "ます")));
+ "い", "てい", "ます", "。")));
// Khmer
EXPECT_THAT(language_segmenter->GetAllTerms("αž‰αž»αŸ†αžŠαžΎαžšαž‘αŸ…αž’αŸ’αžœαžΎαž€αžΆαžšαžšαžΆαž›αŸ‹αžαŸ’αž„αŸƒαŸ”"),
- IsOkAndHolds(ElementsAre("αž‰αž»αŸ†", "αžŠαžΎαžšαž‘αŸ…", "αž’αŸ’αžœαžΎαž€αžΆαžš", "αžšαžΆαž›αŸ‹αžαŸ’αž„αŸƒ")));
+ IsOkAndHolds(ElementsAre("αž‰αž»αŸ†", "αžŠαžΎαžšαž‘αŸ…", "αž’αŸ’αžœαžΎαž€αžΆαžš", "αžšαžΆαž›αŸ‹αžαŸ’αž„αŸƒ", "αŸ”")));
// Thai
EXPECT_THAT(
language_segmenter->GetAllTerms("ฉันเดินไปทำงานทุกวัน"),
@@ -858,16 +859,19 @@ TEST_P(IcuLanguageSegmenterAllLocalesTest, ChineseResetToTermAfterUtf32) {
ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<LanguageSegmenter::Iterator> itr,
language_segmenter->Segment(kChinese));
// String: "ζˆ‘ζ―ε€©θ΅°θ·―εŽ»δΈŠη­γ€‚"
- // ^ ^ ^ ^^
- // UTF-8 idx: 0 3 9 15 18
- // UTF-32 idx: 0 1  3   5  6
+ // ^ ^ ^ ^^ ^
+ // UTF-8 idx: 0 3 9 15 18 24
+ // UTF-32 idx: 0 1  3   5  6  8
EXPECT_THAT(itr->ResetToTermStartingAfterUtf32(0), IsOkAndHolds(Eq(1)));
EXPECT_THAT(itr->GetTerm(), Eq("每倩"));
EXPECT_THAT(itr->ResetToTermStartingAfterUtf32(2), IsOkAndHolds(Eq(3)));
EXPECT_THAT(itr->GetTerm(), Eq("θ΅°θ·―"));
- EXPECT_THAT(itr->ResetToTermStartingAfterUtf32(7),
+ EXPECT_THAT(itr->ResetToTermStartingAfterUtf32(7), IsOkAndHolds(Eq(8)));
+ EXPECT_THAT(itr->GetTerm(), Eq("。"));
+
+ EXPECT_THAT(itr->ResetToTermStartingAfterUtf32(8),
StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
EXPECT_THAT(itr->GetTerm(), IsEmpty());
}
@@ -882,18 +886,21 @@ TEST_P(IcuLanguageSegmenterAllLocalesTest, JapaneseResetToTermAfterUtf32) {
ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<LanguageSegmenter::Iterator> itr,
language_segmenter->Segment(kJapanese));
// String: "私は毎ζ—₯仕事に歩いています。"
- // ^ ^ ^ ^ ^ ^ ^ ^ ^
- // UTF-8 idx:  0 3 6  12 18 21 24 27 33
- // UTF-32 idx: 0 1 2 4 6 7 8 9 11
+ // ^ ^ ^ ^ ^ ^ ^ ^ ^ ^
+ // UTF-8 idx:  0 3 6  12 18 21 24 27 33 39
+ // UTF-32 idx: 0 1 2 4 6 7 8 9 11 13
EXPECT_THAT(itr->ResetToTermStartingAfterUtf32(0), IsOkAndHolds(Eq(1)));
EXPECT_THAT(itr->GetTerm(), Eq("は"));
- EXPECT_THAT(itr->ResetToTermStartingAfterUtf32(11),
+ EXPECT_THAT(itr->ResetToTermStartingAfterUtf32(13),
StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
EXPECT_THAT(itr->GetTerm(), IsEmpty());
EXPECT_THAT(itr->ResetToTermStartingAfterUtf32(3), IsOkAndHolds(Eq(4)));
EXPECT_THAT(itr->GetTerm(), Eq("δ»•δΊ‹"));
+
+ EXPECT_THAT(itr->ResetToTermStartingAfterUtf32(12), IsOkAndHolds(Eq(13)));
+ EXPECT_THAT(itr->GetTerm(), Eq("。"));
}
TEST_P(IcuLanguageSegmenterAllLocalesTest, KhmerResetToTermAfterUtf32) {
@@ -905,13 +912,16 @@ TEST_P(IcuLanguageSegmenterAllLocalesTest, KhmerResetToTermAfterUtf32) {
ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<LanguageSegmenter::Iterator> itr,
language_segmenter->Segment(kKhmer));
// String: "αž‰αž»αŸ†αžŠαžΎαžšαž‘αŸ…αž’αŸ’αžœαžΎαž€αžΆαžšαžšαžΆαž›αŸ‹αžαŸ’αž„αŸƒαŸ”"
- // ^ ^ ^ ^
- // UTF-8 idx: 0 9 24 45
- // UTF-32 idx: 0 3 8 15
+ // ^ ^ ^ ^ ^
+ // UTF-8 idx: 0 9 24 45 69
+ // UTF-32 idx: 0 3 8 15 23
EXPECT_THAT(itr->ResetToTermStartingAfterUtf32(0), IsOkAndHolds(Eq(3)));
EXPECT_THAT(itr->GetTerm(), Eq("αžŠαžΎαžšαž‘αŸ…"));
- EXPECT_THAT(itr->ResetToTermStartingAfterUtf32(15),
+ EXPECT_THAT(itr->ResetToTermStartingAfterUtf32(15), IsOkAndHolds(Eq(23)));
+ EXPECT_THAT(itr->GetTerm(), Eq("αŸ”"));
+
+ EXPECT_THAT(itr->ResetToTermStartingAfterUtf32(23),
StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
EXPECT_THAT(itr->GetTerm(), IsEmpty());
diff --git a/icing/tokenization/raw-query-tokenizer.cc b/icing/tokenization/raw-query-tokenizer.cc
index 8a27103..ff449a7 100644
--- a/icing/tokenization/raw-query-tokenizer.cc
+++ b/icing/tokenization/raw-query-tokenizer.cc
@@ -102,7 +102,7 @@ enum State {
// When seeing right parentheses
CLOSING_PARENTHESES = 8,
- PROCESSING_NON_ASCII_ALPHABETIC_TERM = 9,
+ PROCESSING_NON_ASCII_ALPHANUMERIC_TERM = 9,
PROCESSING_PROPERTY_TERM_APPENDING = 10,
@@ -119,7 +119,7 @@ enum TermType {
// A term that consists of unicode alphabetic and numeric characters
ASCII_ALPHANUMERIC_TERM = 1,
- NON_ASCII_ALPHABETIC_TERM = 2,
+ NON_ASCII_ALPHANUMERIC_TERM = 2,
// "("
LEFT_PARENTHESES = 3,
@@ -208,7 +208,7 @@ std::string_view GetErrorMessage(ActionOrError maybe_error) {
// PROCESSING_OR = 6
// OPENING_PARENTHESES = 7
// CLOSING_PARENTHESES = 8
-// PROCESSING_NON_ASCII_ALPHABETIC_TERM = 9
+// PROCESSING_NON_ASCII_ALPHANUMERIC_TERM = 9
// PROCESSING_PROPERTY_TERM_APPENDING = 10
//
// Actions:
@@ -252,40 +252,40 @@ std::string_view GetErrorMessage(ActionOrError maybe_error) {
// like "+", "&", "@", "#" in indexing and query tokenizers.
constexpr State state_transition_rules[STATE_COUNT][TYPE_COUNT] = {
/*State: Ready*/
- {READY, PROCESSING_ALPHANUMERIC_TERM, PROCESSING_NON_ASCII_ALPHABETIC_TERM,
+ {READY, PROCESSING_ALPHANUMERIC_TERM, PROCESSING_NON_ASCII_ALPHANUMERIC_TERM,
OPENING_PARENTHESES, CLOSING_PARENTHESES, PROCESSING_EXCLUSION,
PROCESSING_OR, READY, READY},
/*State: PROCESSING_ALPHANUMERIC_TERM*/
- {READY, PROCESSING_ALPHANUMERIC_TERM, PROCESSING_NON_ASCII_ALPHABETIC_TERM,
+ {READY, PROCESSING_ALPHANUMERIC_TERM, PROCESSING_NON_ASCII_ALPHANUMERIC_TERM,
OPENING_PARENTHESES, CLOSING_PARENTHESES, READY, INVALID,
PROCESSING_PROPERTY_RESTRICT, READY},
/*State: PROCESSING_EXCLUSION*/
{READY, PROCESSING_EXCLUSION_TERM, PROCESSING_EXCLUSION_TERM, INVALID,
CLOSING_PARENTHESES, PROCESSING_EXCLUSION, INVALID, INVALID, READY},
/*State: PROCESSING_EXCLUSION_TERM*/
- {READY, PROCESSING_ALPHANUMERIC_TERM, PROCESSING_NON_ASCII_ALPHABETIC_TERM,
+ {READY, PROCESSING_ALPHANUMERIC_TERM, PROCESSING_NON_ASCII_ALPHANUMERIC_TERM,
OPENING_PARENTHESES, CLOSING_PARENTHESES, READY, INVALID, INVALID, READY},
/*State: PROCESSING_PROPERTY_RESTRICT*/
{READY, PROCESSING_PROPERTY_TERM, PROCESSING_PROPERTY_TERM, INVALID,
CLOSING_PARENTHESES, INVALID, INVALID, PROCESSING_PROPERTY_RESTRICT,
READY},
/*State: PROCESSING_PROPERTY_TERM*/
- {READY, PROCESSING_ALPHANUMERIC_TERM, PROCESSING_NON_ASCII_ALPHABETIC_TERM,
+ {READY, PROCESSING_ALPHANUMERIC_TERM, PROCESSING_NON_ASCII_ALPHANUMERIC_TERM,
OPENING_PARENTHESES, CLOSING_PARENTHESES, READY, INVALID,
PROCESSING_PROPERTY_TERM_APPENDING, READY},
/*State: PROCESSING_OR*/
{READY, INVALID, INVALID, OPENING_PARENTHESES, CLOSING_PARENTHESES, INVALID,
INVALID, INVALID, READY},
/*State: OPENING_PARENTHESES*/
- {READY, PROCESSING_ALPHANUMERIC_TERM, PROCESSING_NON_ASCII_ALPHABETIC_TERM,
+ {READY, PROCESSING_ALPHANUMERIC_TERM, PROCESSING_NON_ASCII_ALPHANUMERIC_TERM,
OPENING_PARENTHESES, CLOSING_PARENTHESES, PROCESSING_EXCLUSION,
OPENING_PARENTHESES, READY, READY},
/*State: CLOSING_PARENTHESES*/
- {READY, PROCESSING_ALPHANUMERIC_TERM, PROCESSING_NON_ASCII_ALPHABETIC_TERM,
+ {READY, PROCESSING_ALPHANUMERIC_TERM, PROCESSING_NON_ASCII_ALPHANUMERIC_TERM,
OPENING_PARENTHESES, CLOSING_PARENTHESES, PROCESSING_EXCLUSION,
PROCESSING_OR, INVALID, READY},
- /*State: PROCESSING_NON_ASCII_ALPHABETIC_TERM*/
- {READY, PROCESSING_ALPHANUMERIC_TERM, PROCESSING_NON_ASCII_ALPHABETIC_TERM,
+ /*State: PROCESSING_NON_ASCII_ALPHANUMERIC_TERM*/
+ {READY, PROCESSING_ALPHANUMERIC_TERM, PROCESSING_NON_ASCII_ALPHANUMERIC_TERM,
OPENING_PARENTHESES, CLOSING_PARENTHESES, READY, INVALID, INVALID, READY},
/*State: PROCESSING_PROPERTY_TERM_APPENDING*/
{READY, PROCESSING_PROPERTY_TERM_APPENDING,
@@ -326,7 +326,7 @@ constexpr ActionOrError action_rules[STATE_COUNT][TYPE_COUNT] = {
/*State: CLOSING_PARENTHESES*/
{OUTPUT, OUTPUT, OUTPUT, OUTPUT, OUTPUT, OUTPUT, OUTPUT,
ERROR_GROUP_AS_PROPERTY_NAME, OUTPUT},
- /*State: PROCESSING_NON_ASCII_ALPHABETIC_TERM*/
+ /*State: PROCESSING_NON_ASCII_ALPHANUMERIC_TERM*/
{OUTPUT, OUTPUT, OUTPUT, OUTPUT, OUTPUT, OUTPUT,
ERROR_NO_WHITESPACE_AROUND_OR, ERROR_NON_ASCII_AS_PROPERTY_NAME, OUTPUT},
/*State: PROCESSING_PROPERTY_TERM_APPENDING*/
@@ -345,6 +345,40 @@ std::pair<TermType, std::string_view> GetWhitespaceTerm(std::string_view text,
return std::make_pair(WHITESPACE, text.substr(pos, cur - pos));
}
+TermType GetContentTermType(std::string_view text, size_t pos) {
+ if (i18n_utils::IsPunctuationAt(text, pos)) {
+ return OTHER;
+ } else if (i18n_utils::IsAscii(text[pos])) {
+ return ASCII_ALPHANUMERIC_TERM;
+ }
+ return NON_ASCII_ALPHANUMERIC_TERM;
+}
+
+bool IsContentTermType(TermType term_type) {
+ switch (term_type) {
+ case ASCII_ALPHANUMERIC_TERM:
+ [[fallthrough]];
+ case NON_ASCII_ALPHANUMERIC_TERM:
+ [[fallthrough]];
+ case OTHER:
+ return true;
+ case WHITESPACE:
+ [[fallthrough]];
+ case LEFT_PARENTHESES:
+ [[fallthrough]];
+ case RIGHT_PARENTHESES:
+ [[fallthrough]];
+ case EXCLUSION_OPERATOR:
+ [[fallthrough]];
+ case OR_OPERATOR:
+ [[fallthrough]];
+ case COLON:
+ [[fallthrough]];
+ case TYPE_COUNT:
+ return false;
+ }
+}
+
// Determines the length of the potential content term beginning at text[pos]
// and returns a pair with the appropriate TermType and a string_view of the
// content term.
@@ -357,12 +391,7 @@ std::pair<TermType, std::string_view> GetContentTerm(std::string_view text,
size_t pos) {
size_t len = 0;
// Checks the first char to see if it's an ASCII term
- TermType type = ASCII_ALPHANUMERIC_TERM;
- if (!i18n_utils::IsAscii(text[pos])) {
- type = NON_ASCII_ALPHABETIC_TERM;
- } else if (std::isalnum(text[pos])) {
- type = OTHER;
- }
+ TermType type = GetContentTermType(text, pos);
for (size_t cur = pos; cur < text.length() && len == 0; ++cur) {
switch (text[cur]) {
case kLeftParentheses:
@@ -470,7 +499,7 @@ libtextclassifier3::Status OutputToken(State new_state,
switch (current_term_type) {
case ASCII_ALPHANUMERIC_TERM:
[[fallthrough]];
- case NON_ASCII_ALPHABETIC_TERM:
+ case NON_ASCII_ALPHANUMERIC_TERM:
if (new_state == PROCESSING_PROPERTY_TERM) {
// Asserts extra rule 1: each property name in the property path is a
// valid term.
@@ -540,10 +569,8 @@ libtextclassifier3::Status ProcessTerm(
ICING_ASSIGN_OR_RETURN(std::vector<std::string_view> content_terms,
language_segmenter->GetAllTerms(*current_term));
for (std::string_view term : content_terms) {
- TermType type = ASCII_ALPHANUMERIC_TERM;
- if (!i18n_utils::IsAscii(term[0])) {
- type = NON_ASCII_ALPHABETIC_TERM;
- } else if (!std::isalnum(term[0])) {
+ TermType type = GetContentTermType(term, 0);
+ if (type == OTHER) {
// Skip OTHER tokens here.
continue;
}
@@ -589,9 +616,7 @@ libtextclassifier3::StatusOr<std::vector<Token>> ProcessTerms(
for (int i = 0; i < prescanned_terms.size(); ++i) {
const std::pair<TermType, std::string_view>& prescanned_term =
prescanned_terms.at(i);
- if (prescanned_term.first != ASCII_ALPHANUMERIC_TERM &&
- prescanned_term.first != NON_ASCII_ALPHABETIC_TERM &&
- prescanned_term.first != OTHER) {
+ if (!IsContentTermType(prescanned_term.first)) {
// This can't be a property restrict. Just pass it in.
ICING_RETURN_IF_ERROR(
ProcessTerm(&current_state, &current_term, &current_term_type,
@@ -603,18 +628,15 @@ libtextclassifier3::StatusOr<std::vector<Token>> ProcessTerms(
std::vector<std::string_view> content_terms,
language_segmenter->GetAllTerms(prescanned_term.second));
for (std::string_view term : content_terms) {
- TermType type = ASCII_ALPHANUMERIC_TERM;
+ TermType type = GetContentTermType(term, 0);
if (term == kOrOperator) {
// TODO(tjbarron) Decide whether we should revise this and other
// handled syntax. This is used to allow queries like "term1,OR,term2"
// to succeed. It's not clear if we should allow this or require
// clients to ensure that OR operators are always surrounded by
// whitespace.
+ // Override the type if this is actually an OR operator.
type = OR_OPERATOR;
- } else if (!i18n_utils::IsAscii(term[0])) {
- type = NON_ASCII_ALPHABETIC_TERM;
- } else if (!std::isalnum(term[0])) {
- type = OTHER;
}
ICING_RETURN_IF_ERROR(ProcessTerm(&current_state, &current_term,
&current_term_type,
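
Note: GetContentTermType centralizes a classification that was previously duplicated in three places, one of which (the removed branch in GetContentTerm) classified alphanumeric ASCII as OTHER instead of punctuation. Expected behavior of the helper, per the definitions above:

    TermType a = GetContentTermType("foo", 0);  // ASCII_ALPHANUMERIC_TERM
    TermType b = GetContentTermType("ζˆ‘", 0);   // NON_ASCII_ALPHANUMERIC_TERM
    TermType c = GetContentTermType("?", 0);    // OTHER (punctuation)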
diff --git a/icing/tokenization/raw-query-tokenizer_test.cc b/icing/tokenization/raw-query-tokenizer_test.cc
index c6d981d..b1dcc73 100644
--- a/icing/tokenization/raw-query-tokenizer_test.cc
+++ b/icing/tokenization/raw-query-tokenizer_test.cc
@@ -70,6 +70,29 @@ TEST_F(RawQueryTokenizerTest, Simple) {
EqualsToken(Token::Type::REGULAR, "WORLD"))));
}
+TEST_F(RawQueryTokenizerTest, Emoji) {
+ language_segmenter_factory::SegmenterOptions options(ULOC_US);
+ ICING_ASSERT_OK_AND_ASSIGN(
+ auto language_segmenter,
+ language_segmenter_factory::Create(std::move(options)));
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<Tokenizer> raw_query_tokenizer,
+ tokenizer_factory::CreateQueryTokenizer(tokenizer_factory::RAW_QUERY,
+ language_segmenter.get()));
+
+ EXPECT_THAT(
+ raw_query_tokenizer->TokenizeAll("😊 Hello! Goodbye?"),
+ IsOkAndHolds(ElementsAre(EqualsToken(Token::Type::REGULAR, "😊"),
+ EqualsToken(Token::Type::REGULAR, "Hello"),
+ EqualsToken(Token::Type::REGULAR, "Goodbye"))));
+
+ EXPECT_THAT(
+ raw_query_tokenizer->TokenizeAll("Hello😊 ! Goodbye?"),
+ IsOkAndHolds(ElementsAre(EqualsToken(Token::Type::REGULAR, "Hello"),
+ EqualsToken(Token::Type::REGULAR, "😊"),
+ EqualsToken(Token::Type::REGULAR, "Goodbye"))));
+}
+
TEST_F(RawQueryTokenizerTest, Parentheses) {
language_segmenter_factory::SegmenterOptions options(ULOC_US);
ICING_ASSERT_OK_AND_ASSIGN(
@@ -80,26 +103,35 @@ TEST_F(RawQueryTokenizerTest, Parentheses) {
tokenizer_factory::CreateQueryTokenizer(tokenizer_factory::RAW_QUERY,
language_segmenter.get()));
- EXPECT_THAT(raw_query_tokenizer->TokenizeAll("()"),
- IsOkAndHolds(ElementsAre(
- EqualsToken(Token::Type::QUERY_LEFT_PARENTHESES, ""),
- EqualsToken(Token::Type::QUERY_RIGHT_PARENTHESES, ""))));
+ ICING_ASSERT_OK_AND_ASSIGN(std::vector<Token> query_tokens,
+ raw_query_tokenizer->TokenizeAll("()"));
+ EXPECT_THAT(
+ query_tokens,
+ ElementsAre(EqualsToken(Token::Type::QUERY_LEFT_PARENTHESES, ""),
+ EqualsToken(Token::Type::QUERY_RIGHT_PARENTHESES, "")));
- EXPECT_THAT(raw_query_tokenizer->TokenizeAll("( )"),
- IsOkAndHolds(ElementsAre(
- EqualsToken(Token::Type::QUERY_LEFT_PARENTHESES, ""),
- EqualsToken(Token::Type::QUERY_RIGHT_PARENTHESES, ""))));
+ ICING_ASSERT_OK_AND_ASSIGN(query_tokens,
+ raw_query_tokenizer->TokenizeAll("( )"));
+ EXPECT_THAT(
+ query_tokens,
+ ElementsAre(EqualsToken(Token::Type::QUERY_LEFT_PARENTHESES, ""),
+ EqualsToken(Token::Type::QUERY_RIGHT_PARENTHESES, "")));
- EXPECT_THAT(raw_query_tokenizer->TokenizeAll("(term1 term2)"),
- IsOkAndHolds(ElementsAre(
- EqualsToken(Token::Type::QUERY_LEFT_PARENTHESES, ""),
+ ICING_ASSERT_OK_AND_ASSIGN(query_tokens,
+ raw_query_tokenizer->TokenizeAll("(term1 term2)"));
+ EXPECT_THAT(
+ query_tokens,
+ ElementsAre(EqualsToken(Token::Type::QUERY_LEFT_PARENTHESES, ""),
EqualsToken(Token::Type::REGULAR, "term1"),
EqualsToken(Token::Type::REGULAR, "term2"),
- EqualsToken(Token::Type::QUERY_RIGHT_PARENTHESES, ""))));
+ EqualsToken(Token::Type::QUERY_RIGHT_PARENTHESES, "")));
- EXPECT_THAT(raw_query_tokenizer->TokenizeAll("((term1 term2) (term3 term4))"),
- IsOkAndHolds(ElementsAre(
- EqualsToken(Token::Type::QUERY_LEFT_PARENTHESES, ""),
+ ICING_ASSERT_OK_AND_ASSIGN(
+ query_tokens,
+ raw_query_tokenizer->TokenizeAll("((term1 term2) (term3 term4))"));
+ EXPECT_THAT(
+ query_tokens,
+ ElementsAre(EqualsToken(Token::Type::QUERY_LEFT_PARENTHESES, ""),
EqualsToken(Token::Type::QUERY_LEFT_PARENTHESES, ""),
EqualsToken(Token::Type::REGULAR, "term1"),
EqualsToken(Token::Type::REGULAR, "term2"),
@@ -108,21 +140,24 @@ TEST_F(RawQueryTokenizerTest, Parentheses) {
EqualsToken(Token::Type::REGULAR, "term3"),
EqualsToken(Token::Type::REGULAR, "term4"),
EqualsToken(Token::Type::QUERY_RIGHT_PARENTHESES, ""),
- EqualsToken(Token::Type::QUERY_RIGHT_PARENTHESES, ""))));
+ EqualsToken(Token::Type::QUERY_RIGHT_PARENTHESES, "")));
- EXPECT_THAT(raw_query_tokenizer->TokenizeAll("term1(term2)"),
- IsOkAndHolds(ElementsAre(
- EqualsToken(Token::Type::REGULAR, "term1"),
+ ICING_ASSERT_OK_AND_ASSIGN(query_tokens,
+ raw_query_tokenizer->TokenizeAll("term1(term2)"));
+ EXPECT_THAT(
+ query_tokens,
+ ElementsAre(EqualsToken(Token::Type::REGULAR, "term1"),
EqualsToken(Token::Type::QUERY_LEFT_PARENTHESES, ""),
EqualsToken(Token::Type::REGULAR, "term2"),
- EqualsToken(Token::Type::QUERY_RIGHT_PARENTHESES, ""))));
-
- EXPECT_THAT(raw_query_tokenizer->TokenizeAll("(term1)term2"),
- IsOkAndHolds(ElementsAre(
- EqualsToken(Token::Type::QUERY_LEFT_PARENTHESES, ""),
- EqualsToken(Token::Type::REGULAR, "term1"),
- EqualsToken(Token::Type::QUERY_RIGHT_PARENTHESES, ""),
- EqualsToken(Token::Type::REGULAR, "term2"))));
+ EqualsToken(Token::Type::QUERY_RIGHT_PARENTHESES, "")));
+
+ ICING_ASSERT_OK_AND_ASSIGN(query_tokens,
+ raw_query_tokenizer->TokenizeAll("(term1)term2"));
+ EXPECT_THAT(query_tokens,
+ ElementsAre(EqualsToken(Token::Type::QUERY_LEFT_PARENTHESES, ""),
+ EqualsToken(Token::Type::REGULAR, "term1"),
+ EqualsToken(Token::Type::QUERY_RIGHT_PARENTHESES, ""),
+ EqualsToken(Token::Type::REGULAR, "term2")));
EXPECT_THAT(raw_query_tokenizer->TokenizeAll("(term1)(term2)"),
IsOkAndHolds(ElementsAre(
diff --git a/icing/tokenization/reverse_jni/reverse-jni-language-segmenter.cc b/icing/tokenization/reverse_jni/reverse-jni-language-segmenter.cc
index cb474c6..e5de6e6 100644
--- a/icing/tokenization/reverse_jni/reverse-jni-language-segmenter.cc
+++ b/icing/tokenization/reverse_jni/reverse-jni-language-segmenter.cc
@@ -43,46 +43,38 @@ class ReverseJniLanguageSegmenterIterator : public LanguageSegmenter::Iterator {
// Advances to the next term. Returns false if it has reached the end.
bool Advance() override {
- while (true) {
- // Prerequisite check
- if (IsDone()) {
- return false;
- }
-
- if (term_end_exclusive_.utf16_index() == 0) {
- int first = break_iterator_->First();
- if (!term_start_.MoveToUtf16(first)) {
- // First is guaranteed to succeed and return a position within bonds.
- // So the only possible failure could be an invalid sequence. Mark as
- // DONE and return.
- MarkAsDone();
- return false;
- }
- } else {
- term_start_ = term_end_exclusive_;
- }
+ // Prerequisite check
+ if (IsDone()) {
+ return false;
+ }
- int next_utf16_index_exclusive = break_iterator_->Next();
- // Reached the end
- if (next_utf16_index_exclusive == ReverseJniBreakIterator::kDone) {
- MarkAsDone();
- return false;
- }
- if (!term_end_exclusive_.MoveToUtf16(next_utf16_index_exclusive)) {
- // next_utf16_index_exclusive is guaranteed to be within bonds thanks to
- // the check for kDone above. So the only possible failure could be an
- // invalid sequence. Mark as DONE and return.
+ if (term_end_exclusive_.utf16_index() == 0) {
+ int first = break_iterator_->First();
+ if (!term_start_.MoveToUtf16(first)) {
+ // First is guaranteed to succeed and return a position within bounds.
+ // So the only possible failure could be an invalid sequence. Mark as
+ // DONE and return.
MarkAsDone();
return false;
}
+ } else {
+ term_start_ = term_end_exclusive_;
+ }
- // Check if the current term is valid. We consider any term valid if its
- // first character is valid. If it's not valid, then we need to advance to
- // the next term.
- if (IsValidTerm()) {
- return true;
- }
+ int next_utf16_index_exclusive = break_iterator_->Next();
+ // Reached the end
+ if (next_utf16_index_exclusive == ReverseJniBreakIterator::kDone) {
+ MarkAsDone();
+ return false;
}
+ if (!term_end_exclusive_.MoveToUtf16(next_utf16_index_exclusive)) {
+ // next_utf16_index_exclusive is guaranteed to be within bounds thanks to
+ // the check for kDone above. So the only possible failure could be an
+ // invalid sequence. Mark as DONE and return.
+ MarkAsDone();
+ return false;
+ }
+ return true;
}
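With the outer loop and validity check gone, Advance() now surfaces every boundary the break iterator reports, punctuation included, so any term filtering must happen in the caller. A minimal caller-side sketch, assuming an iterator `itr` obtained from LanguageSegmenter::Segment() as in the tests below, with a hypothetical ProcessTerm() consumer:

  while (itr->Advance()) {
    // Each successful Advance() yields the next segment, now including
    // punctuation such as "。" that the deleted IsValidTerm() used to skip.
    ProcessTerm(itr->GetTerm());
  }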
// Returns the current term. It can be called only when Advance() returns
@@ -245,7 +237,7 @@ class ReverseJniLanguageSegmenterIterator : public LanguageSegmenter::Iterator {
// 4. The start and end indices point to a segment, but we need to ensure
// that this segment is 1) valid and 2) ends before offset. Otherwise, we'll
// need a segment prior to this one.
- if (term_end_exclusive_.utf32_index() > offset || !IsValidTerm()) {
+ if (term_end_exclusive_.utf32_index() > offset) {
return ResetToTermEndingBeforeUtf32(term_start_.utf32_index());
}
return term_start_.utf32_index();
@@ -285,24 +277,6 @@ class ReverseJniLanguageSegmenterIterator : public LanguageSegmenter::Iterator {
return term_end_exclusive_.utf16_index() == ReverseJniBreakIterator::kDone;
}
- bool IsValidTerm() const {
- // Rule 1: all ASCII terms will be returned.
- // We know it's a ASCII term by checking the first char.
- if (i18n_utils::IsAscii(text_[term_start_.utf8_index()])) {
- return true;
- }
-
- UChar32 uchar32 = i18n_utils::GetUChar32At(text_.data(), text_.length(),
- term_start_.utf8_index());
- // Rule 2: for non-ASCII terms, only the alphanumeric terms are returned.
- // We know it's an alphanumeric term by checking the first unicode
- // character.
- if (i18n_utils::IsAlphaNumeric(uchar32)) {
- return true;
- }
- return false;
- }
-
// All of ReverseJniBreakIterator's functions return UTF-16 boundaries. So
// this class needs to maintain state to convert between UTF-16 and UTF-8.
std::unique_ptr<ReverseJniBreakIterator> break_iterator_;
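The deleted IsValidTerm() encoded two rules: every ASCII term is kept, and a non-ASCII term is kept only when its first code point is alphanumeric. A downstream consumer that still wants the old behavior could reinstate it with the same i18n_utils helpers the removed code called; a sketch, where the function name and the UTF-8 start-offset parameter are illustrative:

  bool MatchesOldPolicy(std::string_view text, int term_utf8_start) {
    // Rule 1: all ASCII terms pass.
    if (i18n_utils::IsAscii(text[term_utf8_start])) {
      return true;
    }
    // Rule 2: non-ASCII terms pass only if the first code point is
    // alphanumeric.
    UChar32 uchar32 = i18n_utils::GetUChar32At(text.data(), text.length(),
                                               term_utf8_start);
    return i18n_utils::IsAlphaNumeric(uchar32);
  }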
diff --git a/icing/tokenization/reverse_jni/reverse-jni-language-segmenter_test.cc b/icing/tokenization/reverse_jni/reverse-jni-language-segmenter_test.cc
index 45d6475..277ece6 100644
--- a/icing/tokenization/reverse_jni/reverse-jni-language-segmenter_test.cc
+++ b/icing/tokenization/reverse_jni/reverse-jni-language-segmenter_test.cc
@@ -185,7 +185,7 @@ TEST_P(ReverseJniLanguageSegmenterTest, Non_ASCII_Non_Alphabetic) {
// Full-width (non-ASCII) punctuation marks and special characters are left
// out.
EXPECT_THAT(language_segmenter->GetAllTerms("γ€‚οΌŸΒ·Hello!×"),
- IsOkAndHolds(ElementsAre("Hello")));
+ IsOkAndHolds(ElementsAre("。", "?", "Β·", "Hello", "!", "Γ—")));
}
TEST_P(ReverseJniLanguageSegmenterTest, Acronym) {
@@ -246,9 +246,9 @@ TEST_P(ReverseJniLanguageSegmenterTest, WordConnector) {
// Connectors don't connect if one side is an invalid term (?)
EXPECT_THAT(language_segmenter->GetAllTerms("bar:baz:?"),
- IsOkAndHolds(ElementsAre("bar:baz", ":")));
+ IsOkAndHolds(ElementsAre("bar:baz", ":", "?")));
EXPECT_THAT(language_segmenter->GetAllTerms("?:bar:baz"),
- IsOkAndHolds(ElementsAre(":", "bar:baz")));
+ IsOkAndHolds(ElementsAre("?", ":", "bar:baz")));
EXPECT_THAT(language_segmenter->GetAllTerms("3:14"),
IsOkAndHolds(ElementsAre("3", ":", "14")));
EXPECT_THAT(language_segmenter->GetAllTerms("私:は"),
@@ -413,15 +413,17 @@ TEST_P(ReverseJniLanguageSegmenterTest, CJKT) {
// have whitespaces as word delimiter.
// Chinese
- EXPECT_THAT(language_segmenter->GetAllTerms("ζˆ‘ζ―ε€©θ΅°θ·―εŽ»δΈŠη­γ€‚"),
- IsOkAndHolds(ElementsAre("ζˆ‘", "每倩", "θ΅°θ·―", "去", "上班")));
+ EXPECT_THAT(
+ language_segmenter->GetAllTerms("ζˆ‘ζ―ε€©θ΅°θ·―εŽ»δΈŠη­γ€‚"),
+ IsOkAndHolds(ElementsAre("ζˆ‘", "每倩", "θ΅°θ·―", "去", "上班", "。")));
// Japanese
EXPECT_THAT(language_segmenter->GetAllTerms("私は毎ζ—₯仕事に歩いています。"),
IsOkAndHolds(ElementsAre("私", "は", "毎ζ—₯", "δ»•δΊ‹", "に", "ζ­©",
- "い", "てい", "ます")));
+ "い", "てい", "ます", "。")));
// Khmer
EXPECT_THAT(language_segmenter->GetAllTerms("αž‰αž»αŸ†αžŠαžΎαžšαž‘αŸ…αž’αŸ’αžœαžΎαž€αžΆαžšαžšαžΆαž›αŸ‹αžαŸ’αž„αŸƒαŸ”"),
- IsOkAndHolds(ElementsAre("αž‰αž»αŸ†", "αžŠαžΎαžšαž‘αŸ…", "αž’αŸ’αžœαžΎαž€αžΆαžš", "αžšαžΆαž›αŸ‹αžαŸ’αž„αŸƒ")));
+ IsOkAndHolds(ElementsAre("αž‰αž»αŸ†", "αžŠαžΎαžšαž‘αŸ…", "αž’αŸ’αžœαžΎαž€αžΆαžš", "αžšαžΆαž›αŸ‹αžαŸ’αž„αŸƒ", "αŸ”")));
+
// Thai
EXPECT_THAT(
language_segmenter->GetAllTerms("ฉันเดินไปทำงานทุกวัน"),
@@ -852,16 +854,19 @@ TEST_P(ReverseJniLanguageSegmenterTest, ChineseResetToTermAfterUtf32) {
ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<LanguageSegmenter::Iterator> itr,
language_segmenter->Segment(kChinese));
// String: "ζˆ‘ζ―ε€©θ΅°θ·―εŽ»δΈŠη­γ€‚"
- // ^ ^ ^ ^^
- // UTF-8 idx: 0 3 9 15 18
- // UTF-832 idx: 0 1 3 5 6
+ // ^ ^ ^ ^^ ^
+ // UTF-8 idx: 0 3 9 15 18 24
+ // UTF-32 idx: 0 1 3 5 6 8
EXPECT_THAT(itr->ResetToTermStartingAfterUtf32(0), IsOkAndHolds(Eq(1)));
EXPECT_THAT(itr->GetTerm(), Eq("每倩"));
EXPECT_THAT(itr->ResetToTermStartingAfterUtf32(2), IsOkAndHolds(Eq(3)));
EXPECT_THAT(itr->GetTerm(), Eq("θ΅°θ·―"));
- EXPECT_THAT(itr->ResetToTermStartingAfterUtf32(7),
+ EXPECT_THAT(itr->ResetToTermStartingAfterUtf32(7), IsOkAndHolds(Eq(8)));
+ EXPECT_THAT(itr->GetTerm(), Eq("。"));
+
+ EXPECT_THAT(itr->ResetToTermStartingAfterUtf32(8),
StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
EXPECT_THAT(itr->GetTerm(), IsEmpty());
}
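The widened index comments follow directly from UTF-8 widths: each of the eight CJK code points before "。" occupies three bytes, so the full stop starts at UTF-8 index 8 × 3 = 24 and UTF-32 index 8. That is why ResetToTermStartingAfterUtf32(7) now lands on "。" and only offset 8 reports NOT_FOUND. The Japanese and Khmer hunks below update the same way: the terminating punctuation sits at 13 × 3 = 39 and 23 × 3 = 69 UTF-8 bytes, i.e. UTF-32 offsets 13 and 23.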
@@ -876,18 +881,21 @@ TEST_P(ReverseJniLanguageSegmenterTest, JapaneseResetToTermAfterUtf32) {
ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<LanguageSegmenter::Iterator> itr,
language_segmenter->Segment(kJapanese));
// String: "私は毎ζ—₯仕事に歩いています。"
- // ^ ^ ^ ^ ^ ^ ^ ^ ^
- // UTF-8 idx: 0 3 6 12 18212427 33
- // UTF-32 idx: 0 1 2 4 6 7 8 9 11
+ // ^ ^ ^ ^ ^ ^ ^ ^ ^ ^
+ // UTF-8 idx: 0 3 6 12 18212427 33 39
+ // UTF-32 idx: 0 1 2 4 6 7 8 9 11 13
EXPECT_THAT(itr->ResetToTermStartingAfterUtf32(0), IsOkAndHolds(Eq(1)));
EXPECT_THAT(itr->GetTerm(), Eq("は"));
- EXPECT_THAT(itr->ResetToTermStartingAfterUtf32(11),
+ EXPECT_THAT(itr->ResetToTermStartingAfterUtf32(13),
StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
EXPECT_THAT(itr->GetTerm(), IsEmpty());
EXPECT_THAT(itr->ResetToTermStartingAfterUtf32(3), IsOkAndHolds(Eq(4)));
EXPECT_THAT(itr->GetTerm(), Eq("δ»•δΊ‹"));
+
+ EXPECT_THAT(itr->ResetToTermStartingAfterUtf32(12), IsOkAndHolds(Eq(13)));
+ EXPECT_THAT(itr->GetTerm(), Eq("。"));
}
TEST_P(ReverseJniLanguageSegmenterTest, KhmerResetToTermAfterUtf32) {
@@ -899,13 +907,16 @@ TEST_P(ReverseJniLanguageSegmenterTest, KhmerResetToTermAfterUtf32) {
ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<LanguageSegmenter::Iterator> itr,
language_segmenter->Segment(kKhmer));
// String: "αž‰αž»αŸ†αžŠαžΎαžšαž‘αŸ…αž’αŸ’αžœαžΎαž€αžΆαžšαžšαžΆαž›αŸ‹αžαŸ’αž„αŸƒαŸ”"
- // ^ ^ ^ ^
- // UTF-8 idx: 0 9 24 45
- // UTF-32 idx: 0 3 8 15
+ // ^ ^ ^ ^ ^
+ // UTF-8 idx: 0 9 24 45 69
+ // UTF-32 idx: 0 3 8 15 23
EXPECT_THAT(itr->ResetToTermStartingAfterUtf32(0), IsOkAndHolds(Eq(3)));
EXPECT_THAT(itr->GetTerm(), Eq("αžŠαžΎαžšαž‘αŸ…"));
- EXPECT_THAT(itr->ResetToTermStartingAfterUtf32(15),
+ EXPECT_THAT(itr->ResetToTermStartingAfterUtf32(15), IsOkAndHolds(Eq(23)));
+ EXPECT_THAT(itr->GetTerm(), Eq("αŸ”"));
+
+ EXPECT_THAT(itr->ResetToTermStartingAfterUtf32(23),
StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
EXPECT_THAT(itr->GetTerm(), IsEmpty());
diff --git a/icing/tokenization/tokenizer.h b/icing/tokenization/tokenizer.h
index 2bc18cc..24f8269 100644
--- a/icing/tokenization/tokenizer.h
+++ b/icing/tokenization/tokenizer.h
@@ -40,15 +40,6 @@ class Tokenizer {
public:
virtual ~Tokenizer() = default;
- enum Type {
- // Index tokenizers
- PLAIN, // Used to tokenize plain text input
- VERBATIM, // Used to tokenize the input text in verbatim
-
- // Query tokenizers
- RAW_QUERY, // Used to tokenize raw queries
- };
-
// An iterator helping to get tokens.
// Example usage:
//