summary refs log tree commit diff
path: root/native/annotator
diff options
context:
space:
mode:
author Tony Mak <tonymak@google.com> 2020-01-08 17:30:51 +0000
committer Tony Mak <tonymak@google.com> 2020-01-08 21:39:22 +0000
commit 76d8096f5f552ed3c11131c650ce72dc68a2e254 (patch)
tree e7edeb0a629741de52b5117c53597ac35296bdc4 /native/annotator
parent 1bcd2f6cb74ad9fb0fa468619c2f00232b2075f3 (diff)
download libtextclassifier-76d8096f5f552ed3c11131c650ce72dc68a2e254.tar.gz
Import libtextclassifier
Test: atest TextClassifierServiceTest Change-Id: If6e67d41838426b7ff3451fa74c71780b788568c
Diffstat (limited to 'native/annotator')
-rw-r--r-- native/annotator/annotator.cc 85
-rw-r--r-- native/annotator/annotator.h 7
-rw-r--r-- native/annotator/annotator_jni.cc 12
-rw-r--r-- native/annotator/annotator_jni.h 4
-rw-r--r-- native/annotator/collections.h 5
-rwxr-xr-x native/annotator/entity-data.fbs 14
-rwxr-xr-x native/annotator/model.fbs 14
-rw-r--r-- native/annotator/person_name/person-name-engine-dummy.h 56
-rw-r--r-- native/annotator/person_name/person-name-engine.h 22
-rwxr-xr-x native/annotator/person_name/person_name_model.fbs 40
-rw-r--r-- native/annotator/test_data/test_model.fb bin 657224 -> 682912 bytes
-rw-r--r-- native/annotator/test_data/test_person_name_model.fb bin 0 -> 64 bytes
-rw-r--r-- native/annotator/test_data/wrong_embeddings.fb bin 395504 -> 424444 bytes
13 files changed, 254 insertions, 5 deletions
diff --git a/native/annotator/annotator.cc b/native/annotator/annotator.cc
index abb57e8..8969026 100644
--- a/native/annotator/annotator.cc
+++ b/native/annotator/annotator.cc
@@ -61,6 +61,16 @@ const Model* LoadAndVerifyModel(const void* addr, int size) {
}
}
+// Runs the flatbuffers verifier over the `size` bytes starting at `addr`.
+// Returns the PersonNameModel root of the buffer when verification succeeds
+// and nullptr otherwise.
+const PersonNameModel* LoadAndVerifyPersonNameModel(const void* addr,
+                                                    int size) {
+  const uint8_t* const buffer = reinterpret_cast<const uint8_t*>(addr);
+  flatbuffers::Verifier verifier(buffer, size);
+  if (!VerifyPersonNameModelBuffer(verifier)) {
+    return nullptr;
+  }
+  return GetPersonNameModel(addr);
+}
+
// If lib is not nullptr, just returns lib. Otherwise, if lib is nullptr, will
// create a new instance, assign ownership to owned_lib, and return it.
const UniLib* MaybeCreateUnilib(const UniLib* lib,
@@ -548,6 +558,37 @@ bool Annotator::InitializeInstalledAppEngine(
return true;
}
+// Maps the file region described by `fd`, `offset` and `size`, verifies that
+// it holds a PersonNameModel and, if that model is enabled, builds the person
+// name engine from it.
+//
+// Returns false when the mapping, the verification or the engine
+// initialization fails. Returns true without creating an engine when the model
+// is not enabled.
+bool Annotator::InitializePersonNameEngineFromFileDescriptor(int fd, int offset,
+                                                             int size) {
+  // NOTE(review): the mapping is owned by this local and goes away when the
+  // function returns, while the engine built from it is kept. This presumably
+  // relies on PersonNameEngine::Initialize() not retaining pointers into the
+  // model buffer -- confirm against the real engine implementation.
+  std::unique_ptr<ScopedMmap> model_mmap(
+      new ScopedMmap(fd, offset, size));
+  if (!model_mmap->handle().ok()) {
+    TC3_LOG(ERROR) << "Mmap for person name model failed.";
+    return false;
+  }
+
+  const PersonNameModel* const model = LoadAndVerifyPersonNameModel(
+      model_mmap->handle().start(), model_mmap->handle().num_bytes());
+  if (model == nullptr) {
+    TC3_LOG(ERROR) << "Person name model verification failed.";
+    return false;
+  }
+
+  // A disabled model is not an error: report success and leave
+  // person_name_engine_ untouched.
+  if (!model->enabled()) {
+    return true;
+  }
+
+  std::unique_ptr<PersonNameEngine> engine(new PersonNameEngine(unilib_));
+  const bool initialized = engine->Initialize(model);
+  if (!initialized) {
+    TC3_LOG(ERROR) << "Failed to initialize the person name engine.";
+    return false;
+  }
+  person_name_engine_ = std::move(engine);
+  return true;
+}
+
namespace {
int CountDigits(const std::string& str, CodepointSpan selection_indices) {
@@ -775,6 +816,11 @@ CodepointSpan Annotator::SuggestSelection(
TC3_LOG(ERROR) << "Duration annotator failed in suggest selection.";
return original_click_indices;
}
+ if (person_name_engine_ != nullptr &&
+ !person_name_engine_->Chunk(context_unicode, tokens, &candidates)) {
+ TC3_LOG(ERROR) << "Person name suggest selection failed.";
+ return original_click_indices;
+ }
// Sort candidates according to their position in the input, so that the next
// code can assume that any connected component of overlapping spans forms a
@@ -930,11 +976,13 @@ bool Annotator::ResolveConflict(
InterpreterManager* interpreter_manager,
std::vector<int>* chosen_indices) const {
std::vector<int> conflicting_indices;
- std::unordered_map<int, float> scores;
+ std::unordered_map<int, std::pair<float, int>> scores_lengths;
for (int i = start_index; i < end_index; ++i) {
conflicting_indices.push_back(i);
if (!candidates[i].classification.empty()) {
- scores[i] = GetPriorityScore(candidates[i].classification);
+ scores_lengths[i] = {
+ GetPriorityScore(candidates[i].classification),
+ candidates[i].span.second - candidates[i].span.first};
continue;
}
@@ -951,12 +999,23 @@ bool Annotator::ResolveConflict(
}
if (!classification.empty()) {
- scores[i] = GetPriorityScore(classification);
+ scores_lengths[i] = {
+ GetPriorityScore(classification),
+ candidates[i].span.second - candidates[i].span.first};
}
}
- std::sort(conflicting_indices.begin(), conflicting_indices.end(),
- [&scores](int i, int j) { return scores[i] > scores[j]; });
+  // Order the conflicting candidates by descending priority score. When two
+  // scores are equal and the model sets prioritize_longest_annotation, the
+  // longer span comes first.
+  //
+  // The comparator captures only a reference and `this`: it is passed to
+  // std::sort by value, so anything captured by value is copied along with
+  // it. `candidates` and `conflicting_indices` are not read by the comparator
+  // and are therefore not captured.
+  //
+  // scores_lengths is indexed with operator[], so an index that has no entry
+  // yet is value-initialized and compares as {0, 0}.
+  std::sort(
+      conflicting_indices.begin(), conflicting_indices.end(),
+      [&scores_lengths, this](int i, int j) {
+        if (scores_lengths[i].first == scores_lengths[j].first &&
+            this->model_->triggering_options() != nullptr &&
+            this->model_->triggering_options()
+                ->prioritize_longest_annotation()) {
+          return scores_lengths[i].second > scores_lengths[j].second;
+        }
+        return scores_lengths[i].first > scores_lengths[j].first;
+      });
// Here we keep a set of indices that were chosen, per-source, to enable
// effective computation.
@@ -1559,6 +1618,14 @@ std::vector<ClassificationResult> Annotator::ClassifyText(
candidates.push_back({selection_indices, {contact_result}});
}
+ // Try the person name engine.
+ ClassificationResult person_name_result;
+ if (person_name_engine_ &&
+ person_name_engine_->ClassifyText(context, selection_indices,
+ &person_name_result)) {
+ candidates.push_back({selection_indices, {person_name_result}});
+ }
+
// Try the installed app engine.
// TODO(b/126579108): Propagate error status.
ClassificationResult installed_app_result;
@@ -1923,6 +1990,14 @@ std::vector<AnnotatedSpan> Annotator::Annotate(
return {};
}
+ // Annotate with the person name engine.
+ if (is_entity_type_enabled(Collections::PersonName()) &&
+ person_name_engine_ &&
+ !person_name_engine_->Chunk(context_unicode, tokens, &candidates)) {
+ TC3_LOG(ERROR) << "Couldn't run person name engine Chunk.";
+ return {};
+ }
+
// Sort candidates according to their position in the input, so that the next
// code can assume that any connected component of overlapping spans forms a
// contiguous block.
diff --git a/native/annotator/annotator.h b/native/annotator/annotator.h
index dabd894..606d0bb 100644
--- a/native/annotator/annotator.h
+++ b/native/annotator/annotator.h
@@ -34,6 +34,7 @@
#include "annotator/model-executor.h"
#include "annotator/model_generated.h"
#include "annotator/number/number.h"
+#include "annotator/person_name/person-name-engine.h"
#include "annotator/strip-unpaired-brackets.h"
#include "annotator/types.h"
#include "annotator/zlib-utils.h"
@@ -226,6 +227,11 @@ class Annotator {
// Initializes the installed app engine with the given config.
bool InitializeInstalledAppEngine(const std::string& serialized_config);
+ // Initializes the person name engine with the person name model found in the
+ // provided file descriptor; `offset` and `size` are passed on to the memory
+ // mapping of that file.
+ // Returns false if the mapping, the model verification or the engine
+ // initialization fails. Returns true without creating an engine if the model
+ // is not enabled.
+ bool InitializePersonNameEngineFromFileDescriptor(int fd, int offset,
+ int size);
+
// Runs inference for given a context and current selection (i.e. index
// of the first and one past last selected characters (utf8 codepoint
// offsets)). Returns the indices (utf8 codepoint offsets) of the selection
@@ -516,6 +522,7 @@ class Annotator {
std::unique_ptr<const InstalledAppEngine> installed_app_engine_;
std::unique_ptr<const NumberAnnotator> number_annotator_;
std::unique_ptr<const DurationAnnotator> duration_annotator_;
+ std::unique_ptr<const PersonNameEngine> person_name_engine_;
// Builder for creating extra data.
const reflection::Schema* entity_data_schema_;
diff --git a/native/annotator/annotator_jni.cc b/native/annotator/annotator_jni.cc
index 5c42c3e..e0c6262 100644
--- a/native/annotator/annotator_jni.cc
+++ b/native/annotator/annotator_jni.cc
@@ -560,6 +560,18 @@ TC3_JNI_METHOD(jboolean, TC3_ANNOTATOR_CLASS_NAME,
return model->InitializeInstalledAppEngine(serialized_config_string);
}
+// JNI entry point that initializes the person name engine of the annotator
+// referenced by `ptr` from the model described by `fd`, `offset` and `size`.
+// Returns false if `ptr` is null, if `offset` or `size` cannot be represented
+// as the int arguments the annotator expects, or if the initialization itself
+// fails.
+TC3_JNI_METHOD(jboolean, TC3_ANNOTATOR_CLASS_NAME,
+               nativeInitializePersonNameEngine)
+(JNIEnv* env, jobject thiz, jlong ptr, jint fd, jlong offset, jlong size) {
+  if (!ptr) {
+    return false;
+  }
+
+  // jlong is 64 bits wide while the annotator takes int arguments. Reject
+  // values that do not survive the conversion instead of silently truncating
+  // them and mapping a different region of the file.
+  const int model_offset = static_cast<int>(offset);
+  const int model_size = static_cast<int>(size);
+  if (model_offset != offset || model_size != size) {
+    return false;
+  }
+
+  Annotator* model = reinterpret_cast<AnnotatorJniContext*>(ptr)->model();
+
+  return model->InitializePersonNameEngineFromFileDescriptor(fd, model_offset,
+                                                             model_size);
+}
+
TC3_JNI_METHOD(jlong, TC3_ANNOTATOR_CLASS_NAME, nativeGetNativeModelPtr)
(JNIEnv* env, jobject thiz, jlong ptr) {
if (!ptr) {
diff --git a/native/annotator/annotator_jni.h b/native/annotator/annotator_jni.h
index 0789e76..893f84e 100644
--- a/native/annotator/annotator_jni.h
+++ b/native/annotator/annotator_jni.h
@@ -49,6 +49,10 @@ TC3_JNI_METHOD(jboolean, TC3_ANNOTATOR_CLASS_NAME,
nativeInitializeInstalledAppEngine)
(JNIEnv* env, jobject thiz, jlong ptr, jbyteArray serialized_config);
+// Initializes the person name engine of the annotator referenced by `ptr`
+// from the person name model described by `fd`, `offset` and `size`.
+// Returns false when `ptr` is null or the initialization fails.
+TC3_JNI_METHOD(jboolean, TC3_ANNOTATOR_CLASS_NAME,
+ nativeInitializePersonNameEngine)
+(JNIEnv* env, jobject thiz, jlong ptr, jint fd, jlong offset, jlong size);
+
TC3_JNI_METHOD(jlong, TC3_ANNOTATOR_CLASS_NAME, nativeGetNativeModelPtr)
(JNIEnv* env, jobject thiz, jlong ptr);
diff --git a/native/annotator/collections.h b/native/annotator/collections.h
index 0b75463..7783b41 100644
--- a/native/annotator/collections.h
+++ b/native/annotator/collections.h
@@ -114,6 +114,11 @@ class Collections {
*[]() { return new std::string("percentage"); }();
return value;
}
+  // Returns the collection name "person_name". Every call yields a reference
+  // to the same string object.
+  static const std::string& PersonName() {
+    // Allocated once on first use and never deleted by this function.
+    static const std::string* const value = new std::string("person_name");
+    return *value;
+  }
static const std::string& Phone() {
static const std::string& value =
*[]() { return new std::string("phone"); }();
diff --git a/native/annotator/entity-data.fbs b/native/annotator/entity-data.fbs
index 0163bfd..59a213f 100755
--- a/native/annotator/entity-data.fbs
+++ b/native/annotator/entity-data.fbs
@@ -166,6 +166,19 @@ table ParcelTracking {
tracking_number:string (shared);
}
+// Parsed money amount.
+// Referenced from EntityData through its `money` field.
+namespace libtextclassifier3.EntityData_;
+table Money {
+ // String representation of currency, unnormalized.
+ unnormalized_currency:string (shared);
+
+ // Whole part of the amount (e.g. 123 from "CHF 123.45").
+ amount_whole_part:int;
+
+ // Decimal part of the amount (e.g. 45 from "CHF 123.45").
+ // NOTE(review): stored as a plain int, so ".05" and ".5" would both read as
+ // 5 unless the producer normalizes the number of digits -- confirm how the
+ // value is filled in.
+ amount_decimal_part:int;
+}
+
// Represents an entity annotated in text.
namespace libtextclassifier3;
table EntityData {
@@ -187,6 +200,7 @@ table EntityData {
isbn:EntityData_.Isbn;
iban:EntityData_.Iban;
parcel:EntityData_.ParcelTracking;
+ money:EntityData_.Money;
}
root_type libtextclassifier3.EntityData;
diff --git a/native/annotator/model.fbs b/native/annotator/model.fbs
index 6b7b007..e765ef6 100755
--- a/native/annotator/model.fbs
+++ b/native/annotator/model.fbs
@@ -338,6 +338,16 @@ table DatetimeModel {
prefer_future_for_unspecified_date:bool = false;
}
+// Configuration stored in Model.grammar_datetime_model.
+// NOTE(review): presumably the options of a grammar-based datetime annotator;
+// nothing in this schema says how the fields are consumed -- confirm.
+namespace libtextclassifier3;
+table GrammarDatetimeModel {
+ // List of BCP 47 locale strings representing all locales supported by the
+ // model.
+ locales:[string];
+
+ // If true, will give only future dates (when the day is not specified).
+ // DatetimeModel declares a field with the same name and default.
+ prefer_future_for_unspecified_date:bool = false;
+}
+
namespace libtextclassifier3.DatetimeModelLibrary_;
table Item {
key:string (shared);
@@ -373,6 +383,9 @@ table ModelTriggeringOptions {
// Priority score assigned to knowledge engine annotations.
knowledge_priority_score:float = 0;
+
+ // If true, will prioritize the longest annotation during conflict resolution.
+ prioritize_longest_annotation:bool = false;
}
// Options controlling the output of the classifier.
@@ -468,6 +481,7 @@ table Model {
triggering_locales:string (shared);
embedding_pruning_mask:Model_.EmbeddingPruningMask;
+ grammar_datetime_model:GrammarDatetimeModel;
}
// Method for selecting the center token.
diff --git a/native/annotator/person_name/person-name-engine-dummy.h b/native/annotator/person_name/person-name-engine-dummy.h
new file mode 100644
index 0000000..91ae2e5
--- /dev/null
+++ b/native/annotator/person_name/person-name-engine-dummy.h
@@ -0,0 +1,56 @@
+/*
+ * Copyright (C) 2018 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBTEXTCLASSIFIER_ANNOTATOR_PERSON_NAME_PERSON_NAME_ENGINE_DUMMY_H_
+#define LIBTEXTCLASSIFIER_ANNOTATOR_PERSON_NAME_PERSON_NAME_ENGINE_DUMMY_H_
+
+#include <string>
+#include <vector>
+
+#include "annotator/feature-processor.h"
+#include "annotator/person_name/person_name_model_generated.h"
+#include "annotator/types.h"
+#include "utils/base/logging.h"
+#include "utils/utf8/unicodetext.h"
+#include "utils/utf8/unilib.h"
+
+namespace libtextclassifier3 {
+
+// A dummy implementation of the person name engine.
+// This is the PersonNameEngine that "annotator/person_name/person-name-engine.h"
+// pulls in. It never produces annotations: Initialize() always fails,
+// ClassifyText() never reports a result and Chunk() succeeds without touching
+// its output.
+class PersonNameEngine {
+ public:
+ // `unilib` is accepted for interface compatibility and is not used.
+ explicit PersonNameEngine(const UniLib* unilib) {}
+
+ // Always logs an error and returns false; `model` is ignored.
+ bool Initialize(const PersonNameModel* model) {
+ TC3_LOG(ERROR) << "No person name engine to initialize.";
+ return false;
+ }
+
+ // Always returns false and leaves `classification_result` unmodified.
+ bool ClassifyText(const std::string& context, CodepointSpan selection_indices,
+ ClassificationResult* classification_result) const {
+ return false;
+ }
+
+ // Always returns true and leaves `result` unmodified.
+ bool Chunk(const UnicodeText& context_unicode,
+ const std::vector<Token>& tokens,
+ std::vector<AnnotatedSpan>* result) const {
+ return true;
+ }
+};
+
+} // namespace libtextclassifier3
+
+#endif // LIBTEXTCLASSIFIER_ANNOTATOR_PERSON_NAME_PERSON_NAME_ENGINE_DUMMY_H_
diff --git a/native/annotator/person_name/person-name-engine.h b/native/annotator/person_name/person-name-engine.h
new file mode 100644
index 0000000..988fce3
--- /dev/null
+++ b/native/annotator/person_name/person-name-engine.h
@@ -0,0 +1,22 @@
+/*
+ * Copyright (C) 2018 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBTEXTCLASSIFIER_ANNOTATOR_PERSON_NAME_PERSON_NAME_ENGINE_H_
+#define LIBTEXTCLASSIFIER_ANNOTATOR_PERSON_NAME_PERSON_NAME_ENGINE_H_
+
+#include "annotator/person_name/person-name-engine-dummy.h"
+
+#endif // LIBTEXTCLASSIFIER_ANNOTATOR_PERSON_NAME_PERSON_NAME_ENGINE_H_
diff --git a/native/annotator/person_name/person_name_model.fbs b/native/annotator/person_name/person_name_model.fbs
new file mode 100755
index 0000000..6421341
--- /dev/null
+++ b/native/annotator/person_name/person_name_model.fbs
@@ -0,0 +1,40 @@
+//
+// Copyright (C) 2018 The Android Open Source Project
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+
+file_identifier "TC2 ";
+
+// Next ID: 2
+namespace libtextclassifier3.PersonNameModel_;
+table PersonName {
+ // Person name which is considered by the person name annotator. This
+ // attribute should contain 'atomic' person names, e.g., 'John' and 'Doe'
+ // should be two separate person names.
+ // required
+ person_name:string (shared);
+}
+
+// Root table of the person name model file.
+// Next ID: 3
+namespace libtextclassifier3;
+table PersonNameModel {
+ // Decides if the person name annotator is enabled.
+ // When false, Annotator::InitializePersonNameEngineFromFileDescriptor()
+ // returns success without creating a person name engine.
+ // required
+ enabled:bool;
+
+ // List of all person names which are considered by the person name annotator.
+ person_names:[PersonNameModel_.PersonName];
+}
+
+root_type libtextclassifier3.PersonNameModel;
diff --git a/native/annotator/test_data/test_model.fb b/native/annotator/test_data/test_model.fb
index 6bbb62e..9a14063 100644
--- a/native/annotator/test_data/test_model.fb
+++ b/native/annotator/test_data/test_model.fb
Binary files differ
diff --git a/native/annotator/test_data/test_person_name_model.fb b/native/annotator/test_data/test_person_name_model.fb
new file mode 100644
index 0000000..4752a23
--- /dev/null
+++ b/native/annotator/test_data/test_person_name_model.fb
Binary files differ
diff --git a/native/annotator/test_data/wrong_embeddings.fb b/native/annotator/test_data/wrong_embeddings.fb
index b25c70e..7c846a5 100644
--- a/native/annotator/test_data/wrong_embeddings.fb
+++ b/native/annotator/test_data/wrong_embeddings.fb
Binary files differ