diff options
author | Tony Mak <tonymak@google.com> | 2020-01-08 17:30:51 +0000 |
---|---|---|
committer | Tony Mak <tonymak@google.com> | 2020-01-08 21:39:22 +0000 |
commit | 76d8096f5f552ed3c11131c650ce72dc68a2e254 (patch) | |
tree | e7edeb0a629741de52b5117c53597ac35296bdc4 /native/annotator | |
parent | 1bcd2f6cb74ad9fb0fa468619c2f00232b2075f3 (diff) | |
download | libtextclassifier-76d8096f5f552ed3c11131c650ce72dc68a2e254.tar.gz |
Import libtextclassifier
Test: atest TextClassifierServiceTest
Change-Id: If6e67d41838426b7ff3451fa74c71780b788568c
Diffstat (limited to 'native/annotator')
-rw-r--r-- | native/annotator/annotator.cc | 85 | ||||
-rw-r--r-- | native/annotator/annotator.h | 7 | ||||
-rw-r--r-- | native/annotator/annotator_jni.cc | 12 | ||||
-rw-r--r-- | native/annotator/annotator_jni.h | 4 | ||||
-rw-r--r-- | native/annotator/collections.h | 5 | ||||
-rwxr-xr-x | native/annotator/entity-data.fbs | 14 | ||||
-rwxr-xr-x | native/annotator/model.fbs | 14 | ||||
-rw-r--r-- | native/annotator/person_name/person-name-engine-dummy.h | 56 | ||||
-rw-r--r-- | native/annotator/person_name/person-name-engine.h | 22 | ||||
-rwxr-xr-x | native/annotator/person_name/person_name_model.fbs | 40 | ||||
-rw-r--r-- | native/annotator/test_data/test_model.fb | bin | 657224 -> 682912 bytes | |||
-rw-r--r-- | native/annotator/test_data/test_person_name_model.fb | bin | 0 -> 64 bytes | |||
-rw-r--r-- | native/annotator/test_data/wrong_embeddings.fb | bin | 395504 -> 424444 bytes |
13 files changed, 254 insertions, 5 deletions
diff --git a/native/annotator/annotator.cc b/native/annotator/annotator.cc index abb57e8..8969026 100644 --- a/native/annotator/annotator.cc +++ b/native/annotator/annotator.cc @@ -61,6 +61,16 @@ const Model* LoadAndVerifyModel(const void* addr, int size) { } } +const PersonNameModel* LoadAndVerifyPersonNameModel(const void* addr, + int size) { + flatbuffers::Verifier verifier(reinterpret_cast<const uint8_t*>(addr), size); + if (VerifyPersonNameModelBuffer(verifier)) { + return GetPersonNameModel(addr); + } else { + return nullptr; + } +} + // If lib is not nullptr, just returns lib. Otherwise, if lib is nullptr, will // create a new instance, assign ownership to owned_lib, and return it. const UniLib* MaybeCreateUnilib(const UniLib* lib, @@ -548,6 +558,37 @@ bool Annotator::InitializeInstalledAppEngine( return true; } +bool Annotator::InitializePersonNameEngineFromFileDescriptor(int fd, int offset, + int size) { + std::unique_ptr<ScopedMmap> mmap(new ScopedMmap(fd, offset, size)); + + if (!mmap->handle().ok()) { + TC3_LOG(ERROR) << "Mmap for person name model failed."; + return false; + } + + const PersonNameModel* person_name_model = LoadAndVerifyPersonNameModel( + mmap->handle().start(), mmap->handle().num_bytes()); + + if (person_name_model == nullptr) { + TC3_LOG(ERROR) << "Person name model verification failed."; + return false; + } + + if (!person_name_model->enabled()) { + return true; + } + + std::unique_ptr<PersonNameEngine> person_name_engine( + new PersonNameEngine(unilib_)); + if (!person_name_engine->Initialize(person_name_model)) { + TC3_LOG(ERROR) << "Failed to initialize the person name engine."; + return false; + } + person_name_engine_ = std::move(person_name_engine); + return true; +} + namespace { int CountDigits(const std::string& str, CodepointSpan selection_indices) { @@ -775,6 +816,11 @@ CodepointSpan Annotator::SuggestSelection( TC3_LOG(ERROR) << "Duration annotator failed in suggest selection."; return original_click_indices; } + if (person_name_engine_ != nullptr && + !person_name_engine_->Chunk(context_unicode, tokens, &candidates)) { + TC3_LOG(ERROR) << "Person name suggest selection failed."; + return original_click_indices; + } // Sort candidates according to their position in the input, so that the next // code can assume that any connected component of overlapping spans forms a @@ -930,11 +976,13 @@ bool Annotator::ResolveConflict( InterpreterManager* interpreter_manager, std::vector<int>* chosen_indices) const { std::vector<int> conflicting_indices; - std::unordered_map<int, float> scores; + std::unordered_map<int, std::pair<float, int>> scores_lengths; for (int i = start_index; i < end_index; ++i) { conflicting_indices.push_back(i); if (!candidates[i].classification.empty()) { - scores[i] = GetPriorityScore(candidates[i].classification); + scores_lengths[i] = { + GetPriorityScore(candidates[i].classification), + candidates[i].span.second - candidates[i].span.first}; continue; } @@ -951,12 +999,23 @@ bool Annotator::ResolveConflict( } if (!classification.empty()) { - scores[i] = GetPriorityScore(classification); + scores_lengths[i] = { + GetPriorityScore(classification), + candidates[i].span.second - candidates[i].span.first}; } } - std::sort(conflicting_indices.begin(), conflicting_indices.end(), - [&scores](int i, int j) { return scores[i] > scores[j]; }); + std::sort( + conflicting_indices.begin(), conflicting_indices.end(), + [&scores_lengths, candidates, conflicting_indices, this](int i, int j) { + if (scores_lengths[i].first == scores_lengths[j].first && + this->model_->triggering_options() != nullptr && + this->model_->triggering_options() + ->prioritize_longest_annotation()) { + return scores_lengths[i].second > scores_lengths[j].second; + } + return scores_lengths[i].first > scores_lengths[j].first; + }); // Here we keep a set of indices that were chosen, per-source, to enable // effective computation. @@ -1559,6 +1618,14 @@ std::vector<ClassificationResult> Annotator::ClassifyText( candidates.push_back({selection_indices, {contact_result}}); } + // Try the person name engine. + ClassificationResult person_name_result; + if (person_name_engine_ && + person_name_engine_->ClassifyText(context, selection_indices, + &person_name_result)) { + candidates.push_back({selection_indices, {person_name_result}}); + } + // Try the installed app engine. // TODO(b/126579108): Propagate error status. ClassificationResult installed_app_result; @@ -1923,6 +1990,14 @@ std::vector<AnnotatedSpan> Annotator::Annotate( return {}; } + // Annotate with the person name engine. + if (is_entity_type_enabled(Collections::PersonName()) && + person_name_engine_ && + !person_name_engine_->Chunk(context_unicode, tokens, &candidates)) { + TC3_LOG(ERROR) << "Couldn't run person name engine Chunk."; + return {}; + } + // Sort candidates according to their position in the input, so that the next // code can assume that any connected component of overlapping spans forms a // contiguous block. diff --git a/native/annotator/annotator.h b/native/annotator/annotator.h index dabd894..606d0bb 100644 --- a/native/annotator/annotator.h +++ b/native/annotator/annotator.h @@ -34,6 +34,7 @@ #include "annotator/model-executor.h" #include "annotator/model_generated.h" #include "annotator/number/number.h" +#include "annotator/person_name/person-name-engine.h" #include "annotator/strip-unpaired-brackets.h" #include "annotator/types.h" #include "annotator/zlib-utils.h" @@ -226,6 +227,11 @@ class Annotator { // Initializes the installed app engine with the given config. bool InitializeInstalledAppEngine(const std::string& serialized_config); + // Initializes the person name engine with the given person name model in the + // provided file descriptor. + bool InitializePersonNameEngineFromFileDescriptor(int fd, int offset, + int size); + // Runs inference for given a context and current selection (i.e. index // of the first and one past last selected characters (utf8 codepoint // offsets)). Returns the indices (utf8 codepoint offsets) of the selection @@ -516,6 +522,7 @@ class Annotator { std::unique_ptr<const InstalledAppEngine> installed_app_engine_; std::unique_ptr<const NumberAnnotator> number_annotator_; std::unique_ptr<const DurationAnnotator> duration_annotator_; + std::unique_ptr<const PersonNameEngine> person_name_engine_; // Builder for creating extra data. const reflection::Schema* entity_data_schema_; diff --git a/native/annotator/annotator_jni.cc b/native/annotator/annotator_jni.cc index 5c42c3e..e0c6262 100644 --- a/native/annotator/annotator_jni.cc +++ b/native/annotator/annotator_jni.cc @@ -560,6 +560,18 @@ TC3_JNI_METHOD(jboolean, TC3_ANNOTATOR_CLASS_NAME, return model->InitializeInstalledAppEngine(serialized_config_string); } +TC3_JNI_METHOD(jboolean, TC3_ANNOTATOR_CLASS_NAME, + nativeInitializePersonNameEngine) +(JNIEnv* env, jobject thiz, jlong ptr, jint fd, jlong offset, jlong size) { + if (!ptr) { + return false; + } + + Annotator* model = reinterpret_cast<AnnotatorJniContext*>(ptr)->model(); + + return model->InitializePersonNameEngineFromFileDescriptor(fd, offset, size); +} + TC3_JNI_METHOD(jlong, TC3_ANNOTATOR_CLASS_NAME, nativeGetNativeModelPtr) (JNIEnv* env, jobject thiz, jlong ptr) { if (!ptr) { diff --git a/native/annotator/annotator_jni.h b/native/annotator/annotator_jni.h index 0789e76..893f84e 100644 --- a/native/annotator/annotator_jni.h +++ b/native/annotator/annotator_jni.h @@ -49,6 +49,10 @@ TC3_JNI_METHOD(jboolean, TC3_ANNOTATOR_CLASS_NAME, nativeInitializeInstalledAppEngine) (JNIEnv* env, jobject thiz, jlong ptr, jbyteArray serialized_config); +TC3_JNI_METHOD(jboolean, TC3_ANNOTATOR_CLASS_NAME, + nativeInitializePersonNameEngine) +(JNIEnv* env, jobject thiz, jlong ptr, jint fd, jlong offset, jlong size); + TC3_JNI_METHOD(jlong, TC3_ANNOTATOR_CLASS_NAME, nativeGetNativeModelPtr) (JNIEnv* env, jobject thiz, jlong ptr); diff --git a/native/annotator/collections.h b/native/annotator/collections.h index 0b75463..7783b41 100644 --- a/native/annotator/collections.h +++ b/native/annotator/collections.h @@ -114,6 +114,11 @@ class Collections { *[]() { return new std::string("percentage"); }(); return value; } + static const std::string& PersonName() { + static const std::string& value = + *[]() { return new std::string("person_name"); }(); + return value; + } static const std::string& Phone() { static const std::string& value = *[]() { return new std::string("phone"); }(); diff --git a/native/annotator/entity-data.fbs b/native/annotator/entity-data.fbs index 0163bfd..59a213f 100755 --- a/native/annotator/entity-data.fbs +++ b/native/annotator/entity-data.fbs @@ -166,6 +166,19 @@ table ParcelTracking { tracking_number:string (shared); } +// Parsed money amount. +namespace libtextclassifier3.EntityData_; +table Money { + // String representation of currency, unnormalized. + unnormalized_currency:string (shared); + + // Whole part of the amount (e.g. 123 from "CHF 123.45"). + amount_whole_part:int; + + // Decimal part of the amount (e.g. 45 from "CHF 123.45"). + amount_decimal_part:int; +} + // Represents an entity annotated in text. namespace libtextclassifier3; table EntityData { @@ -187,6 +200,7 @@ table EntityData { isbn:EntityData_.Isbn; iban:EntityData_.Iban; parcel:EntityData_.ParcelTracking; + money:EntityData_.Money; } root_type libtextclassifier3.EntityData; diff --git a/native/annotator/model.fbs b/native/annotator/model.fbs index 6b7b007..e765ef6 100755 --- a/native/annotator/model.fbs +++ b/native/annotator/model.fbs @@ -338,6 +338,16 @@ table DatetimeModel { prefer_future_for_unspecified_date:bool = false; } +namespace libtextclassifier3; +table GrammarDatetimeModel { + // List of BCP 47 locale strings representing all locales supported by the + // model. + locales:[string]; + + // If true, will give only future dates (when the day is not specified). + prefer_future_for_unspecified_date:bool = false; +} + namespace libtextclassifier3.DatetimeModelLibrary_; table Item { key:string (shared); @@ -373,6 +383,9 @@ table ModelTriggeringOptions { // Priority score assigned to knowledge engine annotations. knowledge_priority_score:float = 0; + + // If true, will prioritize the longest annotation during conflict resolution. + prioritize_longest_annotation:bool = false; } // Options controlling the output of the classifier. @@ -468,6 +481,7 @@ table Model { triggering_locales:string (shared); embedding_pruning_mask:Model_.EmbeddingPruningMask; + grammar_datetime_model:GrammarDatetimeModel; } // Method for selecting the center token. diff --git a/native/annotator/person_name/person-name-engine-dummy.h b/native/annotator/person_name/person-name-engine-dummy.h new file mode 100644 index 0000000..91ae2e5 --- /dev/null +++ b/native/annotator/person_name/person-name-engine-dummy.h @@ -0,0 +1,56 @@ +/* + * Copyright (C) 2018 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef LIBTEXTCLASSIFIER_ANNOTATOR_PERSON_NAME_PERSON_NAME_ENGINE_DUMMY_H_ +#define LIBTEXTCLASSIFIER_ANNOTATOR_PERSON_NAME_PERSON_NAME_ENGINE_DUMMY_H_ + +#include <string> +#include <vector> + +#include "annotator/feature-processor.h" +#include "annotator/person_name/person_name_model_generated.h" +#include "annotator/types.h" +#include "utils/base/logging.h" +#include "utils/utf8/unicodetext.h" +#include "utils/utf8/unilib.h" + +namespace libtextclassifier3 { + +// A dummy implementation of the person name engine. +class PersonNameEngine { + public: + explicit PersonNameEngine(const UniLib* unilib) {} + + bool Initialize(const PersonNameModel* model) { + TC3_LOG(ERROR) << "No person name engine to initialize."; + return false; + } + + bool ClassifyText(const std::string& context, CodepointSpan selection_indices, + ClassificationResult* classification_result) const { + return false; + } + + bool Chunk(const UnicodeText& context_unicode, + const std::vector<Token>& tokens, + std::vector<AnnotatedSpan>* result) const { + return true; + } +}; + +} // namespace libtextclassifier3 + +#endif // LIBTEXTCLASSIFIER_ANNOTATOR_PERSON_NAME_PERSON_NAME_ENGINE_DUMMY_H_ diff --git a/native/annotator/person_name/person-name-engine.h b/native/annotator/person_name/person-name-engine.h new file mode 100644 index 0000000..988fce3 --- /dev/null +++ b/native/annotator/person_name/person-name-engine.h @@ -0,0 +1,22 @@ +/* + * Copyright (C) 2018 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef LIBTEXTCLASSIFIER_ANNOTATOR_PERSON_NAME_PERSON_NAME_ENGINE_H_ +#define LIBTEXTCLASSIFIER_ANNOTATOR_PERSON_NAME_PERSON_NAME_ENGINE_H_ + +#include "annotator/person_name/person-name-engine-dummy.h" + +#endif // LIBTEXTCLASSIFIER_ANNOTATOR_PERSON_NAME_PERSON_NAME_ENGINE_H_ diff --git a/native/annotator/person_name/person_name_model.fbs b/native/annotator/person_name/person_name_model.fbs new file mode 100755 index 0000000..6421341 --- /dev/null +++ b/native/annotator/person_name/person_name_model.fbs @@ -0,0 +1,40 @@ +// +// Copyright (C) 2018 The Android Open Source Project +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// + +file_identifier "TC2 "; + +// Next ID: 2 +namespace libtextclassifier3.PersonNameModel_; +table PersonName { + // Person name which is considered by the person name annotator. This + // attribute should contain 'atomic' person names, e.g., 'John' and 'Doe' + // should be two separate person names. + // required + person_name:string (shared); +} + +// Next ID: 3 +namespace libtextclassifier3; +table PersonNameModel { + // Decides if the person name annotator is enabled. + // required + enabled:bool; + + // List of all person names which are considered by the person name annotator. + person_names:[PersonNameModel_.PersonName]; +} + +root_type libtextclassifier3.PersonNameModel; diff --git a/native/annotator/test_data/test_model.fb b/native/annotator/test_data/test_model.fb Binary files differindex 6bbb62e..9a14063 100644 --- a/native/annotator/test_data/test_model.fb +++ b/native/annotator/test_data/test_model.fb diff --git a/native/annotator/test_data/test_person_name_model.fb b/native/annotator/test_data/test_person_name_model.fb Binary files differnew file mode 100644 index 0000000..4752a23 --- /dev/null +++ b/native/annotator/test_data/test_person_name_model.fb diff --git a/native/annotator/test_data/wrong_embeddings.fb b/native/annotator/test_data/wrong_embeddings.fb Binary files differindex b25c70e..7c846a5 100644 --- a/native/annotator/test_data/wrong_embeddings.fb +++ b/native/annotator/test_data/wrong_embeddings.fb |