Import libtextclassifier

Test: atest TextClassifierServiceTest
Change-Id: If6e67d41838426b7ff3451fa74c71780b788568c
author: Tony Mak <tonymak@google.com> 2020-01-08 17:30:51 +0000
committer: Tony Mak <tonymak@google.com> 2020-01-08 21:39:22 +0000
commit: 76d8096f5f552ed3c11131c650ce72dc68a2e254 (patch)
tree: e7edeb0a629741de52b5117c53597ac35296bdc4 /native/annotator
parent: 1bcd2f6cb74ad9fb0fa468619c2f00232b2075f3 (diff)
download: libtextclassifier-76d8096f5f552ed3c11131c650ce72dc68a2e254.tar.gz
13 files changed, 254 insertions, 5 deletions
diff --git a/native/annotator/annotator.cc b/native/annotator/annotator.cc
index abb57e8..8969026 100644
--- a/native/annotator/annotator.cc
+++ b/native/annotator/annotator.cc
@@ -61,6 +61,16 @@ const Model* LoadAndVerifyModel(const void* addr, int size) {
   }
 }
 
+const PersonNameModel* LoadAndVerifyPersonNameModel(const void* addr,
+                                                    int size) {
+  // Runs the flatbuffers verifier over the `size` bytes at `addr`. Returns the
+  // PersonNameModel root when the buffer verifies, and nullptr when the
+  // verifier rejects it.
+  flatbuffers::Verifier verifier(reinterpret_cast<const uint8_t*>(addr), size);
+  return VerifyPersonNameModelBuffer(verifier) ? GetPersonNameModel(addr)
+                                               : nullptr;
+}
+
 // If lib is not nullptr, just returns lib. Otherwise, if lib is nullptr, will
 // create a new instance, assign ownership to owned_lib, and return it.
 const UniLib* MaybeCreateUnilib(const UniLib* lib,
@@ -548,6 +558,37 @@ bool Annotator::InitializeInstalledAppEngine(
   return true;
 }
 
+bool Annotator::InitializePersonNameEngineFromFileDescriptor(int fd, int offset,
+                                                             int size) {
+  // NOTE(review): the ScopedMmap is destroyed on return, so this presumably
+  // relies on Initialize() not keeping pointers into the model -- confirm.
+  const std::unique_ptr<ScopedMmap> mapped(new ScopedMmap(fd, offset, size));
+  const auto& mapping = mapped->handle();
+  if (!mapping.ok()) {
+    TC3_LOG(ERROR) << "Mmap for person name model failed.";
+    return false;
+  }
+
+  const PersonNameModel* const model =
+      LoadAndVerifyPersonNameModel(mapping.start(), mapping.num_bytes());
+  if (model == nullptr) {
+    TC3_LOG(ERROR) << "Person name model verification failed.";
+    return false;
+  }
+  // A disabled model is not an error; the current engine is left untouched.
+  if (!model->enabled()) {
+    return true;
+  }
+
+  std::unique_ptr<PersonNameEngine> engine(new PersonNameEngine(unilib_));
+  if (!engine->Initialize(model)) {
+    TC3_LOG(ERROR) << "Failed to initialize the person name engine.";
+    return false;
+  }
+  person_name_engine_ = std::move(engine);
+  return true;
+}
+
 namespace {
 
 int CountDigits(const std::string& str, CodepointSpan selection_indices) {
@@ -775,6 +816,11 @@ CodepointSpan Annotator::SuggestSelection(
     TC3_LOG(ERROR) << "Duration annotator failed in suggest selection.";
     return original_click_indices;
   }
+  if (person_name_engine_ != nullptr &&
+      !person_name_engine_->Chunk(context_unicode, tokens, &candidates)) {
+    TC3_LOG(ERROR) << "Person name suggest selection failed.";
+    return original_click_indices;
+  }
 
   // Sort candidates according to their position in the input, so that the next
   // code can assume that any connected component of overlapping spans forms a
@@ -930,11 +976,13 @@ bool Annotator::ResolveConflict(
     InterpreterManager* interpreter_manager,
     std::vector<int>* chosen_indices) const {
   std::vector<int> conflicting_indices;
-  std::unordered_map<int, float> scores;
+  std::unordered_map<int, std::pair<float, int>> scores_lengths;
   for (int i = start_index; i < end_index; ++i) {
     conflicting_indices.push_back(i);
     if (!candidates[i].classification.empty()) {
-      scores[i] = GetPriorityScore(candidates[i].classification);
+      scores_lengths[i] = {
+          GetPriorityScore(candidates[i].classification),
+          candidates[i].span.second - candidates[i].span.first};
       continue;
     }
 
@@ -951,12 +999,23 @@ bool Annotator::ResolveConflict(
     }
 
     if (!classification.empty()) {
-      scores[i] = GetPriorityScore(classification);
+      scores_lengths[i] = {
+          GetPriorityScore(classification),
+          candidates[i].span.second - candidates[i].span.first};
     }
   }
 
-  std::sort(conflicting_indices.begin(), conflicting_indices.end(),
-            [&scores](int i, int j) { return scores[i] > scores[j]; });
+  // Best score first. Capture only what is used: std::sort copies the functor.
+  std::sort(
+      conflicting_indices.begin(), conflicting_indices.end(),
+      [&scores_lengths, this](int i, int j) {
+        if (scores_lengths[i].first == scores_lengths[j].first &&
+            model_->triggering_options() != nullptr &&
+            model_->triggering_options()->prioritize_longest_annotation()) {
+          return scores_lengths[i].second > scores_lengths[j].second;
+        }
+        return scores_lengths[i].first > scores_lengths[j].first;
+      });
 
   // Here we keep a set of indices that were chosen, per-source, to enable
   // effective computation.
@@ -1559,6 +1618,14 @@ std::vector<ClassificationResult> Annotator::ClassifyText(
     candidates.push_back({selection_indices, {contact_result}});
   }
 
+  // Try the person name engine.
+  ClassificationResult person_name_result;
+  if (person_name_engine_ &&
+      person_name_engine_->ClassifyText(context, selection_indices,
+                                        &person_name_result)) {
+    candidates.push_back({selection_indices, {person_name_result}});
+  }
+
   // Try the installed app engine.
   // TODO(b/126579108): Propagate error status.
   ClassificationResult installed_app_result;
@@ -1923,6 +1990,14 @@ std::vector<AnnotatedSpan> Annotator::Annotate(
     return {};
   }
 
+  // Annotate with the person name engine.
+  if (is_entity_type_enabled(Collections::PersonName()) &&
+      person_name_engine_ &&
+      !person_name_engine_->Chunk(context_unicode, tokens, &candidates)) {
+    TC3_LOG(ERROR) << "Couldn't run person name engine Chunk.";
+    return {};
+  }
+
   // Sort candidates according to their position in the input, so that the next
   // code can assume that any connected component of overlapping spans forms a
   // contiguous block.
diff --git a/native/annotator/annotator.h b/native/annotator/annotator.h
index dabd894..606d0bb 100644
--- a/native/annotator/annotator.h
+++ b/native/annotator/annotator.h
@@ -34,6 +34,7 @@
 #include "annotator/model-executor.h"
 #include "annotator/model_generated.h"
 #include "annotator/number/number.h"
+#include "annotator/person_name/person-name-engine.h"
 #include "annotator/strip-unpaired-brackets.h"
 #include "annotator/types.h"
 #include "annotator/zlib-utils.h"
@@ -226,6 +227,11 @@ class Annotator {
   // Initializes the installed app engine with the given config.
   bool InitializeInstalledAppEngine(const std::string& serialized_config);
 
+  // Initializes the person name engine with the given person name model in the
+  // provided file descriptor.
+  bool InitializePersonNameEngineFromFileDescriptor(int fd, int offset,
+                                                    int size);
+
   // Runs inference for given a context and current selection (i.e. index
   // of the first and one past last selected characters (utf8 codepoint
   // offsets)). Returns the indices (utf8 codepoint offsets) of the selection
@@ -516,6 +522,7 @@ class Annotator {
   std::unique_ptr<const InstalledAppEngine> installed_app_engine_;
   std::unique_ptr<const NumberAnnotator> number_annotator_;
   std::unique_ptr<const DurationAnnotator> duration_annotator_;
+  std::unique_ptr<const PersonNameEngine> person_name_engine_;
 
   // Builder for creating extra data.
   const reflection::Schema* entity_data_schema_;
diff --git a/native/annotator/annotator_jni.cc b/native/annotator/annotator_jni.cc
index 5c42c3e..e0c6262 100644
--- a/native/annotator/annotator_jni.cc
+++ b/native/annotator/annotator_jni.cc
@@ -560,6 +560,18 @@ TC3_JNI_METHOD(jboolean, TC3_ANNOTATOR_CLASS_NAME,
   return model->InitializeInstalledAppEngine(serialized_config_string);
 }
 
+TC3_JNI_METHOD(jboolean, TC3_ANNOTATOR_CLASS_NAME,
+               nativeInitializePersonNameEngine)
+(JNIEnv* env, jobject thiz, jlong ptr, jint fd, jlong offset, jlong size) {
+  // The annotator takes int offset/size; refuse jlong values that would be
+  // silently truncated instead of mapping the wrong region of the file.
+  const int off = static_cast<int>(offset);
+  const int len = static_cast<int>(size);
+  if (!ptr || off != offset || len != size) return false;
+  Annotator* model = reinterpret_cast<AnnotatorJniContext*>(ptr)->model();
+  return model->InitializePersonNameEngineFromFileDescriptor(fd, off, len);
+}
+
 TC3_JNI_METHOD(jlong, TC3_ANNOTATOR_CLASS_NAME, nativeGetNativeModelPtr)
 (JNIEnv* env, jobject thiz, jlong ptr) {
   if (!ptr) {
diff --git a/native/annotator/annotator_jni.h b/native/annotator/annotator_jni.h
index 0789e76..893f84e 100644
--- a/native/annotator/annotator_jni.h
+++ b/native/annotator/annotator_jni.h
@@ -49,6 +49,10 @@ TC3_JNI_METHOD(jboolean, TC3_ANNOTATOR_CLASS_NAME,
                nativeInitializeInstalledAppEngine)
 (JNIEnv* env, jobject thiz, jlong ptr, jbyteArray serialized_config);
 
+TC3_JNI_METHOD(jboolean, TC3_ANNOTATOR_CLASS_NAME,
+               nativeInitializePersonNameEngine)
+(JNIEnv* env, jobject thiz, jlong ptr, jint fd, jlong offset, jlong size);
+
 TC3_JNI_METHOD(jlong, TC3_ANNOTATOR_CLASS_NAME, nativeGetNativeModelPtr)
 (JNIEnv* env, jobject thiz, jlong ptr);
 
diff --git a/native/annotator/collections.h b/native/annotator/collections.h
index 0b75463..7783b41 100644
--- a/native/annotator/collections.h
+++ b/native/annotator/collections.h
@@ -114,6 +114,11 @@ class Collections {
         *[]() { return new std::string("percentage"); }();
     return value;
   }
+  // Name of the person-name collection; the string is never destroyed.
+  static const std::string& PersonName() {
+    static const std::string* const value = new std::string("person_name");
+    return *value;
+  }
   static const std::string& Phone() {
     static const std::string& value =
         *[]() { return new std::string("phone"); }();
diff --git a/native/annotator/entity-data.fbs b/native/annotator/entity-data.fbs
index 0163bfd..59a213f 100755
--- a/native/annotator/entity-data.fbs
+++ b/native/annotator/entity-data.fbs
@@ -166,6 +166,19 @@ table ParcelTracking {
   tracking_number:string (shared);
 }
 
+// Parsed money amount.
+namespace libtextclassifier3.EntityData_;
+table Money {
+  // String representation of currency, unnormalized.
+  unnormalized_currency:string (shared);
+
+  // Whole part of the amount (e.g. 123 from "CHF 123.45").
+  amount_whole_part:int;
+
+  // Decimal part of the amount (e.g. 45 from "CHF 123.45").
+  amount_decimal_part:int;
+}
+
 // Represents an entity annotated in text.
 namespace libtextclassifier3;
 table EntityData {
@@ -187,6 +200,7 @@ table EntityData {
   isbn:EntityData_.Isbn;
   iban:EntityData_.Iban;
   parcel:EntityData_.ParcelTracking;
+  money:EntityData_.Money;
 }
 
 root_type libtextclassifier3.EntityData;
diff --git a/native/annotator/model.fbs b/native/annotator/model.fbs
index 6b7b007..e765ef6 100755
--- a/native/annotator/model.fbs
+++ b/native/annotator/model.fbs
@@ -338,6 +338,16 @@ table DatetimeModel {
   prefer_future_for_unspecified_date:bool = false;
 }
 
+namespace libtextclassifier3;
+table GrammarDatetimeModel {
+  // List of BCP 47 locale strings representing all locales supported by the
+  // model.
+  locales:[string];
+
+  // If true, will give only future dates (when the day is not specified).
+  prefer_future_for_unspecified_date:bool = false;
+}
+
 namespace libtextclassifier3.DatetimeModelLibrary_;
 table Item {
   key:string (shared);
@@ -373,6 +383,9 @@ table ModelTriggeringOptions {
 
   // Priority score assigned to knowledge engine annotations.
   knowledge_priority_score:float = 0;
+
+  // If true, will prioritize the longest annotation during conflict resolution.
+  prioritize_longest_annotation:bool = false;
 }
 
 // Options controlling the output of the classifier.
@@ -468,6 +481,7 @@ table Model {
   triggering_locales:string (shared);
 
   embedding_pruning_mask:Model_.EmbeddingPruningMask;
+  grammar_datetime_model:GrammarDatetimeModel;
 }
 
 // Method for selecting the center token.
diff --git a/native/annotator/person_name/person-name-engine-dummy.h b/native/annotator/person_name/person-name-engine-dummy.h
new file mode 100644
index 0000000..91ae2e5
--- /dev/null
+++ b/native/annotator/person_name/person-name-engine-dummy.h
@@ -0,0 +1,56 @@
+/*
+ * Copyright (C) 2018 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBTEXTCLASSIFIER_ANNOTATOR_PERSON_NAME_PERSON_NAME_ENGINE_DUMMY_H_
+#define LIBTEXTCLASSIFIER_ANNOTATOR_PERSON_NAME_PERSON_NAME_ENGINE_DUMMY_H_
+
+#include <string>
+#include <vector>
+
+#include "annotator/feature-processor.h"
+#include "annotator/person_name/person_name_model_generated.h"
+#include "annotator/types.h"
+#include "utils/base/logging.h"
+#include "utils/utf8/unicodetext.h"
+#include "utils/utf8/unilib.h"
+
+namespace libtextclassifier3 {
+
+// Dummy PersonNameEngine: never initializes and never yields annotations.
+class PersonNameEngine {
+ public:
+  explicit PersonNameEngine(const UniLib* unilib) {}
+  // Logs an error and returns false; `model` is not read.
+  bool Initialize(const PersonNameModel* model) {
+    TC3_LOG(ERROR) << "No person name engine to initialize.";
+    return false;
+  }
+  // Returns false and leaves `classification_result` untouched.
+  bool ClassifyText(const std::string& context, CodepointSpan selection_indices,
+                    ClassificationResult* classification_result) const {
+    return false;
+  }
+  // Returns true without appending anything to `result`.
+  bool Chunk(const UnicodeText& context_unicode,
+             const std::vector<Token>& tokens,
+             std::vector<AnnotatedSpan>* result) const {
+    return true;
+  }
+};
+
+}  // namespace libtextclassifier3
+
+#endif  // LIBTEXTCLASSIFIER_ANNOTATOR_PERSON_NAME_PERSON_NAME_ENGINE_DUMMY_H_
diff --git a/native/annotator/person_name/person-name-engine.h b/native/annotator/person_name/person-name-engine.h
new file mode 100644
index 0000000..988fce3
--- /dev/null
+++ b/native/annotator/person_name/person-name-engine.h
@@ -0,0 +1,22 @@
+/*
+ * Copyright (C) 2018 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBTEXTCLASSIFIER_ANNOTATOR_PERSON_NAME_PERSON_NAME_ENGINE_H_
+#define LIBTEXTCLASSIFIER_ANNOTATOR_PERSON_NAME_PERSON_NAME_ENGINE_H_
+
+#include "annotator/person_name/person-name-engine-dummy.h"
+
+#endif  // LIBTEXTCLASSIFIER_ANNOTATOR_PERSON_NAME_PERSON_NAME_ENGINE_H_
diff --git a/native/annotator/person_name/person_name_model.fbs b/native/annotator/person_name/person_name_model.fbs
new file mode 100755
index 0000000..6421341
--- /dev/null
+++ b/native/annotator/person_name/person_name_model.fbs
@@ -0,0 +1,40 @@
+//
+// Copyright (C) 2018 The Android Open Source Project
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+
+file_identifier "TC2 ";
+
+// Next ID: 2
+namespace libtextclassifier3.PersonNameModel_;
+table PersonName {
+  // Person name which is considered by the person name annotator. This
+  // attribute should contain 'atomic' person names, e.g., 'John' and 'Doe'
+  // should be two separate person names.
+  // required
+  person_name:string (shared);
+}
+
+// Next ID: 3
+namespace libtextclassifier3;
+table PersonNameModel {
+  // Decides if the person name annotator is enabled.
+  // required
+  enabled:bool;
+
+  // List of all person names which are considered by the person name annotator.
+  person_names:[PersonNameModel_.PersonName];
+}
+
+root_type libtextclassifier3.PersonNameModel;
diff --git a/native/annotator/test_data/test_model.fb b/native/annotator/test_data/test_model.fb
index 6bbb62e..9a14063 100644
--- a/native/annotator/test_data/test_model.fb
+++ b/native/annotator/test_data/test_model.fb
diff --git a/native/annotator/test_data/test_person_name_model.fb b/native/annotator/test_data/test_person_name_model.fb
new file mode 100644
index 0000000..4752a23
--- /dev/null
+++ b/native/annotator/test_data/test_person_name_model.fb
diff --git a/native/annotator/test_data/wrong_embeddings.fb b/native/annotator/test_data/wrong_embeddings.fb
index b25c70e..7c846a5 100644
--- a/native/annotator/test_data/wrong_embeddings.fb
+++ b/native/annotator/test_data/wrong_embeddings.fb
author	Tony Mak <tonymak@google.com>	2020-01-08 17:30:51 +0000
committer	Tony Mak <tonymak@google.com>	2020-01-08 21:39:22 +0000
commit	76d8096f5f552ed3c11131c650ce72dc68a2e254 (patch)
tree	e7edeb0a629741de52b5117c53597ac35296bdc4 /native/annotator
parent	1bcd2f6cb74ad9fb0fa468619c2f00232b2075f3 (diff)
download	libtextclassifier-76d8096f5f552ed3c11131c650ce72dc68a2e254.tar.gz