summaryrefslogtreecommitdiff
path: root/native/annotator/annotator.h
blob: d69fe32a853e1c9b1b8801698bd38f94cfa189e9 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
/*
 * Copyright (C) 2018 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

// Inference code for the text classification model.

#ifndef LIBTEXTCLASSIFIER_ANNOTATOR_ANNOTATOR_H_
#define LIBTEXTCLASSIFIER_ANNOTATOR_ANNOTATOR_H_

#include <memory>
#include <set>
#include <string>
#include <unordered_set>
#include <vector>

#include "annotator/contact/contact-engine.h"
#include "annotator/datetime/datetime-grounder.h"
#include "annotator/datetime/parser.h"
#include "annotator/duration/duration.h"
#include "annotator/experimental/experimental.h"
#include "annotator/feature-processor.h"
#include "annotator/grammar/grammar-annotator.h"
#include "annotator/installed_app/installed-app-engine.h"
#include "annotator/knowledge/knowledge-engine.h"
#include "annotator/model-executor.h"
#include "annotator/model_generated.h"
#include "annotator/number/number.h"
#include "annotator/person_name/person-name-engine.h"
#include "annotator/pod_ner/pod-ner.h"
#include "annotator/strip-unpaired-brackets.h"
#include "annotator/translate/translate.h"
#include "annotator/types.h"
#include "annotator/vocab/vocab-annotator.h"
#include "annotator/zlib-utils.h"
#include "utils/base/status.h"
#include "utils/base/statusor.h"
#include "utils/calendar/calendar.h"
#include "utils/flatbuffers/flatbuffers.h"
#include "utils/flatbuffers/mutable.h"
#include "utils/i18n/locale.h"
#include "utils/memory/mmap.h"
#include "utils/utf8/unicodetext.h"
#include "utils/utf8/unilib.h"
#include "utils/zlib/zlib.h"
#include "lang_id/lang-id.h"

namespace libtextclassifier3 {

// Lazily creates and caches the TFLite interpreters used by the selection and
// the classification models.
// NOTE: Instances are not thread-safe and must NOT be shared between threads.
class InterpreterManager {
 public:
  // Either executor may be nullptr. That is well defined as long as the
  // matching *Interpreter() accessor is never invoked for a null executor.
  InterpreterManager(const ModelExecutor* selection_executor,
                     const ModelExecutor* classification_executor)
      : selection_executor_{selection_executor},
        classification_executor_{classification_executor} {}

  // Returns the interpreter for the selection model, creating and caching it
  // on first use.
  tflite::Interpreter* SelectionInterpreter();

  // Returns the interpreter for the classification model, creating and caching
  // it on first use.
  tflite::Interpreter* ClassificationInterpreter();

 private:
  // Executors the interpreters are built from; stored as raw pointers and may
  // be null.
  const ModelExecutor* selection_executor_;
  const ModelExecutor* classification_executor_;

  // Cached interpreters, owned by this manager.
  std::unique_ptr<tflite::Interpreter> selection_interpreter_;
  std::unique_ptr<tflite::Interpreter> classification_interpreter_;
};

// Predicate over entity type names: wraps the set of entity types enabled for
// annotation and answers, via operator(), whether a given type is enabled.
// An empty set means that every entity type is enabled.
// NOTE: Only a reference to the set is kept, so the set passed to the
// constructor has to outlive this object.
class EnabledEntityTypes {
 public:
  explicit EnabledEntityTypes(
      const std::unordered_set<std::string>& entity_types)
      : allowed_types_(entity_types) {}

  // Returns true if 'entity_type' is enabled.
  bool operator()(const std::string& entity_type) const {
    // No explicit restriction: everything is enabled.
    if (allowed_types_.empty()) {
      return true;
    }
    return allowed_types_.count(entity_type) != 0;
  }

 private:
  // Not owned; see the lifetime note on the class.
  const std::unordered_set<std::string>& allowed_types_;
};

// A text processing model that provides text classification, annotation,
// selection suggestion for various types.
// NOTE: This class is not thread-safe.
class Annotator {
 public:
  // Factory functions. Each returns a new Annotator built from the given model
  // source.
  // Overloads taking raw `const UniLib*` / `const CalendarLib*` pointers do
  // not take ownership of them (both default to nullptr); overloads taking
  // `std::unique_ptr` transfer ownership of the libraries to the callee.
  // NOTE(review): presumably nullptr is returned when the model cannot be
  // loaded or validated - confirm against the implementation.

  // Uses the given buffer without copying it or taking ownership of it.
  static std::unique_ptr<Annotator> FromUnownedBuffer(
      const char* buffer, int size, const UniLib* unilib = nullptr,
      const CalendarLib* calendarlib = nullptr);
  // Copies the underlying model buffer string.
  static std::unique_ptr<Annotator> FromString(
      const std::string& buffer, const UniLib* unilib = nullptr,
      const CalendarLib* calendarlib = nullptr);
  // Takes ownership of the mmap.
  static std::unique_ptr<Annotator> FromScopedMmap(
      std::unique_ptr<ScopedMmap>* mmap, const UniLib* unilib = nullptr,
      const CalendarLib* calendarlib = nullptr);
  static std::unique_ptr<Annotator> FromScopedMmap(
      std::unique_ptr<ScopedMmap>* mmap, std::unique_ptr<UniLib> unilib,
      std::unique_ptr<CalendarLib> calendarlib);
  static std::unique_ptr<Annotator> FromFileDescriptor(
      int fd, int offset, int size, const UniLib* unilib = nullptr,
      const CalendarLib* calendarlib = nullptr);
  static std::unique_ptr<Annotator> FromFileDescriptor(
      int fd, int offset, int size, std::unique_ptr<UniLib> unilib,
      std::unique_ptr<CalendarLib> calendarlib);
  static std::unique_ptr<Annotator> FromFileDescriptor(
      int fd, const UniLib* unilib = nullptr,
      const CalendarLib* calendarlib = nullptr);
  static std::unique_ptr<Annotator> FromFileDescriptor(
      int fd, std::unique_ptr<UniLib> unilib,
      std::unique_ptr<CalendarLib> calendarlib);
  static std::unique_ptr<Annotator> FromPath(
      const std::string& path, const UniLib* unilib = nullptr,
      const CalendarLib* calendarlib = nullptr);
  static std::unique_ptr<Annotator> FromPath(
      const std::string& path, std::unique_ptr<UniLib> unilib,
      std::unique_ptr<CalendarLib> calendarlib);

  // Returns true if the model is ready for use.
  // Const, so that it can also be queried through a const Annotator.
  bool IsInitialized() const { return initialized_; }

  // Initializes the knowledge engine with the given config.
  bool InitializeKnowledgeEngine(const std::string& serialized_config);

  // Initializes the contact engine with the given config.
  bool InitializeContactEngine(const std::string& serialized_config);

  // Initializes the installed app engine with the given config.
  bool InitializeInstalledAppEngine(const std::string& serialized_config);

  // Initializes the person name engine with the given person name model in the
  // provided buffer. The buffer needs to outlive the annotator.
  bool InitializePersonNameEngineFromUnownedBuffer(const void* buffer,
                                                   int size);

  // Initializes the person name engine with the given person name model from
  // the provided mmap.
  bool InitializePersonNameEngineFromScopedMmap(const ScopedMmap& mmap);

  // Initializes the person name engine with the given person name model in the
  // provided file path.
  bool InitializePersonNameEngineFromPath(const std::string& path);

  // Initializes the person name engine with the given person name model in the
  // provided file descriptor.
  bool InitializePersonNameEngineFromFileDescriptor(int fd, int offset,
                                                    int size);

  // Initializes the experimental annotators if available.
  // Returns true if there is an implementation of experimental annotators
  // linked in.
  bool InitializeExperimentalAnnotators();

  // Sets up the lang-id instance that should be used.
  bool SetLangId(const libtextclassifier3::mobile::lang_id::LangId* lang_id);

  // Runs inference for given a context and current selection (i.e. index
  // of the first and one past last selected characters (utf8 codepoint
  // offsets)). Returns the indices (utf8 codepoint offsets) of the selection
  // beginning character and one past selection end character.
  // Returns the original click_indices if an error occurs.
  // NOTE: The selection indices are passed in and returned in terms of
  // UTF8 codepoints (not bytes).
  // Requires that the model is a smart selection model.
  CodepointSpan SuggestSelection(
      const std::string& context, CodepointSpan click_indices,
      const SelectionOptions& options = SelectionOptions()) const;

  // Classifies the selected text given the context string.
  // Returns an empty result if an error occurs.
  std::vector<ClassificationResult> ClassifyText(
      const std::string& context, const CodepointSpan& selection_indices,
      const ClassificationOptions& options = ClassificationOptions()) const;

  // Annotates the given structured input request. Models which handle the full
  // context request will receive all the metadata they require. While models
  // that don't use the extra context are called using only a string.
  // For each fragment the annotations are sorted by their position in
  // the fragment and exclude spans classified as 'other'.
  //
  // The number of vectors of annotated spans will match the number
  // of input fragments. The order of annotation span vectors will match the
  // order of input fragments. If annotation is not possible for any of the
  // annotators, no annotation is returned.
  StatusOr<Annotations> AnnotateStructuredInput(
      const std::vector<InputFragment>& string_fragments,
      const AnnotationOptions& options = AnnotationOptions()) const;

  // Annotates given input text. The annotations are sorted by their position
  // in the context string and exclude spans classified as 'other'.
  std::vector<AnnotatedSpan> Annotate(
      const std::string& context,
      const AnnotationOptions& options = AnnotationOptions()) const;

  // Looks up a knowledge entity by its id. Returns the serialized knowledge
  // result.
  StatusOr<std::string> LookUpKnowledgeEntity(const std::string& id) const;

  // Looks up an entity's property.
  StatusOr<std::string> LookUpKnowledgeEntityProperty(
      const std::string& mid_str, const std::string& property) const;

  // Accessors for the underlying model flatbuffer and the entity data schema.
  const Model* model() const;
  const reflection::Schema* entity_data_schema() const;

  // Exposes the feature processor for tests and evaluations.
  const FeatureProcessor* SelectionFeatureProcessorForTests() const;
  const FeatureProcessor* ClassificationFeatureProcessorForTests() const;

  // Exposes the date time parser for tests and evaluations.
  const DatetimeParser* DatetimeParserForTests() const;

  // Names of well-known collections; defined out of line.
  static const std::string& kPhoneCollection;
  static const std::string& kAddressCollection;
  static const std::string& kDateCollection;
  static const std::string& kUrlCollection;
  static const std::string& kEmailCollection;

 protected:
  // A token span together with the score assigned to it.
  struct ScoredChunk {
    TokenSpan token_span;
    float score;
  };

  // NOTE: ValidateAndInitialize needs to be called before any other method.
  // All state starts from the in-class member initializers below.
  Annotator() = default;

  // Checks that model contains all required fields, and initializes internal
  // data structures.
  // Needs to be called before any other method is.
  void ValidateAndInitialize(const Model* model, const UniLib* unilib,
                             const CalendarLib* calendarlib);

  // Initializes regular expressions for the regex model.
  bool InitializeRegexModel(ZlibDecompressor* decompressor);

  // Resolves conflicts in the list of candidates by removing some overlapping
  // ones. Returns indices of the surviving ones.
  // NOTE: Assumes that the candidates are sorted according to their position in
  // the span.
  bool ResolveConflicts(const std::vector<AnnotatedSpan>& candidates,
                        const std::string& context,
                        const std::vector<Token>& cached_tokens,
                        const std::vector<Locale>& detected_text_language_tags,
                        const BaseOptions& options,
                        InterpreterManager* interpreter_manager,
                        std::vector<int>* result) const;

  // Resolves one conflict between candidates on indices 'start_index'
  // (inclusive) and 'end_index' (exclusive). Assigns the winning candidate
  // indices to 'chosen_indices'. Returns false if a problem arises.
  bool ResolveConflict(const std::string& context,
                       const std::vector<Token>& cached_tokens,
                       const std::vector<AnnotatedSpan>& candidates,
                       const std::vector<Locale>& detected_text_language_tags,
                       int start_index, int end_index,
                       const BaseOptions& options,
                       InterpreterManager* interpreter_manager,
                       std::vector<int>* chosen_indices) const;

  // Gets selection candidates from the ML model.
  // Provides the tokens produced during tokenization of the context string for
  // reuse.
  bool ModelSuggestSelection(
      const UnicodeText& context_unicode, const CodepointSpan& click_indices,
      const std::vector<Locale>& detected_text_language_tags,
      InterpreterManager* interpreter_manager, std::vector<Token>* tokens,
      std::vector<AnnotatedSpan>* result) const;

  // Classifies the selected text given the context string with the
  // classification model.
  // The following arguments are optional:
  //   - cached_tokens - can be given as empty
  //   - embedding_cache - can be given as nullptr
  //   - tokens - can be given as nullptr
  // Returns true if no error occurred.
  bool ModelClassifyText(
      const std::string& context, const std::vector<Token>& cached_tokens,
      const std::vector<Locale>& detected_text_language_tags,
      const CodepointSpan& selection_indices, const BaseOptions& options,
      InterpreterManager* interpreter_manager,
      FeatureProcessor::EmbeddingCache* embedding_cache,
      std::vector<ClassificationResult>* classification_results,
      std::vector<Token>* tokens) const;

  // Same as above, but (for optimization) takes the context as UnicodeText and
  // takes the following extra arguments:
  //   - span_begin, span_end - iterators in context_unicode corresponding to
  //     selection_indices
  //   - line - a UnicodeTextRange within context_unicode corresponding to the
  //     line containing the selection - optional, can be given as nullptr
  bool ModelClassifyText(
      const UnicodeText& context_unicode,
      const std::vector<Token>& cached_tokens,
      const std::vector<Locale>& detected_text_language_tags,
      const UnicodeText::const_iterator& span_begin,
      const UnicodeText::const_iterator& span_end, const UnicodeTextRange* line,
      const CodepointSpan& selection_indices, const BaseOptions& options,
      InterpreterManager* interpreter_manager,
      FeatureProcessor::EmbeddingCache* embedding_cache,
      std::vector<ClassificationResult>* classification_results,
      std::vector<Token>* tokens) const;

  // Returns a relative token span that represents how many tokens on the left
  // from the selection and right from the selection are needed for the
  // classifier input.
  TokenSpan ClassifyTextUpperBoundNeededTokens() const;

  // Classifies the selected text with the regular expressions models.
  // Returns true if no error happened, false otherwise.
  bool RegexClassifyText(
      const std::string& context, const CodepointSpan& selection_indices,
      std::vector<ClassificationResult>* classification_result) const;

  // Classifies the selected text with the date time model.
  // Returns true if no error happened, false otherwise.
  bool DatetimeClassifyText(
      const std::string& context, const CodepointSpan& selection_indices,
      const ClassificationOptions& options,
      std::vector<ClassificationResult>* classification_results) const;

  // Chunks given input text with the selection model and classifies the spans
  // with the classification model.
  // The annotations are sorted by their position in the context string and
  // exclude spans classified as 'other'.
  // Provides the tokens produced during tokenization of the context string for
  // reuse.
  bool ModelAnnotate(const std::string& context,
                     const std::vector<Locale>& detected_text_language_tags,
                     const AnnotationOptions& options,
                     InterpreterManager* interpreter_manager,
                     std::vector<Token>* tokens,
                     std::vector<AnnotatedSpan>* result) const;

  // Groups the tokens into chunks. A chunk is a token span that should be the
  // suggested selection when any of its contained tokens is clicked. The chunks
  // are non-overlapping and are sorted by their position in the context string.
  // "num_tokens" is the total number of tokens available (as this method does
  // not need the actual vector of tokens).
  // "span_of_interest" is a span of all the tokens that could be clicked.
  // The resulting chunks all have to overlap with it and they cover this span
  // completely. The first and last chunk might extend beyond it.
  // The chunks vector is cleared before filling.
  bool ModelChunk(int num_tokens, const TokenSpan& span_of_interest,
                  tflite::Interpreter* selection_interpreter,
                  const CachedFeatures& cached_features,
                  std::vector<TokenSpan>* chunks) const;

  // A helper method for ModelChunk(). It generates scored chunk candidates for
  // a click context model.
  // NOTE: The returned chunks can (and most likely do) overlap.
  bool ModelClickContextScoreChunks(
      int num_tokens, const TokenSpan& span_of_interest,
      const CachedFeatures& cached_features,
      tflite::Interpreter* selection_interpreter,
      std::vector<ScoredChunk>* scored_chunks) const;

  // A helper method for ModelChunk(). It generates scored chunk candidates for
  // a bounds-sensitive model.
  // NOTE: The returned chunks can (and most likely do) overlap.
  bool ModelBoundsSensitiveScoreChunks(
      int num_tokens, const TokenSpan& span_of_interest,
      const TokenSpan& inference_span, const CachedFeatures& cached_features,
      tflite::Interpreter* selection_interpreter,
      std::vector<ScoredChunk>* scored_chunks) const;

  // Produces chunks isolated by a set of regular expressions.
  bool RegexChunk(const UnicodeText& context_unicode,
                  const std::vector<int>& rules,
                  bool is_serialized_entity_data_enabled,
                  const EnabledEntityTypes& enabled_entity_types,
                  const AnnotationUsecase& annotation_usecase,
                  std::vector<AnnotatedSpan>* result) const;

  // Produces chunks from the datetime parser.
  bool DatetimeChunk(const UnicodeText& context_unicode,
                     int64 reference_time_ms_utc,
                     const std::string& reference_timezone,
                     const std::string& locales, ModeFlag mode,
                     AnnotationUsecase annotation_usecase,
                     bool is_serialized_entity_data_enabled,
                     std::vector<AnnotatedSpan>* result) const;

  // Returns whether a classification should be filtered.
  bool FilteredForAnnotation(const AnnotatedSpan& span) const;
  bool FilteredForClassification(
      const ClassificationResult& classification) const;
  bool FilteredForSelection(const AnnotatedSpan& span) const;

  // Computes the selection boundaries from a regular expression match.
  CodepointSpan ComputeSelectionBoundaries(
      const UniLib::RegexMatcher* match,
      const RegexModel_::Pattern* config) const;

  // Returns whether a regex pattern provides entity data from a match.
  bool HasEntityData(const RegexModel_::Pattern* pattern) const;

  // Constructs and serializes entity data from regex matches.
  bool SerializedEntityDataFromRegexMatch(
      const RegexModel_::Pattern* pattern, UniLib::RegexMatcher* matcher,
      std::string* serialized_entity_data) const;

  // For knowledge candidates which have a ContactPointer, fill in the
  // appropriate contact metadata, if possible.
  void AddContactMetadataToKnowledgeClassificationResults(
      std::vector<AnnotatedSpan>* candidates) const;

  // Gets priority score from the list of classification results.
  float GetPriorityScore(
      const std::vector<ClassificationResult>& classification) const;

  // Verifies a regex match and returns true if verification was successful.
  bool VerifyRegexMatchCandidate(
      const std::string& context,
      const VerificationOptions* verification_options, const std::string& match,
      const UniLib::RegexMatcher* matcher) const;

  // The model flatbuffer. Defaults to nullptr so that it never holds an
  // indeterminate value before initialization.
  const Model* model_ = nullptr;

  std::unique_ptr<const ModelExecutor> selection_executor_;
  std::unique_ptr<const ModelExecutor> classification_executor_;
  std::unique_ptr<const EmbeddingExecutor> embedding_executor_;

  std::unique_ptr<const FeatureProcessor> selection_feature_processor_;
  std::unique_ptr<const FeatureProcessor> classification_feature_processor_;

  std::unique_ptr<const grammar::Analyzer> analyzer_;
  std::unique_ptr<const DatetimeGrounder> datetime_grounder_;
  std::unique_ptr<const DatetimeParser> datetime_parser_;
  std::unique_ptr<const GrammarAnnotator> grammar_annotator_;

  // Storage for resources the annotator owns itself.
  std::string owned_buffer_;
  std::unique_ptr<UniLib> owned_unilib_;
  std::unique_ptr<CalendarLib> owned_calendarlib_;

 private:
  // A regex pattern config from the model paired with its compiled pattern.
  struct CompiledRegexPattern {
    const RegexModel_::Pattern* config;
    std::unique_ptr<UniLib::RegexPattern> pattern;
  };

  // Removes annotations the entity type of which is not in the set of enabled
  // entity types.
  void RemoveNotEnabledEntityTypes(
      const EnabledEntityTypes& is_entity_type_enabled,
      std::vector<AnnotatedSpan>* annotated_spans) const;

  // Runs only annotators that do not support structured input. Does conflict
  // resolution, removal of disallowed entities and sorting on both new
  // generated candidates and passed in entities.
  // Returns Status::Error if the annotation failed, in which case the vector of
  // candidates should be ignored.
  Status AnnotateSingleInput(const std::string& context,
                             const AnnotationOptions& options,
                             std::vector<AnnotatedSpan>* candidates) const;

  // Parses the money amount into whole and decimal part and fills in the
  // entity data information.
  bool ParseAndFillInMoneyAmount(std::string* serialized_entity_data,
                                 const UniLib::RegexMatcher* match,
                                 const RegexModel_::Pattern* config,
                                 const UnicodeText& context_unicode) const;

  // Given the regex capturing groups, extract the one representing the money
  // quantity and fills in the actual string and the power of 10 the amount
  // should be multiplied with.
  void GetMoneyQuantityFromCapturingGroup(const UniLib::RegexMatcher* match,
                                          const RegexModel_::Pattern* config,
                                          const UnicodeText& context_unicode,
                                          std::string* quantity,
                                          int* exponent) const;

  // Returns true if any of the ff-model entity types is enabled.
  bool IsAnyModelEntityTypeEnabled(
      const EnabledEntityTypes& is_entity_type_enabled) const;

  // Returns true if any of the regex entity types is enabled.
  bool IsAnyRegexEntityTypeEnabled(
      const EnabledEntityTypes& is_entity_type_enabled) const;

  // Returns true if any of the POD NER entity types is enabled.
  bool IsAnyPodNerEntityTypeEnabled(
      const EnabledEntityTypes& is_entity_type_enabled) const;

  std::unique_ptr<ScopedMmap> mmap_;
  bool initialized_ = false;
  bool enabled_for_annotation_ = false;
  bool enabled_for_classification_ = false;
  bool enabled_for_selection_ = false;
  std::unordered_set<std::string> filtered_collections_annotation_;
  std::unordered_set<std::string> filtered_collections_classification_;
  std::unordered_set<std::string> filtered_collections_selection_;

  std::vector<CompiledRegexPattern> regex_patterns_;

  // Indices into regex_patterns_ for the different modes.
  std::vector<int> annotation_regex_patterns_;
  std::vector<int> classification_regex_patterns_;
  std::vector<int> selection_regex_patterns_;

  // Raw pointers to the libraries in use. Default to nullptr so that they
  // never hold an indeterminate value before initialization.
  const UniLib* unilib_ = nullptr;
  const CalendarLib* calendarlib_ = nullptr;

  std::unique_ptr<const KnowledgeEngine> knowledge_engine_;
  std::unique_ptr<const ContactEngine> contact_engine_;
  std::unique_ptr<const InstalledAppEngine> installed_app_engine_;
  std::unique_ptr<const NumberAnnotator> number_annotator_;
  std::unique_ptr<const DurationAnnotator> duration_annotator_;
  std::unique_ptr<const PersonNameEngine> person_name_engine_;
  std::unique_ptr<const TranslateAnnotator> translate_annotator_;
  std::unique_ptr<const PodNerAnnotator> pod_ner_annotator_;
  std::unique_ptr<const ExperimentalAnnotator> experimental_annotator_;
  std::unique_ptr<const VocabAnnotator> vocab_annotator_;

  // Builder for creating extra data.
  // The schema pointer defaults to nullptr so that entity_data_schema() never
  // returns an indeterminate value.
  const reflection::Schema* entity_data_schema_ = nullptr;
  std::unique_ptr<MutableFlatbufferBuilder> entity_data_builder_;

  // Locales for which the entire model triggers.
  std::vector<Locale> model_triggering_locales_;

  // Locales for which the ML model triggers.
  std::vector<Locale> ml_model_triggering_locales_;

  // Locales that the dictionary classification support.
  std::vector<Locale> dictionary_locales_;

  // Decimal and thousands number separators.
  std::unordered_set<char32> money_separators_;

  // Model for language identification.
  const libtextclassifier3::mobile::lang_id::LangId* lang_id_ = nullptr;

  // If true, will prioritize the longest annotation during conflict resolution.
  bool prioritize_longest_annotation_ = false;

  // If true, the annotator will perform conflict resolution between the
  // different sub-annotators also in the RAW mode. If false, no conflict
  // resolution will be performed in RAW mode.
  bool do_conflict_resolution_in_raw_mode_ = true;
};

namespace internal {

// If the initial 'span' contains only whitespace, moves the selection to a
// single-codepoint selection on the left side of that block of whitespace.
// 'span' is expressed in codepoint offsets into 'context_unicode'.
// NOTE(review): presumably 'span' is returned unchanged when it contains any
// non-whitespace codepoint - confirm against the definition.
CodepointSpan SnapLeftIfWhitespaceSelection(const CodepointSpan& span,
                                            const UnicodeText& context_unicode,
                                            const UniLib& unilib);

// Returns a copy of those tokens from 'cached_tokens' that lie within
// 'tokens_around_selection_to_copy' tokens (counted separately to the left
// and to the right) of the tokens that correspond to 'selection_indices'.
std::vector<Token> CopyCachedTokens(const std::vector<Token>& cached_tokens,
                                    const CodepointSpan& selection_indices,
                                    TokenSpan tokens_around_selection_to_copy);
}  // namespace internal

// Interprets the buffer as a Model flatbuffer and returns it for reading.
// 'buffer' points to the serialized model and 'size' is its length in bytes.
// NOTE(review): presumably the returned pointer refers into 'buffer', so the
// buffer has to outlive every use of the result - confirm against the
// definition.
const Model* ViewModel(const void* buffer, int size);

// Opens model from given path and runs a function, passing the loaded Model
// flatbuffer as an argument.
//
// This is mainly useful if we don't want to pay the cost for the model
// initialization because we'll be only reading some flatbuffer values from the
// file.
//
// 'function' is invoked exactly once and its result is returned:
//   - with nullptr if the file at 'path' could not be memory-mapped,
//   - otherwise with the result of ViewModel() on the mapped bytes.
// The mapping only lives for the duration of this call, so 'function' must not
// retain the Model pointer it is given.
template <typename ReturnType, typename Func>
ReturnType VisitAnnotatorModel(const std::string& path, Func function) {
  ScopedMmap mmap(path);
  if (!mmap.handle().ok()) {
    // Return here: falling through would read from the failed mapping and
    // invoke 'function' a second time.
    return function(/*model=*/nullptr);
  }
  const Model* model =
      ViewModel(mmap.handle().start(), mmap.handle().num_bytes());
  return function(model);
}

}  // namespace libtextclassifier3

#endif  // LIBTEXTCLASSIFIER_ANNOTATOR_ANNOTATOR_H_