aboutsummaryrefslogtreecommitdiff
path: root/icing/tokenization/tokenizer.h
blob: b4f0c6eff57ea259775c9a67f133cb36a0f0da6e (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
// Copyright (C) 2019 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//      http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#ifndef ICING_TOKENIZATION_TOKENIZER_H_
#define ICING_TOKENIZATION_TOKENIZER_H_

#include <cstdint>
#include <memory>
#include <string_view>
#include <vector>

#include "icing/text_classifier/lib3/utils/base/statusor.h"
#include "icing/absl_ports/canonical_errors.h"
#include "icing/tokenization/token.h"
#include "icing/util/character-iterator.h"

namespace icing {
namespace lib {

// Abstract base class that all other tokenizers should inherit from. It
// declares the interface callers use to tokenize text, either lazily through
// an iterator or eagerly into a list of tokens. Example usage:
//
// std::unique_ptr<Tokenizer> tokenizer = GetTokenizer();
// ICING_ASSIGN_OR_RETURN(std::unique_ptr<Tokenizer::Iterator> iter,
//                        tokenizer->Tokenize(text));
// ICING_ASSIGN_OR_RETURN(std::vector<Token> tokens,
//                        tokenizer->TokenizeAll(text));
class Tokenizer {
 public:
  virtual ~Tokenizer() = default;

  // Kinds of tokenizers. Kept as an unscoped enum so that existing references
  // such as Tokenizer::PLAIN keep compiling.
  enum Type {
    // Index tokenizers
    PLAIN,  // Used to tokenize plain text input

    // Query tokenizers
    RAW_QUERY,  // Used to tokenize raw queries
  };

  // An iterator helping to get tokens one at a time.
  // Example usage (iterator is the std::unique_ptr<Iterator> obtained from
  // Tokenizer::Tokenize()):
  //
  // while (iterator->Advance()) {
  //   Token token = iterator->GetToken();
  //   // Do something
  // }
  class Iterator {
   public:
    virtual ~Iterator() = default;

    // Advances to the next token. Returns false if it has reached the end.
    virtual bool Advance() = 0;

    // Returns the current token. It can be called only when Advance() returns
    // true, otherwise an invalid token could be returned.
    virtual Token GetToken() const = 0;

    // Optional operation. The base-class implementation does no work and
    // always returns the error built by absl_ports::UnimplementedError.
    // NOTE(review): judging by the name and return type, overrides presumably
    // yield a CharacterIterator positioned at the start of the current token
    // - confirm against the concrete tokenizers.
    virtual libtextclassifier3::StatusOr<CharacterIterator>
    CalculateTokenStart() {
      return absl_ports::UnimplementedError(
          "CalculateTokenStart is not implemented!");
    }

    // Optional operation. The base-class implementation does no work and
    // always returns the error built by absl_ports::UnimplementedError.
    // NOTE(review): judging by the name and return type, overrides presumably
    // yield a CharacterIterator positioned one past the end of the current
    // token - confirm against the concrete tokenizers.
    virtual libtextclassifier3::StatusOr<CharacterIterator>
    CalculateTokenEndExclusive() {
      return absl_ports::UnimplementedError(
          "CalculateTokenEndExclusive is not implemented!");
    }

    // Sets the iterator to point at the first token that *starts* *after*
    // offset. Returns false if there are no valid tokens starting after
    // offset.
    //
    // The base-class implementation performs no repositioning and always
    // returns false; iterators that support this operation are expected to
    // override it.
    // Ex.
    // auto iterator = tokenizer.Tokenize("foo bar baz").ValueOrDie();
    // iterator->ResetToTokenAfter(4);
    // // The first full token starting after position 4 (the 'b' in "bar") is
    // // "baz".
    // PrintToken(iterator->GetToken());  // prints "baz"
    //
    // offset is deliberately unused by this default body; [[maybe_unused]]
    // keeps -Wunused-parameter quiet for every includer of this header.
    virtual bool ResetToTokenAfter([[maybe_unused]] int32_t offset) {
      return false;
    }

    // Sets the iterator to point at the first token that *ends* *before*
    // offset. Returns false if there are no valid tokens ending
    // before offset.
    //
    // The base-class implementation performs no repositioning and always
    // returns false; iterators that support this operation are expected to
    // override it.
    // Ex.
    // auto iterator = tokenizer.Tokenize("foo bar baz").ValueOrDie();
    // iterator->ResetToTokenBefore(4);
    // // The first full token ending before position 4 (the 'b' in "bar") is
    // // "foo".
    // PrintToken(iterator->GetToken());  // prints "foo"
    //
    // offset is deliberately unused by this default body; [[maybe_unused]]
    // keeps -Wunused-parameter quiet for every includer of this header.
    virtual bool ResetToTokenBefore([[maybe_unused]] int32_t offset) {
      return false;
    }

    // Optional operation. The base-class implementation does nothing and
    // always returns false.
    // NOTE(review): overrides presumably move the iterator back to the first
    // token and return whether that succeeded - confirm against the concrete
    // tokenizers.
    virtual bool ResetToStart() { return false; }
  };

  // Tokenizes the input text. The input text should outlive the returned
  // iterator.
  //
  // Returns:
  //   A token iterator on success
  //   INVALID_ARGUMENT with error message if input text has a wrong syntax
  //                    according to implementations of different tokenizer
  //                    types.
  //   INTERNAL_ERROR if any other errors occur
  virtual libtextclassifier3::StatusOr<std::unique_ptr<Iterator>> Tokenize(
      std::string_view text) const = 0;

  // Tokenizes and returns all tokens in the input text. The input text should
  // outlive the returned vector.
  //
  // Returns:
  //   A list of tokens on success
  //   INVALID_ARGUMENT with error message if input text has a wrong syntax
  //                    according to implementations of different tokenizer
  //                    types.
  //   INTERNAL_ERROR if any other errors occur
  virtual libtextclassifier3::StatusOr<std::vector<Token>> TokenizeAll(
      std::string_view text) const = 0;
};

}  // namespace lib
}  // namespace icing

#endif  // ICING_TOKENIZATION_TOKENIZER_H_