diff options
Diffstat (limited to 'icing/tokenization/tokenizer.h')
-rw-r--r-- | icing/tokenization/tokenizer.h | 21 |
1 files changed, 15 insertions, 6 deletions
diff --git a/icing/tokenization/tokenizer.h b/icing/tokenization/tokenizer.h
index fb7613f..3336266 100644
--- a/icing/tokenization/tokenizer.h
+++ b/icing/tokenization/tokenizer.h
@@ -22,6 +22,7 @@
 
 #include "icing/text_classifier/lib3/utils/base/statusor.h"
 #include "icing/absl_ports/canonical_errors.h"
+#include "icing/tokenization/language-segmenter.h"
 #include "icing/tokenization/token.h"
 #include "icing/util/character-iterator.h"
 
@@ -33,8 +34,10 @@ namespace lib {
 // iterator or a list of tokens. Example usage:
 //
 //   std::unique_ptr<Tokenizer> tokenizer = GetTokenizer();
-//   ICING_ASSIGN_OR_RETURN(std::unique_ptr<Tokenizer::Iterator> iter,
-//                          tokenizer->Tokenize(text));
+//   ICING_ASSIGN_OR_RETURN(
+//       std::unique_ptr<Tokenizer::Iterator> iter,
+//       tokenizer->Tokenize(text,
+//                           LanguageSegmenter::AccessType::kForwardIterator));
 //   ICING_ASSIGN_OR_RETURN(std::vector<Token> tokens,
 //                          tokenizer->TokenizeAll(text));
 class Tokenizer {
@@ -76,7 +79,10 @@ class Tokenizer {
     // offset. Returns false if there are no valid tokens starting after
     // offset.
     // Ex.
-    //   auto iterator = tokenizer.Tokenize("foo bar baz").ValueOrDie();
+    //   auto iterator =
+    //       tokenizer.Tokenize("foo bar baz",
+    //                          LanguageSegmenter::AccessType::kForwardIterator)
+    //           .ValueOrDie();
     //   iterator.ResetToTokenStartingAfter(4);
     //   // The first full token starting after position 4 (the 'b' in "bar") is
     //   // "baz".
@@ -89,8 +95,10 @@ class Tokenizer {
     // offset. Returns false if there are no valid tokens ending
     // before offset.
     // Ex.
-    //   auto iterator = tokenizer.Tokenize("foo bar baz").ValueOrDie();
-    //   iterator.ResetToTokenEndingBefore(4);
+    //   auto iterator =
+    //       tokenizer.Tokenize("foo bar baz",
+    //                          LanguageSegmenter::AccessType::kForwardIterator)
+    //           .ValueOrDie(); // iterator.ResetToTokenEndingBefore(4);
     //   // The first full token ending before position 4 (the 'b' in "bar") is
     //   // "foo".
     //   PrintToken(iterator.GetToken());  // prints "foo"
@@ -111,7 +119,8 @@ class Tokenizer {
   //   types.
   //   INTERNAL_ERROR if any other errors occur
   virtual libtextclassifier3::StatusOr<std::unique_ptr<Iterator>> Tokenize(
-      std::string_view text) const = 0;
+      std::string_view text,
+      LanguageSegmenter::AccessType access_type) const = 0;
 
   // Tokenizes and returns all tokens in the input text. The input text should
   // outlive the returned vector.