aboutsummaryrefslogtreecommitdiff
path: root/icing/tokenization/tokenizer.h
diff options
context:
space:
mode:
Diffstat (limited to 'icing/tokenization/tokenizer.h')
-rw-r--r--icing/tokenization/tokenizer.h21
1 files changed, 15 insertions, 6 deletions
diff --git a/icing/tokenization/tokenizer.h b/icing/tokenization/tokenizer.h
index fb7613f..3336266 100644
--- a/icing/tokenization/tokenizer.h
+++ b/icing/tokenization/tokenizer.h
@@ -22,6 +22,7 @@
#include "icing/text_classifier/lib3/utils/base/statusor.h"
#include "icing/absl_ports/canonical_errors.h"
+#include "icing/tokenization/language-segmenter.h"
#include "icing/tokenization/token.h"
#include "icing/util/character-iterator.h"
@@ -33,8 +34,10 @@ namespace lib {
// iterator or a list of tokens. Example usage:
//
// std::unique_ptr<Tokenizer> tokenizer = GetTokenizer();
-// ICING_ASSIGN_OR_RETURN(std::unique_ptr<Tokenizer::Iterator> iter,
-// tokenizer->Tokenize(text));
+// ICING_ASSIGN_OR_RETURN(
+// std::unique_ptr<Tokenizer::Iterator> iter,
+// tokenizer->Tokenize(text,
+// LanguageSegmenter::AccessType::kForwardIterator));
// ICING_ASSIGN_OR_RETURN(std::vector<Token> tokens,
// tokenizer->TokenizeAll(text));
class Tokenizer {
@@ -76,7 +79,10 @@ class Tokenizer {
// offset. Returns false if there are no valid tokens starting after
// offset.
// Ex.
- // auto iterator = tokenizer.Tokenize("foo bar baz").ValueOrDie();
+ // auto iterator =
+ // tokenizer.Tokenize("foo bar baz",
+ // LanguageSegmenter::AccessType::kForwardIterator)
+ // .ValueOrDie();
// iterator.ResetToTokenStartingAfter(4);
// // The first full token starting after position 4 (the 'b' in "bar") is
// // "baz".
@@ -89,8 +95,10 @@ class Tokenizer {
// offset. Returns false if there are no valid tokens ending
// before offset.
// Ex.
- // auto iterator = tokenizer.Tokenize("foo bar baz").ValueOrDie();
- // iterator.ResetToTokenEndingBefore(4);
+ // auto iterator = tokenizer.Tokenize("foo bar baz",
+ // LanguageSegmenter::AccessType::kForwardIterator)
+ // .ValueOrDie();
+ // iterator.ResetToTokenEndingBefore(4);
// // The first full token ending before position 4 (the 'b' in "bar") is
// // "foo".
// PrintToken(iterator.GetToken()); // prints "foo"
@@ -111,7 +119,8 @@ class Tokenizer {
// types.
// INTERNAL_ERROR if any other errors occur
virtual libtextclassifier3::StatusOr<std::unique_ptr<Iterator>> Tokenize(
- std::string_view text) const = 0;
+ std::string_view text,
+ LanguageSegmenter::AccessType access_type) const = 0;
// Tokenizes and returns all tokens in the input text. The input text should
// outlive the returned vector.