5 files changed, 237 insertions, 58 deletions
diff --git a/icing/result/result-retriever_test.cc b/icing/result/result-retriever_test.cc
index 1c9684d..0d812e4 100644
--- a/icing/result/result-retriever_test.cc
+++ b/icing/result/result-retriever_test.cc
@@ -22,7 +22,6 @@
 #include "gtest/gtest.h"
 #include "icing/document-builder.h"
 #include "icing/file/mock-filesystem.h"
-#include "icing/helpers/icu/icu-data-file-helper.h"
 #include "icing/portable/equals-proto.h"
 #include "icing/portable/platform.h"
 #include "icing/proto/document.pb.h"
@@ -36,6 +35,7 @@
 #include "icing/store/document-id.h"
 #include "icing/testing/common-matchers.h"
 #include "icing/testing/fake-clock.h"
+#include "icing/testing/icu-data-file-helper.h"
 #include "icing/testing/snippet-helpers.h"
 #include "icing/testing/test-data.h"
 #include "icing/testing/tmp-directory.h"
@@ -55,14 +55,14 @@ using ::testing::IsEmpty;
 using ::testing::Return;
 using ::testing::SizeIs;
 
-constexpr PropertyConfigProto_Cardinality_Code CARDINALITY_OPTIONAL =
-    PropertyConfigProto_Cardinality_Code_OPTIONAL;
+constexpr PropertyConfigProto::Cardinality::Code CARDINALITY_OPTIONAL =
+    PropertyConfigProto::Cardinality::OPTIONAL;
 
-constexpr StringIndexingConfig_TokenizerType_Code TOKENIZER_PLAIN =
-    StringIndexingConfig_TokenizerType_Code_PLAIN;
+constexpr StringIndexingConfig::TokenizerType::Code TOKENIZER_PLAIN =
+    StringIndexingConfig::TokenizerType::PLAIN;
 
-constexpr TermMatchType_Code MATCH_EXACT = TermMatchType_Code_EXACT_ONLY;
-constexpr TermMatchType_Code MATCH_PREFIX = TermMatchType_Code_PREFIX;
+constexpr TermMatchType::Code MATCH_EXACT = TermMatchType::EXACT_ONLY;
+constexpr TermMatchType::Code MATCH_PREFIX = TermMatchType::PREFIX;
 
 class ResultRetrieverTest : public testing::Test {
  protected:
@@ -160,7 +160,7 @@ ResultSpecProto::SnippetSpecProto CreateSnippetSpec() {
   ResultSpecProto::SnippetSpecProto snippet_spec;
   snippet_spec.set_num_to_snippet(std::numeric_limits<int>::max());
   snippet_spec.set_num_matches_per_property(std::numeric_limits<int>::max());
-  snippet_spec.set_max_window_bytes(1024);
+  snippet_spec.set_max_window_utf32_length(1024);
   return snippet_spec;
 }
 
@@ -362,8 +362,8 @@ TEST_F(ResultRetrieverTest, NotIgnoreErrors) {
 
 TEST_F(ResultRetrieverTest, IOErrorShouldReturnInternalError) {
   MockFilesystem mock_filesystem;
-  ON_CALL(mock_filesystem, OpenForRead(_)).WillByDefault(Return(false));
-
+  ON_CALL(mock_filesystem, PRead(A<int>(), A<void*>(), A<size_t>(), A<off_t>()))
+      .WillByDefault(Return(false));
   ICING_ASSERT_OK_AND_ASSIGN(
       DocumentStore::CreateResult create_result,
       DocumentStore::Create(&mock_filesystem, test_dir_, &fake_clock_,
diff --git a/icing/result/result-state-manager_test.cc b/icing/result/result-state-manager_test.cc
index 32e45aa..8a9005d 100644
--- a/icing/result/result-state-manager_test.cc
+++ b/icing/result/result-state-manager_test.cc
@@ -849,7 +849,7 @@ TEST_F(ResultStateManagerTest, ShouldGetSnippetContext) {
   ResultSpecProto result_spec = CreateResultSpec(/*num_per_page=*/1);
   result_spec.mutable_snippet_spec()->set_num_to_snippet(5);
   result_spec.mutable_snippet_spec()->set_num_matches_per_property(5);
-  result_spec.mutable_snippet_spec()->set_max_window_bytes(5);
+  result_spec.mutable_snippet_spec()->set_max_window_utf32_length(5);
 
   SearchSpecProto search_spec;
   search_spec.set_term_match_type(TermMatchType::EXACT_ONLY);
@@ -884,7 +884,7 @@ TEST_F(ResultStateManagerTest, ShouldGetDefaultSnippetContext) {
   // 0 indicates no snippeting
   result_spec.mutable_snippet_spec()->set_num_to_snippet(0);
   result_spec.mutable_snippet_spec()->set_num_matches_per_property(0);
-  result_spec.mutable_snippet_spec()->set_max_window_bytes(0);
+  result_spec.mutable_snippet_spec()->set_max_window_utf32_length(0);
 
   SearchSpecProto search_spec;
   search_spec.set_term_match_type(TermMatchType::EXACT_ONLY);
diff --git a/icing/result/result-state_test.cc b/icing/result/result-state_test.cc
index f2121a5..d92fcfa 100644
--- a/icing/result/result-state_test.cc
+++ b/icing/result/result-state_test.cc
@@ -143,7 +143,7 @@ TEST_F(ResultStateTest, ShouldReturnSnippetContextAccordingToSpecs) {
   ResultSpecProto result_spec = CreateResultSpec(/*num_per_page=*/2);
   result_spec.mutable_snippet_spec()->set_num_to_snippet(5);
   result_spec.mutable_snippet_spec()->set_num_matches_per_property(5);
-  result_spec.mutable_snippet_spec()->set_max_window_bytes(5);
+  result_spec.mutable_snippet_spec()->set_max_window_utf32_length(5);
 
   SectionRestrictQueryTermsMap query_terms_map;
   query_terms_map.emplace("term1", std::unordered_set<std::string>());
@@ -178,7 +178,7 @@ TEST_F(ResultStateTest, NoSnippetingShouldReturnNull) {
   // stored.
   result_spec.mutable_snippet_spec()->set_num_to_snippet(0);
   result_spec.mutable_snippet_spec()->set_num_matches_per_property(5);
-  result_spec.mutable_snippet_spec()->set_max_window_bytes(5);
+  result_spec.mutable_snippet_spec()->set_max_window_utf32_length(5);
 
   SectionRestrictQueryTermsMap query_terms_map;
   query_terms_map.emplace("term1", std::unordered_set<std::string>());
diff --git a/icing/result/snippet-retriever.cc b/icing/result/snippet-retriever.cc
index c46762e..bd1524e 100644
--- a/icing/result/snippet-retriever.cc
+++ b/icing/result/snippet-retriever.cc
@@ -41,6 +41,7 @@
 #include "icing/transform/normalizer.h"
 #include "icing/util/character-iterator.h"
 #include "icing/util/i18n-utils.h"
+#include "icing/util/logging.h"
 #include "icing/util/status-macros.h"
 
 namespace icing {
@@ -75,6 +76,67 @@ inline std::string AddIndexToPath(int values_size, int index,
                             kRBracket);
 }
 
+// Returns token's text as a std::string. REGULAR tokens are normalized with
+// normalizer; all other types are returned as-is (non-VERBATIM ones also warn).
+std::string NormalizeToken(const Normalizer& normalizer, const Token& token) {
+  switch (token.type) {
+    case Token::Type::REGULAR:
+      return normalizer.NormalizeTerm(token.text);
+    case Token::Type::VERBATIM:
+      return std::string(token.text);
+    case Token::Type::QUERY_EXCLUSION:
+      [[fallthrough]];
+    case Token::Type::QUERY_LEFT_PARENTHESES:
+      [[fallthrough]];
+    case Token::Type::QUERY_RIGHT_PARENTHESES:
+      [[fallthrough]];
+    case Token::Type::QUERY_OR:
+      [[fallthrough]];
+    case Token::Type::QUERY_PROPERTY:
+      [[fallthrough]];
+    case Token::Type::INVALID:
+      ICING_LOG(WARNING) << "Unable to normalize token of type: "
+                         << static_cast<int>(token.type);
+      return std::string(token.text);
+  }
+}
+
+// Returns a CharacterIterator over token's text, positioned one past the last
+// character of the text that matches match_query_term.
+CharacterIterator FindMatchEnd(const Normalizer& normalizer, const Token& token,
+                               const std::string& match_query_term) {
+  switch (token.type) {
+    case Token::Type::VERBATIM: {
+      // VERBATIM tokens are not normalized. This means the non-normalized
+      // matched query term must be either equal to or a prefix of the token's
+      // text. Therefore, the match must end at the end of the matched query
+      // term.
+      CharacterIterator verbatim_match_end =
+          CharacterIterator(token.text, 0, 0, 0);
+      verbatim_match_end.AdvanceToUtf8(match_query_term.length());
+      return verbatim_match_end;
+    }
+    case Token::Type::QUERY_EXCLUSION:
+      [[fallthrough]];
+    case Token::Type::QUERY_LEFT_PARENTHESES:
+      [[fallthrough]];
+    case Token::Type::QUERY_RIGHT_PARENTHESES:
+      [[fallthrough]];
+    case Token::Type::QUERY_OR:
+      [[fallthrough]];
+    case Token::Type::QUERY_PROPERTY:
+      [[fallthrough]];
+    case Token::Type::INVALID:
+      ICING_LOG(WARNING)
+          << "Unexpected Token type " << static_cast<int>(token.type)
+          << " found when finding match end of query term and token.";
+      [[fallthrough]];
+    case Token::Type::REGULAR:
+      return normalizer.FindNormalizedMatchEndPosition(token.text,
+                                                       match_query_term);
+  }
+}
+
 class TokenMatcher {
  public:
   virtual ~TokenMatcher() = default;
@@ -102,15 +164,16 @@ class TokenMatcherExact : public TokenMatcher {
         normalizer_(normalizer) {}
 
   CharacterIterator Matches(Token token) const override {
-    std::string s = normalizer_.NormalizeTerm(token.text);
+    std::string s = NormalizeToken(normalizer_, token);
     auto itr = unrestricted_query_terms_.find(s);
     if (itr == unrestricted_query_terms_.end()) {
       itr = restricted_query_terms_.find(s);
     }
     if (itr != unrestricted_query_terms_.end() &&
         itr != restricted_query_terms_.end()) {
-      return normalizer_.FindNormalizedMatchEndPosition(token.text, *itr);
+      return FindMatchEnd(normalizer_, token, *itr);
     }
+
     return CharacterIterator(token.text, -1, -1, -1);
   }
 
@@ -131,19 +194,17 @@ class TokenMatcherPrefix : public TokenMatcher {
         normalizer_(normalizer) {}
 
   CharacterIterator Matches(Token token) const override {
-    std::string s = normalizer_.NormalizeTerm(token.text);
+    std::string s = NormalizeToken(normalizer_, token);
     for (const std::string& query_term : unrestricted_query_terms_) {
       if (query_term.length() <= s.length() &&
           s.compare(0, query_term.length(), query_term) == 0) {
-        return normalizer_.FindNormalizedMatchEndPosition(token.text,
-                                                          query_term);
+        return FindMatchEnd(normalizer_, token, query_term);
       }
     }
     for (const std::string& query_term : restricted_query_terms_) {
       if (query_term.length() <= s.length() &&
           s.compare(0, query_term.length(), query_term) == 0) {
-        return normalizer_.FindNormalizedMatchEndPosition(token.text,
-                                                          query_term);
+        return FindMatchEnd(normalizer_, token, query_term);
       }
     }
     return CharacterIterator(token.text, -1, -1, -1);
@@ -184,7 +245,7 @@ libtextclassifier3::StatusOr<CharacterIterator> DetermineWindowStart(
     const ResultSpecProto::SnippetSpecProto& snippet_spec,
     std::string_view value, int window_start_min_exclusive_utf32,
     Tokenizer::Iterator* iterator) {
-  if (!iterator->ResetToTokenAfter(window_start_min_exclusive_utf32)) {
+  if (!iterator->ResetToTokenStartingAfter(window_start_min_exclusive_utf32)) {
     return absl_ports::InternalError(
         "Couldn't reset tokenizer to determine snippet window!");
   }
@@ -219,7 +280,7 @@ libtextclassifier3::StatusOr<CharacterIterator> DetermineWindowEnd(
     const ResultSpecProto::SnippetSpecProto& snippet_spec,
     std::string_view value, int window_end_max_exclusive_utf32,
     Tokenizer::Iterator* iterator) {
-  if (!iterator->ResetToTokenBefore(window_end_max_exclusive_utf32)) {
+  if (!iterator->ResetToTokenEndingBefore(window_end_max_exclusive_utf32)) {
     return absl_ports::InternalError(
         "Couldn't reset tokenizer to determine snippet window!");
   }
@@ -283,9 +344,9 @@ libtextclassifier3::StatusOr<SnippetMatchProto> RetrieveMatch(
   int match_len_utf32 = end_itr.utf32_index() - match_pos_utf32;
   int match_mid_utf32 = match_pos_utf32 + match_len_utf32 / 2;
   int window_start_min_exclusive_utf32 =
-      (match_mid_utf32 - snippet_spec.max_window_bytes() / 2) - 1;
+      (match_mid_utf32 - snippet_spec.max_window_utf32_length() / 2) - 1;
   int window_end_max_exclusive_utf32 =
-      match_mid_utf32 + (snippet_spec.max_window_bytes() + 1) / 2;
+      match_mid_utf32 + (snippet_spec.max_window_utf32_length() + 1) / 2;
 
   snippet_match.set_exact_match_byte_position(start_itr.utf8_index());
   snippet_match.set_exact_match_utf16_position(start_itr.utf16_index());
@@ -296,7 +357,7 @@ libtextclassifier3::StatusOr<SnippetMatchProto> RetrieveMatch(
 
   // Only include windows if it'll at least include the matched text. Otherwise,
   // it'll just be an empty string anyways.
-  if (snippet_spec.max_window_bytes() >= match_len_utf32) {
+  if (snippet_spec.max_window_utf32_length() >= match_len_utf32) {
     // Find the beginning of the window.
     ICING_ASSIGN_OR_RETURN(
         CharacterIterator window_start,
@@ -337,8 +398,13 @@ libtextclassifier3::StatusOr<SnippetMatchProto> RetrieveMatch(
 
     // DetermineWindowStart/End may change the position of the iterator. So,
     // reset the iterator back to the original position.
-    bool success = (match_pos_utf32 > 0) ? iterator->ResetToTokenAfter(match_pos_utf32 - 1)
-                                   : iterator->ResetToStart();
+    bool success = false;
+    if (match_pos_utf32 > 0) {
+      success = iterator->ResetToTokenStartingAfter(match_pos_utf32 - 1);
+    } else {
+      success = iterator->ResetToStart();
+    }
+
     if (!success) {
       return absl_ports::InternalError(
           "Couldn't reset tokenizer to determine snippet window!");
diff --git a/icing/result/snippet-retriever_test.cc b/icing/result/snippet-retriever_test.cc
index f811941..0de2295 100644
--- a/icing/result/snippet-retriever_test.cc
+++ b/icing/result/snippet-retriever_test.cc
@@ -22,7 +22,6 @@
 #include "gtest/gtest.h"
 #include "icing/document-builder.h"
 #include "icing/file/mock-filesystem.h"
-#include "icing/helpers/icu/icu-data-file-helper.h"
 #include "icing/portable/equals-proto.h"
 #include "icing/portable/platform.h"
 #include "icing/proto/document.pb.h"
@@ -37,6 +36,7 @@
 #include "icing/store/key-mapper.h"
 #include "icing/testing/common-matchers.h"
 #include "icing/testing/fake-clock.h"
+#include "icing/testing/icu-data-file-helper.h"
 #include "icing/testing/jni-test-helpers.h"
 #include "icing/testing/snippet-helpers.h"
 #include "icing/testing/test-data.h"
@@ -58,16 +58,18 @@ using ::testing::Eq;
 using ::testing::IsEmpty;
 using ::testing::SizeIs;
 
-constexpr PropertyConfigProto_Cardinality_Code CARDINALITY_OPTIONAL =
-    PropertyConfigProto_Cardinality_Code_OPTIONAL;
-constexpr PropertyConfigProto_Cardinality_Code CARDINALITY_REPEATED =
-    PropertyConfigProto_Cardinality_Code_REPEATED;
+constexpr PropertyConfigProto::Cardinality::Code CARDINALITY_OPTIONAL =
+    PropertyConfigProto::Cardinality::OPTIONAL;
+constexpr PropertyConfigProto::Cardinality::Code CARDINALITY_REPEATED =
+    PropertyConfigProto::Cardinality::REPEATED;
 
-constexpr StringIndexingConfig_TokenizerType_Code TOKENIZER_PLAIN =
-    StringIndexingConfig_TokenizerType_Code_PLAIN;
+constexpr StringIndexingConfig::TokenizerType::Code TOKENIZER_PLAIN =
+    StringIndexingConfig::TokenizerType::PLAIN;
+constexpr StringIndexingConfig::TokenizerType::Code TOKENIZER_VERBATIM =
+    StringIndexingConfig::TokenizerType::VERBATIM;
 
-constexpr TermMatchType_Code MATCH_EXACT = TermMatchType_Code_EXACT_ONLY;
-constexpr TermMatchType_Code MATCH_PREFIX = TermMatchType_Code_PREFIX;
+constexpr TermMatchType::Code MATCH_EXACT = TermMatchType::EXACT_ONLY;
+constexpr TermMatchType::Code MATCH_PREFIX = TermMatchType::PREFIX;
 
 std::vector<std::string_view> GetPropertyPaths(const SnippetProto& snippet) {
   std::vector<std::string_view> paths;
@@ -131,7 +133,7 @@ class SnippetRetrieverTest : public testing::Test {
     snippet_spec_.set_num_to_snippet(std::numeric_limits<int32_t>::max());
     snippet_spec_.set_num_matches_per_property(
         std::numeric_limits<int32_t>::max());
-    snippet_spec_.set_max_window_bytes(64);
+    snippet_spec_.set_max_window_utf32_length(64);
   }
 
   void TearDown() override {
@@ -178,7 +180,7 @@ TEST_F(SnippetRetrieverTest, SnippetingWindowMaxWindowSizeSmallerThanMatch) {
 
   // Window starts at the beginning of "three" and ends in the middle of
   // "three". len=4, orig_window= "thre"
-  snippet_spec_.set_max_window_bytes(4);
+  snippet_spec_.set_max_window_utf32_length(4);
   SnippetProto snippet = snippet_retriever_->RetrieveSnippet(
       query_terms, MATCH_EXACT, snippet_spec_, document, section_mask);
 
@@ -204,7 +206,7 @@ TEST_F(SnippetRetrieverTest,
 
   // Window starts at the beginning of "three" and at the exact end of
   // "three". len=5, orig_window= "three"
-  snippet_spec_.set_max_window_bytes(5);
+  snippet_spec_.set_max_window_utf32_length(5);
   SnippetProto snippet = snippet_retriever_->RetrieveSnippet(
       query_terms, MATCH_EXACT, snippet_spec_, document, section_mask);
 
@@ -230,7 +232,7 @@ TEST_F(SnippetRetrieverTest,
 
   // Window starts at the beginning of "four" and at the exact end of
   // "four". len=4, orig_window= "four"
-  snippet_spec_.set_max_window_bytes(4);
+  snippet_spec_.set_max_window_utf32_length(4);
   SnippetProto snippet = snippet_retriever_->RetrieveSnippet(
       query_terms, MATCH_EXACT, snippet_spec_, document, section_mask);
 
@@ -262,7 +264,7 @@ TEST_F(SnippetRetrieverTest, SnippetingWindowMaxWindowStartsInWhitespace) {
   //   1. untrimmed, no-shifting window will be (2,17).
   //   2. trimmed, no-shifting window [4,13) "two three"
   //   3. trimmed, shifted window [4,18) "two three four"
-  snippet_spec_.set_max_window_bytes(14);
+  snippet_spec_.set_max_window_utf32_length(14);
   SnippetProto snippet = snippet_retriever_->RetrieveSnippet(
       query_terms, MATCH_EXACT, snippet_spec_, document, section_mask);
 
@@ -295,7 +297,7 @@ TEST_F(SnippetRetrieverTest, SnippetingWindowMaxWindowStartsMidToken) {
   //   1. untrimmed, no-shifting window will be (1,18).
   //   2. trimmed, no-shifting window [4,18) "two three four"
   //   3. trimmed, shifted window [4,20) "two three four.."
-  snippet_spec_.set_max_window_bytes(16);
+  snippet_spec_.set_max_window_utf32_length(16);
   SnippetProto snippet = snippet_retriever_->RetrieveSnippet(
       query_terms, MATCH_EXACT, snippet_spec_, document, section_mask);
 
@@ -321,7 +323,7 @@ TEST_F(SnippetRetrieverTest, SnippetingWindowMaxWindowEndsInPunctuation) {
 
   // Window ends in the middle of all the punctuation and window starts at 0.
   // len=20, orig_window="one two three four.."
-  snippet_spec_.set_max_window_bytes(20);
+  snippet_spec_.set_max_window_utf32_length(20);
   SnippetProto snippet = snippet_retriever_->RetrieveSnippet(
       query_terms, MATCH_EXACT, snippet_spec_, document, section_mask);
 
@@ -349,7 +351,7 @@ TEST_F(SnippetRetrieverTest,
 
   // Window ends in the middle of all the punctuation and window starts at 0.
   // len=26, orig_window="pside down in Australia¿"
-  snippet_spec_.set_max_window_bytes(24);
+  snippet_spec_.set_max_window_utf32_length(24);
   SnippetProto snippet = snippet_retriever_->RetrieveSnippet(
       query_terms, MATCH_EXACT, snippet_spec_, document, section_mask);
 
@@ -377,7 +379,7 @@ TEST_F(SnippetRetrieverTest,
 
   // Window ends in the middle of all the punctuation and window starts at 0.
   // len=26, orig_window="upside down in Australia¿ "
-  snippet_spec_.set_max_window_bytes(26);
+  snippet_spec_.set_max_window_utf32_length(26);
   SnippetProto snippet = snippet_retriever_->RetrieveSnippet(
       query_terms, MATCH_EXACT, snippet_spec_, document, section_mask);
 
@@ -410,7 +412,7 @@ TEST_F(SnippetRetrieverTest, SnippetingWindowMaxWindowStartsBeforeValueStart) {
   //   1. untrimmed, no-shifting window will be (-2,21).
   //   2. trimmed, no-shifting window [0,21) "one two three four..."
   //   3. trimmed, shifted window [0,22) "one two three four...."
-  snippet_spec_.set_max_window_bytes(22);
+  snippet_spec_.set_max_window_utf32_length(22);
   SnippetProto snippet = snippet_retriever_->RetrieveSnippet(
       query_terms, MATCH_EXACT, snippet_spec_, document, section_mask);
 
@@ -436,7 +438,7 @@ TEST_F(SnippetRetrieverTest, SnippetingWindowMaxWindowEndsInWhitespace) {
 
   // Window ends before "five" but after all the punctuation
   // len=26, orig_window="one two three four.... "
-  snippet_spec_.set_max_window_bytes(26);
+  snippet_spec_.set_max_window_utf32_length(26);
   SnippetProto snippet = snippet_retriever_->RetrieveSnippet(
       query_terms, MATCH_EXACT, snippet_spec_, document, section_mask);
 
@@ -469,7 +471,7 @@ TEST_F(SnippetRetrieverTest, SnippetingWindowMaxWindowEndsMidToken) {
   //   1. untrimmed, no-shifting window will be ((-7,26).
   //   2. trimmed, no-shifting window [0,26) "one two three four...."
   //   3. trimmed, shifted window [0,27) "one two three four.... five"
-  snippet_spec_.set_max_window_bytes(32);
+  snippet_spec_.set_max_window_utf32_length(32);
   SnippetProto snippet = snippet_retriever_->RetrieveSnippet(
       query_terms, MATCH_EXACT, snippet_spec_, document, section_mask);
 
@@ -495,7 +497,7 @@ TEST_F(SnippetRetrieverTest, SnippetingWindowMaxWindowSizeEqualToValueSize) {
 
   // Max window size equals the size of the value.
   // len=34, orig_window="one two three four.... five"
-  snippet_spec_.set_max_window_bytes(34);
+  snippet_spec_.set_max_window_utf32_length(34);
   SnippetProto snippet = snippet_retriever_->RetrieveSnippet(
       query_terms, MATCH_EXACT, snippet_spec_, document, section_mask);
 
@@ -521,7 +523,7 @@ TEST_F(SnippetRetrieverTest, SnippetingWindowMaxWindowSizeLargerThanValueSize) {
 
   // Max window size exceeds the size of the value.
   // len=36, orig_window="one two three four.... five"
-  snippet_spec_.set_max_window_bytes(36);
+  snippet_spec_.set_max_window_utf32_length(36);
   SnippetProto snippet = snippet_retriever_->RetrieveSnippet(
       query_terms, MATCH_EXACT, snippet_spec_, document, section_mask);
 
@@ -555,7 +557,7 @@ TEST_F(SnippetRetrieverTest, SnippetingWindowMatchAtTextStart) {
   //   1. untrimmed, no-shifting window will be (-10,19).
   //   2. trimmed, no-shifting window [0,19) "one two three four."
   //   3. trimmed, shifted window [0,27) "one two three four.... five"
-  snippet_spec_.set_max_window_bytes(28);
+  snippet_spec_.set_max_window_utf32_length(28);
   SnippetProto snippet = snippet_retriever_->RetrieveSnippet(
       query_terms, MATCH_EXACT, snippet_spec_, document, section_mask);
 
@@ -589,7 +591,7 @@ TEST_F(SnippetRetrieverTest, SnippetingWindowMatchAtTextEnd) {
   //   1. untrimmed, no-shifting window will be (10,39).
   //   2. trimmed, no-shifting window [14,31) "four.... five six"
   //   3. trimmed, shifted window [4,31) "two three four.... five six"
-  snippet_spec_.set_max_window_bytes(28);
+  snippet_spec_.set_max_window_utf32_length(28);
   SnippetProto snippet = snippet_retriever_->RetrieveSnippet(
       query_terms, MATCH_EXACT, snippet_spec_, document, section_mask);
 
@@ -623,7 +625,7 @@ TEST_F(SnippetRetrieverTest, SnippetingWindowMatchAtTextStartShortText) {
   //   1. untrimmed, no-shifting window will be (-10,19).
   //   2. trimmed, no-shifting window [0, 19) "one two three four."
   //   3. trimmed, shifted window [0, 22) "one two three four...."
-  snippet_spec_.set_max_window_bytes(28);
+  snippet_spec_.set_max_window_utf32_length(28);
   SnippetProto snippet = snippet_retriever_->RetrieveSnippet(
       query_terms, MATCH_EXACT, snippet_spec_, document, section_mask);
 
@@ -657,7 +659,7 @@ TEST_F(SnippetRetrieverTest, SnippetingWindowMatchAtTextEndShortText) {
   //   1. untrimmed, no-shifting window will be (1,30).
   //   2. trimmed, no-shifting window [4, 22) "two three four...."
   //   3. trimmed, shifted window [0, 22) "one two three four...."
-  snippet_spec_.set_max_window_bytes(28);
+  snippet_spec_.set_max_window_utf32_length(28);
   SnippetProto snippet = snippet_retriever_->RetrieveSnippet(
       query_terms, MATCH_EXACT, snippet_spec_, document, section_mask);
 
@@ -721,7 +723,7 @@ TEST_F(SnippetRetrieverTest, SimpleSnippetingNoWindowing) {
           .AddStringProperty("body", "Only a fool would match this content.")
           .Build();
 
-  snippet_spec_.set_max_window_bytes(0);
+  snippet_spec_.set_max_window_utf32_length(0);
 
   SectionIdMask section_mask = 0b00000011;
   SectionRestrictQueryTermsMap query_terms{{"", {"foo"}}};
@@ -1473,7 +1475,7 @@ TEST_F(SnippetRetrieverTest, CJKSnippetWindowTest) {
   //   1. untrimmed, no-shifting window will be (0,7).
   //   2. trimmed, no-shifting window [1, 6) "每天走路去".
   //   3. trimmed, shifted window [0, 6) "我每天走路去"
-  snippet_spec_.set_max_window_bytes(6);
+  snippet_spec_.set_max_window_utf32_length(6);
 
   SnippetProto snippet = snippet_retriever_->RetrieveSnippet(
       query_terms, MATCH_PREFIX, snippet_spec_, document, section_mask);
@@ -1572,7 +1574,7 @@ TEST_F(SnippetRetrieverTest, Utf16MultiCodeUnitWindowTest) {
   // UTF8 idx:       9   22
   // UTF16 idx:      5   12
   // UTF32 idx:      3   7
-  snippet_spec_.set_max_window_bytes(6);
+  snippet_spec_.set_max_window_utf32_length(6);
 
   SnippetProto snippet = snippet_retriever_->RetrieveSnippet(
       query_terms, MATCH_PREFIX, snippet_spec_, document, section_mask);
@@ -1596,6 +1598,117 @@ TEST_F(SnippetRetrieverTest, Utf16MultiCodeUnitWindowTest) {
   EXPECT_THAT(match_proto.window_utf16_length(), Eq(7));
 }
 
+TEST_F(SnippetRetrieverTest, SnippettingVerbatimAscii) {
+  SchemaProto schema =
+      SchemaBuilder()
+          .AddType(SchemaTypeConfigBuilder()
+                       .SetType("verbatimType")
+                       .AddProperty(PropertyConfigBuilder()
+                                        .SetName("verbatim")
+                                        .SetDataTypeString(MATCH_EXACT,
+                                                           TOKENIZER_VERBATIM)
+                                        .SetCardinality(CARDINALITY_REPEATED)))
+          .Build();
+  ICING_ASSERT_OK(schema_store_->SetSchema(
+      schema, /*ignore_errors_and_delete_documents=*/true));
+  ICING_ASSERT_OK_AND_ASSIGN(
+      snippet_retriever_,
+      SnippetRetriever::Create(schema_store_.get(), language_segmenter_.get(),
+                               normalizer_.get()));
+
+  DocumentProto document = DocumentBuilder()
+                               .SetKey("icing", "verbatim/1")
+                               .SetSchema("verbatimType")
+                               .AddStringProperty("verbatim", "Hello, world!")
+                               .Build();
+
+  SectionIdMask section_mask = 0b00000001;
+  SectionRestrictQueryTermsMap query_terms{{"", {"Hello, world!"}}};
+
+  snippet_spec_.set_max_window_utf32_length(13);
+  SnippetProto snippet = snippet_retriever_->RetrieveSnippet(
+      query_terms, MATCH_EXACT, snippet_spec_, document, section_mask);
+
+  // There should only be one snippet entry and match, the verbatim token in its
+  // entirety.
+  ASSERT_THAT(snippet.entries(), SizeIs(1));
+
+  const SnippetProto::EntryProto* entry = &snippet.entries(0);
+  ASSERT_THAT(entry->snippet_matches(), SizeIs(1));
+  ASSERT_THAT(entry->property_name(), "verbatim");
+
+  const SnippetMatchProto& match_proto = entry->snippet_matches(0);
+  // We expect the window to begin at byte position 0 and to span the entire
+  // token, which has a utf-16 length of 13.
+  EXPECT_THAT(match_proto.window_byte_position(), Eq(0));
+  EXPECT_THAT(match_proto.window_utf16_length(), Eq(13));
+
+  // We expect the submatch to begin at position 0 of the verbatim token and
+  // span the length of our query term "Hello, world!", which has utf-16 length
+  // of 13. The submatch length is equal to the window length because the
+  // snippet is retrieved with an exact term match.
+  EXPECT_THAT(match_proto.exact_match_utf16_position(), Eq(0));
+  EXPECT_THAT(match_proto.submatch_utf16_length(), Eq(13));
+}
+
+TEST_F(SnippetRetrieverTest, SnippettingVerbatimCJK) {
+  SchemaProto schema =
+      SchemaBuilder()
+          .AddType(SchemaTypeConfigBuilder()
+                       .SetType("verbatimType")
+                       .AddProperty(PropertyConfigBuilder()
+                                        .SetName("verbatim")
+                                        .SetDataTypeString(MATCH_PREFIX,
+                                                           TOKENIZER_VERBATIM)
+                                        .SetCardinality(CARDINALITY_REPEATED)))
+          .Build();
+  ICING_ASSERT_OK(schema_store_->SetSchema(
+      schema, /*ignore_errors_and_delete_documents=*/true));
+  ICING_ASSERT_OK_AND_ASSIGN(
+      snippet_retriever_,
+      SnippetRetriever::Create(schema_store_.get(), language_segmenter_.get(),
+                               normalizer_.get()));
+
+  // String:     "我每天走路去上班。"
+  //              ^ ^  ^   ^^
+  // UTF8 idx:    0 3  9  15 18
+  // UTF16 idx:   0 1  3   5 6
+  // UTF32 idx:   0 1  3   5 6
+  // Breaks into segments: "我", "每天", "走路", "去", "上班"
+  std::string chinese_string = "我每天走路去上班。";
+  DocumentProto document = DocumentBuilder()
+                               .SetKey("icing", "verbatim/1")
+                               .SetSchema("verbatimType")
+                               .AddStringProperty("verbatim", chinese_string)
+                               .Build();
+
+  SectionIdMask section_mask = 0b00000001;
+  SectionRestrictQueryTermsMap query_terms{{"", {"我每"}}};
+
+  snippet_spec_.set_max_window_utf32_length(9);
+  SnippetProto snippet = snippet_retriever_->RetrieveSnippet(
+      query_terms, MATCH_PREFIX, snippet_spec_, document, section_mask);
+
+  // There should only be one snippet entry and match, the verbatim token in its
+  // entirety.
+  ASSERT_THAT(snippet.entries(), SizeIs(1));
+
+  const SnippetProto::EntryProto* entry = &snippet.entries(0);
+  ASSERT_THAT(entry->snippet_matches(), SizeIs(1));
+  ASSERT_THAT(entry->property_name(), "verbatim");
+
+  const SnippetMatchProto& match_proto = entry->snippet_matches(0);
+  // We expect the window to begin at byte position 0 and to span the entire
+  // token, which has a utf-16 length of 9.
+  EXPECT_THAT(match_proto.window_byte_position(), Eq(0));
+  EXPECT_THAT(match_proto.window_utf16_length(), Eq(9));
+
+  // We expect the submatch to begin at position 0 of the verbatim token and
+  // span the length of our query term "我每", which has utf-16 length of 2.
+  EXPECT_THAT(match_proto.exact_match_utf16_position(), Eq(0));
+  EXPECT_THAT(match_proto.submatch_utf16_length(), Eq(2));
+}
+
 }  // namespace
 
 }  // namespace lib