1 files changed, 44 insertions, 27 deletions
diff --git a/icing/query/advanced_query_parser/lexer.cc b/icing/query/advanced_query_parser/lexer.cc
index 6cddd96..0dd0bb0 100644
--- a/icing/query/advanced_query_parser/lexer.cc
+++ b/icing/query/advanced_query_parser/lexer.cc
@@ -38,12 +38,13 @@ bool Lexer::ConsumeWhitespace() {
 }
 
 bool Lexer::ConsumeQuerySingleChar() {
+  std::string_view original_text = query_.substr(current_index_, 1);  // view into query_ at the cursor
   switch (current_char_) {
     case ':':
-      tokens_.push_back({":", TokenType::COMPARATOR});
+      tokens_.push_back({":", original_text, TokenType::COMPARATOR});  // colon is also stored as the token text
       break;
     case '*':
-      tokens_.push_back({"", TokenType::STAR});
+      tokens_.push_back({"", original_text, TokenType::STAR});  // token text stays empty; only the view is kept
       break;
     case '-':
       if (in_text_) {
@@ -51,7 +52,7 @@ bool Lexer::ConsumeQuerySingleChar() {
         // in the middle of a TEXT segment (ex. `foo-bar`).
         return false;
       }
-      tokens_.push_back({"", TokenType::MINUS});
+      tokens_.push_back({"", original_text, TokenType::MINUS});  // reached only when in_text_ is false
       break;
     default:
       return false;
@@ -61,18 +62,19 @@ bool Lexer::ConsumeQuerySingleChar() {
 }
 
 bool Lexer::ConsumeScoringSingleChar() {
+  std::string_view original_text = query_.substr(current_index_, 1);  // view into query_ at the cursor
   switch (current_char_) {
     case '+':
-      tokens_.push_back({"", TokenType::PLUS});
+      tokens_.push_back({"", original_text, TokenType::PLUS});  // token text stays empty; only the view is kept
       break;
     case '*':
-      tokens_.push_back({"", TokenType::TIMES});
+      tokens_.push_back({"", original_text, TokenType::TIMES});
       break;
     case '/':
-      tokens_.push_back({"", TokenType::DIV});
+      tokens_.push_back({"", original_text, TokenType::DIV});
       break;
     case '-':
-      tokens_.push_back({"", TokenType::MINUS});
+      tokens_.push_back({"", original_text, TokenType::MINUS});  // unlike the query variant, no in_text_ check here
       break;
     default:
       return false;
@@ -82,18 +84,19 @@ bool Lexer::ConsumeScoringSingleChar() {
 }
 
 bool Lexer::ConsumeGeneralSingleChar() {
+  std::string_view original_text = query_.substr(current_index_, 1);  // view into query_ at the cursor
   switch (current_char_) {
     case ',':
-      tokens_.push_back({"", TokenType::COMMA});
+      tokens_.push_back({"", original_text, TokenType::COMMA});  // token text stays empty; only the view is kept
       break;
     case '.':
-      tokens_.push_back({"", TokenType::DOT});
+      tokens_.push_back({"", original_text, TokenType::DOT});
       break;
     case '(':
-      tokens_.push_back({"", TokenType::LPAREN});
+      tokens_.push_back({"", original_text, TokenType::LPAREN});
       break;
     case ')':
-      tokens_.push_back({"", TokenType::RPAREN});
+      tokens_.push_back({"", original_text, TokenType::RPAREN});
       break;
     default:
       return false;
@@ -124,13 +127,17 @@ bool Lexer::ConsumeComparator() {
   // Matching for '<=', '>=', '!=', or '=='.
   char next_char = PeekNext(1);
   if (next_char == '=') {
-    tokens_.push_back({{current_char_, next_char}, TokenType::COMPARATOR});
+    tokens_.push_back({{current_char_, next_char},
+                       query_.substr(current_index_, 2),  // two-char view, matching the Advance(2) below
+                       TokenType::COMPARATOR});
     Advance(2);
     return true;
   }
   // Now, next_char must not be '='. Let's match for '<' and '>'.
   if (current_char_ == '<' || current_char_ == '>') {
-    tokens_.push_back({{current_char_}, TokenType::COMPARATOR});
+    tokens_.push_back({{current_char_},
+                       query_.substr(current_index_, 1),  // one-char view of the lone comparator
+                       TokenType::COMPARATOR});
     Advance();
     return true;
   }
@@ -145,10 +152,11 @@ bool Lexer::ConsumeAndOr() {
   if (current_char_ != next_char) {
     return false;
   }
+  std::string_view original_text = query_.substr(current_index_, 2);  // two-char view, matching the Advance(2) below
   if (current_char_ == '&') {
-    tokens_.push_back({"", TokenType::AND});
+    tokens_.push_back({"", original_text, TokenType::AND});  // token text stays empty; only the view is kept
   } else {
-    tokens_.push_back({"", TokenType::OR});
+    tokens_.push_back({"", original_text, TokenType::OR});
   }
   Advance(2);
   return true;
@@ -158,37 +166,42 @@ bool Lexer::ConsumeStringLiteral() {
   if (current_char_ != '"') {
     return false;
   }
-  std::string text;
   Advance();
+  int32_t unnormalized_start_pos = current_index_;  // cursor after the Advance() past the opening quote
   while (current_char_ != '\0' && current_char_ != '"') {
     // When getting a backslash, we will always match the next character, even
     // if the next character is a quotation mark
     if (current_char_ == '\\') {
-      text.push_back(current_char_);
       Advance();
       if (current_char_ == '\0') {
         // In this case, we are missing a terminating quotation mark.
         break;
       }
     }
-    text.push_back(current_char_);
     Advance();
   }
   if (current_char_ == '\0') {
     SyntaxError("missing terminating \" character");
     return false;
   }
-  tokens_.push_back({text, TokenType::STRING});
+  int32_t unnormalized_length = current_index_ - unnormalized_start_pos;  // cursor is on the closing quote, so it is excluded
+  std::string_view raw_token_text =
+      query_.substr(unnormalized_start_pos, unnormalized_length);
+  std::string token_text(raw_token_text);  // owned copy of the view; backslashes are not stripped
+  tokens_.push_back({std::move(token_text), raw_token_text, TokenType::STRING});
   Advance();
   return true;
 }
 
-bool Lexer::Text() {
+bool Lexer::ConsumeText() {
   if (current_char_ == '\0') {
     return false;
   }
-  tokens_.push_back({"", TokenType::TEXT});
+  tokens_.push_back({"", query_.substr(current_index_, 0), TokenType::TEXT});  // zero-length view for now
   int token_index = tokens_.size() - 1;
+
+  int32_t unnormalized_start_pos = current_index_;
+  int32_t unnormalized_end_pos = current_index_;  // exclusive end of the TEXT view; initially equal to the start
   while (!ConsumeNonText() && current_char_ != '\0') {
     in_text_ = true;
     // When getting a backslash in TEXT, unescape it by accepting its following
@@ -203,14 +216,18 @@ bool Lexer::Text() {
     }
     tokens_[token_index].text.push_back(current_char_);
     Advance();
-    if (current_char_ == '(') {
-      // A TEXT followed by a LPAREN is a FUNCTION_NAME.
-      tokens_.back().type = TokenType::FUNCTION_NAME;
-      // No need to break, since NonText() must be true at this point.
-    }
+    unnormalized_end_pos = current_index_;  // cursor after the char just appended to the token text
   }
   in_text_ = false;
 
+  tokens_[token_index].original_text = query_.substr(
+      unnormalized_start_pos, unnormalized_end_pos - unnormalized_start_pos);  // half-open range [start, end) of query_
+  if (unnormalized_end_pos < query_.length() &&  // bounds check before indexing query_
+      query_[unnormalized_end_pos] == '(') {
+    // A TEXT followed by a LPAREN is a FUNCTION_NAME.
+    tokens_[token_index].type = TokenType::FUNCTION_NAME;  // NOTE(review): indexed, presumably because ConsumeNonText() may have appended tokens - confirm
+  }
+
   if (language_ == Lexer::Language::QUERY) {
     std::string &text = tokens_[token_index].text;
     TokenType &type = tokens_[token_index].type;
@@ -234,7 +251,7 @@ Lexer::ExtractTokens() {
     // Clear out any non-text before matching a Text.
     while (ConsumeNonText()) {
     }
-    Text();
+    ConsumeText();  // result ignored; returns false without adding a token at the NUL terminator
   }
   if (!error_.empty()) {
     return absl_ports::InvalidArgumentError(