Merge remote-tracking branch 'goog/upstream-master' into androidx-platform-dev

* goog/upstream-master: Update Icing from upstream. Update Icing from upstream. Descriptions: ======================================================================== Cache an instance of UBreakIterator to reduce unnecessary creations. ======================================================================== Cap number of individual IntegerIndexStorages that IntegerIndex creates. ======================================================================== Change error in trimRightMostNode from Unimplemented to InvalidArgument. ======================================================================== Add detection for new language features of List Filters Query Language. ======================================================================== Add option to control threshold to rebuild index during optimize by flag. ======================================================================== Add option to control use of namespace id to build urimapper by flag. ======================================================================== Enforce schema validation for joinable config. ======================================================================== Adopt bucket splitting for IntegerIndexStorage. ======================================================================== Implement bucket splitting function. ======================================================================== Add Icing initialization unit tests for QualifiedIdTypeJoinableIndex. ======================================================================== Add Icing schema change unit tests for QualifiedIdTypeJoinableIndex. ======================================================================== Add Icing optimization unit tests for QualifiedIdTypeJoinableIndex. ======================================================================== Integrate QualifiedIdTypeJoinableIndex into IcingSearchEngine. 
======================================================================== Implement QualifiedIdJoinablePropertyIndexingHandler. ======================================================================== Change QualifiedIdTypeJoinableIndex to store raw qualified id string. ======================================================================== Pass info about unnormalized query terms through lexer/parser/visitor. ======================================================================== Integrate Advanced Query w/ Suggest, make ADVANCED_QUERY default parser. ====================================================================== Bug: 208654892 Bug: 263890397 Bug: 259743562 Bug: 272145329 Bug: 227356108 Change-Id: I44de5853bb6c55b42800ae34d8071016be6c87cd
author: Tim Barron <tjbarron@google.com> 2023-03-14 09:57:47 -0700
committer: Tim Barron <tjbarron@google.com> 2023-03-14 09:57:47 -0700
commit: c1e7edff54723138756063ee4b7948c1ee91277e (patch)
tree: b2a55e543a6c9396631feaab459bfd671a8bc400 /icing/query/advanced_query_parser/lexer.cc
parent: 140aaee3e7b269f02599310e42d6172090ce02d2 (diff)
parent: d5c81ae0c41ae9c1aefb3601f3836570b9f686c7 (diff)
download: icing-c1e7edff54723138756063ee4b7948c1ee91277e.tar.gz
1 files changed, 44 insertions, 27 deletions
diff --git a/icing/query/advanced_query_parser/lexer.cc b/icing/query/advanced_query_parser/lexer.cc
index 6cddd96..0dd0bb0 100644
--- a/icing/query/advanced_query_parser/lexer.cc
+++ b/icing/query/advanced_query_parser/lexer.cc
@@ -38,12 +38,13 @@ bool Lexer::ConsumeWhitespace() {
 }
 
 bool Lexer::ConsumeQuerySingleChar() {
+  std::string_view original_text = query_.substr(current_index_, 1);
   switch (current_char_) {
     case ':':
-      tokens_.push_back({":", TokenType::COMPARATOR});
+      tokens_.push_back({":", original_text, TokenType::COMPARATOR});
       break;
     case '*':
-      tokens_.push_back({"", TokenType::STAR});
+      tokens_.push_back({"", original_text, TokenType::STAR});
       break;
     case '-':
       if (in_text_) {
@@ -51,7 +52,7 @@ bool Lexer::ConsumeQuerySingleChar() {
         // in the middle of a TEXT segment (ex. `foo-bar`).
         return false;
       }
-      tokens_.push_back({"", TokenType::MINUS});
+      tokens_.push_back({"", original_text, TokenType::MINUS});
       break;
     default:
       return false;
@@ -61,18 +62,19 @@ bool Lexer::ConsumeQuerySingleChar() {
 }
 
 bool Lexer::ConsumeScoringSingleChar() {
+  std::string_view original_text = query_.substr(current_index_, 1);
   switch (current_char_) {
     case '+':
-      tokens_.push_back({"", TokenType::PLUS});
+      tokens_.push_back({"", original_text, TokenType::PLUS});
       break;
     case '*':
-      tokens_.push_back({"", TokenType::TIMES});
+      tokens_.push_back({"", original_text, TokenType::TIMES});
       break;
     case '/':
-      tokens_.push_back({"", TokenType::DIV});
+      tokens_.push_back({"", original_text, TokenType::DIV});
       break;
     case '-':
-      tokens_.push_back({"", TokenType::MINUS});
+      tokens_.push_back({"", original_text, TokenType::MINUS});
       break;
     default:
       return false;
@@ -82,18 +84,19 @@ bool Lexer::ConsumeScoringSingleChar() {
 }
 
 bool Lexer::ConsumeGeneralSingleChar() {
+  std::string_view original_text = query_.substr(current_index_, 1);
   switch (current_char_) {
     case ',':
-      tokens_.push_back({"", TokenType::COMMA});
+      tokens_.push_back({"", original_text, TokenType::COMMA});
       break;
     case '.':
-      tokens_.push_back({"", TokenType::DOT});
+      tokens_.push_back({"", original_text, TokenType::DOT});
       break;
     case '(':
-      tokens_.push_back({"", TokenType::LPAREN});
+      tokens_.push_back({"", original_text, TokenType::LPAREN});
       break;
     case ')':
-      tokens_.push_back({"", TokenType::RPAREN});
+      tokens_.push_back({"", original_text, TokenType::RPAREN});
       break;
     default:
       return false;
@@ -124,13 +127,17 @@ bool Lexer::ConsumeComparator() {
   // Matching for '<=', '>=', '!=', or '=='.
   char next_char = PeekNext(1);
   if (next_char == '=') {
-    tokens_.push_back({{current_char_, next_char}, TokenType::COMPARATOR});
+    tokens_.push_back({{current_char_, next_char},
+                       query_.substr(current_index_, 2),
+                       TokenType::COMPARATOR});
     Advance(2);
     return true;
   }
   // Now, next_char must not be '='. Let's match for '<' and '>'.
   if (current_char_ == '<' || current_char_ == '>') {
-    tokens_.push_back({{current_char_}, TokenType::COMPARATOR});
+    tokens_.push_back({{current_char_},
+                       query_.substr(current_index_, 1),
+                       TokenType::COMPARATOR});
     Advance();
     return true;
   }
@@ -145,10 +152,11 @@ bool Lexer::ConsumeAndOr() {
   if (current_char_ != next_char) {
     return false;
   }
+  std::string_view original_text = query_.substr(current_index_, 2);
   if (current_char_ == '&') {
-    tokens_.push_back({"", TokenType::AND});
+    tokens_.push_back({"", original_text, TokenType::AND});
   } else {
-    tokens_.push_back({"", TokenType::OR});
+    tokens_.push_back({"", original_text, TokenType::OR});
   }
   Advance(2);
   return true;
@@ -158,37 +166,42 @@ bool Lexer::ConsumeStringLiteral() {
   if (current_char_ != '"') {
     return false;
   }
-  std::string text;
   Advance();
+  int32_t unnormalized_start_pos = current_index_;
   while (current_char_ != '\0' && current_char_ != '"') {
     // When getting a backslash, we will always match the next character, even
     // if the next character is a quotation mark
     if (current_char_ == '\\') {
-      text.push_back(current_char_);
       Advance();
       if (current_char_ == '\0') {
         // In this case, we are missing a terminating quotation mark.
         break;
       }
     }
-    text.push_back(current_char_);
     Advance();
   }
   if (current_char_ == '\0') {
     SyntaxError("missing terminating \" character");
     return false;
   }
-  tokens_.push_back({text, TokenType::STRING});
+  int32_t unnormalized_length = current_index_ - unnormalized_start_pos;
+  std::string_view raw_token_text =
+      query_.substr(unnormalized_start_pos, unnormalized_length);
+  std::string token_text(raw_token_text);
+  tokens_.push_back({std::move(token_text), raw_token_text, TokenType::STRING});
   Advance();
   return true;
 }
 
-bool Lexer::Text() {
+bool Lexer::ConsumeText() {
   if (current_char_ == '\0') {
     return false;
   }
-  tokens_.push_back({"", TokenType::TEXT});
+  tokens_.push_back({"", query_.substr(current_index_, 0), TokenType::TEXT});
   int token_index = tokens_.size() - 1;
+
+  int32_t unnormalized_start_pos = current_index_;
+  int32_t unnormalized_end_pos = current_index_;
   while (!ConsumeNonText() && current_char_ != '\0') {
     in_text_ = true;
     // When getting a backslash in TEXT, unescape it by accepting its following
@@ -203,14 +216,18 @@ bool Lexer::Text() {
     }
     tokens_[token_index].text.push_back(current_char_);
     Advance();
-    if (current_char_ == '(') {
-      // A TEXT followed by a LPAREN is a FUNCTION_NAME.
-      tokens_.back().type = TokenType::FUNCTION_NAME;
-      // No need to break, since NonText() must be true at this point.
-    }
+    unnormalized_end_pos = current_index_;
   }
   in_text_ = false;
 
+  tokens_[token_index].original_text = query_.substr(
+      unnormalized_start_pos, unnormalized_end_pos - unnormalized_start_pos);
+  if (unnormalized_end_pos < query_.length() &&
+      query_[unnormalized_end_pos] == '(') {
+    // A TEXT followed by a LPAREN is a FUNCTION_NAME.
+    tokens_[token_index].type = TokenType::FUNCTION_NAME;
+  }
+
   if (language_ == Lexer::Language::QUERY) {
     std::string &text = tokens_[token_index].text;
     TokenType &type = tokens_[token_index].type;
@@ -234,7 +251,7 @@ Lexer::ExtractTokens() {
     // Clear out any non-text before matching a Text.
     while (ConsumeNonText()) {
     }
-    Text();
+    ConsumeText();
   }
   if (!error_.empty()) {
     return absl_ports::InvalidArgumentError(
author	Tim Barron <tjbarron@google.com>	2023-03-14 09:57:47 -0700
committer	Tim Barron <tjbarron@google.com>	2023-03-14 09:57:47 -0700
commit	c1e7edff54723138756063ee4b7948c1ee91277e (patch)
tree	b2a55e543a6c9396631feaab459bfd671a8bc400 /icing/query/advanced_query_parser/lexer.cc
parent	140aaee3e7b269f02599310e42d6172090ce02d2 (diff)
parent	d5c81ae0c41ae9c1aefb3601f3836570b9f686c7 (diff)
download	icing-c1e7edff54723138756063ee4b7948c1ee91277e.tar.gz