diff options
author | Tim Barron <tjbarron@google.com> | 2023-03-14 09:57:47 -0700 |
---|---|---|
committer | Tim Barron <tjbarron@google.com> | 2023-03-14 09:57:47 -0700 |
commit | c1e7edff54723138756063ee4b7948c1ee91277e (patch) | |
tree | b2a55e543a6c9396631feaab459bfd671a8bc400 /icing/query/advanced_query_parser/lexer.cc | |
parent | 140aaee3e7b269f02599310e42d6172090ce02d2 (diff) | |
parent | d5c81ae0c41ae9c1aefb3601f3836570b9f686c7 (diff) | |
download | icing-c1e7edff54723138756063ee4b7948c1ee91277e.tar.gz |
Merge remote-tracking branch 'goog/upstream-master' into androidx-platform-dev
* goog/upstream-master:
Update Icing from upstream.
Update Icing from upstream.
Descriptions:
========================================================================
Cache an instance of UBreakIterator to reduce unnecessary creations.
========================================================================
Cap number of individual IntegerIndexStorages that IntegerIndex creates.
========================================================================
Change error in trimRightMostNode from Unimplemented to InvalidArgument.
========================================================================
Add detection for new language features of List Filters Query Language.
========================================================================
Add option to control threshold to rebuild index during optimize by flag.
========================================================================
Add option to control use of namespace id to build urimapper by flag.
========================================================================
Enforce schema validation for joinable config.
========================================================================
Adopt bucket splitting for IntegerIndexStorage.
========================================================================
Implement bucket splitting function.
========================================================================
Add Icing initialization unit tests for QualifiedIdTypeJoinableIndex.
========================================================================
Add Icing schema change unit tests for QualifiedIdTypeJoinableIndex.
========================================================================
Add Icing optimization unit tests for QualifiedIdTypeJoinableIndex.
========================================================================
Integrate QualifiedIdTypeJoinableIndex into IcingSearchEngine.
========================================================================
Implement QualifiedIdJoinablePropertyIndexingHandler.
========================================================================
Change QualifiedIdTypeJoinableIndex to store raw qualified id string.
========================================================================
Pass info about unnormalized query terms through lexer/parser/visitor.
========================================================================
Integrate Advanced Query w/ Suggest, make ADVANCED_QUERY default parser.
======================================================================
Bug: 208654892
Bug: 263890397
Bug: 259743562
Bug: 272145329
Bug: 227356108
Change-Id: I44de5853bb6c55b42800ae34d8071016be6c87cd
Diffstat (limited to 'icing/query/advanced_query_parser/lexer.cc')
-rw-r--r-- | icing/query/advanced_query_parser/lexer.cc | 71 |
1 file changed, 44 insertions, 27 deletions
diff --git a/icing/query/advanced_query_parser/lexer.cc b/icing/query/advanced_query_parser/lexer.cc index 6cddd96..0dd0bb0 100644 --- a/icing/query/advanced_query_parser/lexer.cc +++ b/icing/query/advanced_query_parser/lexer.cc @@ -38,12 +38,13 @@ bool Lexer::ConsumeWhitespace() { } bool Lexer::ConsumeQuerySingleChar() { + std::string_view original_text = query_.substr(current_index_, 1); switch (current_char_) { case ':': - tokens_.push_back({":", TokenType::COMPARATOR}); + tokens_.push_back({":", original_text, TokenType::COMPARATOR}); break; case '*': - tokens_.push_back({"", TokenType::STAR}); + tokens_.push_back({"", original_text, TokenType::STAR}); break; case '-': if (in_text_) { @@ -51,7 +52,7 @@ bool Lexer::ConsumeQuerySingleChar() { // in the middle of a TEXT segment (ex. `foo-bar`). return false; } - tokens_.push_back({"", TokenType::MINUS}); + tokens_.push_back({"", original_text, TokenType::MINUS}); break; default: return false; @@ -61,18 +62,19 @@ bool Lexer::ConsumeQuerySingleChar() { } bool Lexer::ConsumeScoringSingleChar() { + std::string_view original_text = query_.substr(current_index_, 1); switch (current_char_) { case '+': - tokens_.push_back({"", TokenType::PLUS}); + tokens_.push_back({"", original_text, TokenType::PLUS}); break; case '*': - tokens_.push_back({"", TokenType::TIMES}); + tokens_.push_back({"", original_text, TokenType::TIMES}); break; case '/': - tokens_.push_back({"", TokenType::DIV}); + tokens_.push_back({"", original_text, TokenType::DIV}); break; case '-': - tokens_.push_back({"", TokenType::MINUS}); + tokens_.push_back({"", original_text, TokenType::MINUS}); break; default: return false; @@ -82,18 +84,19 @@ bool Lexer::ConsumeScoringSingleChar() { } bool Lexer::ConsumeGeneralSingleChar() { + std::string_view original_text = query_.substr(current_index_, 1); switch (current_char_) { case ',': - tokens_.push_back({"", TokenType::COMMA}); + tokens_.push_back({"", original_text, TokenType::COMMA}); break; case '.': - 
tokens_.push_back({"", TokenType::DOT}); + tokens_.push_back({"", original_text, TokenType::DOT}); break; case '(': - tokens_.push_back({"", TokenType::LPAREN}); + tokens_.push_back({"", original_text, TokenType::LPAREN}); break; case ')': - tokens_.push_back({"", TokenType::RPAREN}); + tokens_.push_back({"", original_text, TokenType::RPAREN}); break; default: return false; @@ -124,13 +127,17 @@ bool Lexer::ConsumeComparator() { // Matching for '<=', '>=', '!=', or '=='. char next_char = PeekNext(1); if (next_char == '=') { - tokens_.push_back({{current_char_, next_char}, TokenType::COMPARATOR}); + tokens_.push_back({{current_char_, next_char}, + query_.substr(current_index_, 2), + TokenType::COMPARATOR}); Advance(2); return true; } // Now, next_char must not be '='. Let's match for '<' and '>'. if (current_char_ == '<' || current_char_ == '>') { - tokens_.push_back({{current_char_}, TokenType::COMPARATOR}); + tokens_.push_back({{current_char_}, + query_.substr(current_index_, 1), + TokenType::COMPARATOR}); Advance(); return true; } @@ -145,10 +152,11 @@ bool Lexer::ConsumeAndOr() { if (current_char_ != next_char) { return false; } + std::string_view original_text = query_.substr(current_index_, 2); if (current_char_ == '&') { - tokens_.push_back({"", TokenType::AND}); + tokens_.push_back({"", original_text, TokenType::AND}); } else { - tokens_.push_back({"", TokenType::OR}); + tokens_.push_back({"", original_text, TokenType::OR}); } Advance(2); return true; @@ -158,37 +166,42 @@ bool Lexer::ConsumeStringLiteral() { if (current_char_ != '"') { return false; } - std::string text; Advance(); + int32_t unnormalized_start_pos = current_index_; while (current_char_ != '\0' && current_char_ != '"') { // When getting a backslash, we will always match the next character, even // if the next character is a quotation mark if (current_char_ == '\\') { - text.push_back(current_char_); Advance(); if (current_char_ == '\0') { // In this case, we are missing a terminating 
quotation mark. break; } } - text.push_back(current_char_); Advance(); } if (current_char_ == '\0') { SyntaxError("missing terminating \" character"); return false; } - tokens_.push_back({text, TokenType::STRING}); + int32_t unnormalized_length = current_index_ - unnormalized_start_pos; + std::string_view raw_token_text = + query_.substr(unnormalized_start_pos, unnormalized_length); + std::string token_text(raw_token_text); + tokens_.push_back({std::move(token_text), raw_token_text, TokenType::STRING}); Advance(); return true; } -bool Lexer::Text() { +bool Lexer::ConsumeText() { if (current_char_ == '\0') { return false; } - tokens_.push_back({"", TokenType::TEXT}); + tokens_.push_back({"", query_.substr(current_index_, 0), TokenType::TEXT}); int token_index = tokens_.size() - 1; + + int32_t unnormalized_start_pos = current_index_; + int32_t unnormalized_end_pos = current_index_; while (!ConsumeNonText() && current_char_ != '\0') { in_text_ = true; // When getting a backslash in TEXT, unescape it by accepting its following @@ -203,14 +216,18 @@ bool Lexer::Text() { } tokens_[token_index].text.push_back(current_char_); Advance(); - if (current_char_ == '(') { - // A TEXT followed by a LPAREN is a FUNCTION_NAME. - tokens_.back().type = TokenType::FUNCTION_NAME; - // No need to break, since NonText() must be true at this point. - } + unnormalized_end_pos = current_index_; } in_text_ = false; + tokens_[token_index].original_text = query_.substr( + unnormalized_start_pos, unnormalized_end_pos - unnormalized_start_pos); + if (unnormalized_end_pos < query_.length() && + query_[unnormalized_end_pos] == '(') { + // A TEXT followed by a LPAREN is a FUNCTION_NAME. + tokens_[token_index].type = TokenType::FUNCTION_NAME; + } + if (language_ == Lexer::Language::QUERY) { std::string &text = tokens_[token_index].text; TokenType &type = tokens_[token_index].type; @@ -234,7 +251,7 @@ Lexer::ExtractTokens() { // Clear out any non-text before matching a Text. 
while (ConsumeNonText()) { } - Text(); + ConsumeText(); } if (!error_.empty()) { return absl_ports::InvalidArgumentError( |