diff options
Diffstat (limited to 'icing/query/advanced_query_parser/lexer.cc')
-rw-r--r-- | icing/query/advanced_query_parser/lexer.cc | 71 |
1 files changed, 44 insertions, 27 deletions
diff --git a/icing/query/advanced_query_parser/lexer.cc b/icing/query/advanced_query_parser/lexer.cc index 6cddd96..0dd0bb0 100644 --- a/icing/query/advanced_query_parser/lexer.cc +++ b/icing/query/advanced_query_parser/lexer.cc @@ -38,12 +38,13 @@ bool Lexer::ConsumeWhitespace() { } bool Lexer::ConsumeQuerySingleChar() { + std::string_view original_text = query_.substr(current_index_, 1); switch (current_char_) { case ':': - tokens_.push_back({":", TokenType::COMPARATOR}); + tokens_.push_back({":", original_text, TokenType::COMPARATOR}); break; case '*': - tokens_.push_back({"", TokenType::STAR}); + tokens_.push_back({"", original_text, TokenType::STAR}); break; case '-': if (in_text_) { @@ -51,7 +52,7 @@ bool Lexer::ConsumeQuerySingleChar() { // in the middle of a TEXT segment (ex. `foo-bar`). return false; } - tokens_.push_back({"", TokenType::MINUS}); + tokens_.push_back({"", original_text, TokenType::MINUS}); break; default: return false; @@ -61,18 +62,19 @@ bool Lexer::ConsumeQuerySingleChar() { } bool Lexer::ConsumeScoringSingleChar() { + std::string_view original_text = query_.substr(current_index_, 1); switch (current_char_) { case '+': - tokens_.push_back({"", TokenType::PLUS}); + tokens_.push_back({"", original_text, TokenType::PLUS}); break; case '*': - tokens_.push_back({"", TokenType::TIMES}); + tokens_.push_back({"", original_text, TokenType::TIMES}); break; case '/': - tokens_.push_back({"", TokenType::DIV}); + tokens_.push_back({"", original_text, TokenType::DIV}); break; case '-': - tokens_.push_back({"", TokenType::MINUS}); + tokens_.push_back({"", original_text, TokenType::MINUS}); break; default: return false; @@ -82,18 +84,19 @@ bool Lexer::ConsumeScoringSingleChar() { } bool Lexer::ConsumeGeneralSingleChar() { + std::string_view original_text = query_.substr(current_index_, 1); switch (current_char_) { case ',': - tokens_.push_back({"", TokenType::COMMA}); + tokens_.push_back({"", original_text, TokenType::COMMA}); break; case '.': - tokens_.push_back({"", TokenType::DOT}); + tokens_.push_back({"", original_text, TokenType::DOT}); break; case '(': - tokens_.push_back({"", TokenType::LPAREN}); + tokens_.push_back({"", original_text, TokenType::LPAREN}); break; case ')': - tokens_.push_back({"", TokenType::RPAREN}); + tokens_.push_back({"", original_text, TokenType::RPAREN}); break; default: return false; @@ -124,13 +127,17 @@ bool Lexer::ConsumeComparator() { // Matching for '<=', '>=', '!=', or '=='. char next_char = PeekNext(1); if (next_char == '=') { - tokens_.push_back({{current_char_, next_char}, TokenType::COMPARATOR}); + tokens_.push_back({{current_char_, next_char}, + query_.substr(current_index_, 2), + TokenType::COMPARATOR}); Advance(2); return true; } // Now, next_char must not be '='. Let's match for '<' and '>'. if (current_char_ == '<' || current_char_ == '>') { - tokens_.push_back({{current_char_}, TokenType::COMPARATOR}); + tokens_.push_back({{current_char_}, + query_.substr(current_index_, 1), + TokenType::COMPARATOR}); Advance(); return true; } @@ -145,10 +152,11 @@ bool Lexer::ConsumeAndOr() { if (current_char_ != next_char) { return false; } + std::string_view original_text = query_.substr(current_index_, 2); if (current_char_ == '&') { - tokens_.push_back({"", TokenType::AND}); + tokens_.push_back({"", original_text, TokenType::AND}); } else { - tokens_.push_back({"", TokenType::OR}); + tokens_.push_back({"", original_text, TokenType::OR}); } Advance(2); return true; @@ -158,37 +166,42 @@ bool Lexer::ConsumeStringLiteral() { if (current_char_ != '"') { return false; } - std::string text; Advance(); + int32_t unnormalized_start_pos = current_index_; while (current_char_ != '\0' && current_char_ != '"') { // When getting a backslash, we will always match the next character, even // if the next character is a quotation mark if (current_char_ == '\\') { - text.push_back(current_char_); Advance(); if (current_char_ == '\0') { // In this case, we are missing a terminating quotation mark. break; } } - text.push_back(current_char_); Advance(); } if (current_char_ == '\0') { SyntaxError("missing terminating \" character"); return false; } - tokens_.push_back({text, TokenType::STRING}); + int32_t unnormalized_length = current_index_ - unnormalized_start_pos; + std::string_view raw_token_text = + query_.substr(unnormalized_start_pos, unnormalized_length); + std::string token_text(raw_token_text); + tokens_.push_back({std::move(token_text), raw_token_text, TokenType::STRING}); Advance(); return true; } -bool Lexer::Text() { +bool Lexer::ConsumeText() { if (current_char_ == '\0') { return false; } - tokens_.push_back({"", TokenType::TEXT}); + tokens_.push_back({"", query_.substr(current_index_, 0), TokenType::TEXT}); int token_index = tokens_.size() - 1; + + int32_t unnormalized_start_pos = current_index_; + int32_t unnormalized_end_pos = current_index_; while (!ConsumeNonText() && current_char_ != '\0') { in_text_ = true; // When getting a backslash in TEXT, unescape it by accepting its following @@ -203,14 +216,18 @@ bool Lexer::Text() { } tokens_[token_index].text.push_back(current_char_); Advance(); - if (current_char_ == '(') { - // A TEXT followed by a LPAREN is a FUNCTION_NAME. - tokens_.back().type = TokenType::FUNCTION_NAME; - // No need to break, since NonText() must be true at this point. - } + unnormalized_end_pos = current_index_; } in_text_ = false; + tokens_[token_index].original_text = query_.substr( + unnormalized_start_pos, unnormalized_end_pos - unnormalized_start_pos); + if (unnormalized_end_pos < query_.length() && + query_[unnormalized_end_pos] == '(') { + // A TEXT followed by a LPAREN is a FUNCTION_NAME. + tokens_[token_index].type = TokenType::FUNCTION_NAME; + } + if (language_ == Lexer::Language::QUERY) { std::string &text = tokens_[token_index].text; TokenType &type = tokens_[token_index].type; @@ -234,7 +251,7 @@ Lexer::ExtractTokens() { // Clear out any non-text before matching a Text. while (ConsumeNonText()) { } - Text(); + ConsumeText(); } if (!error_.empty()) { return absl_ports::InvalidArgumentError( |