aboutsummaryrefslogtreecommitdiff
path: root/icing/query/advanced_query_parser/lexer.cc
diff options
context:
space:
mode:
Diffstat (limited to 'icing/query/advanced_query_parser/lexer.cc')
-rw-r--r--icing/query/advanced_query_parser/lexer.cc71
1 files changed, 44 insertions, 27 deletions
diff --git a/icing/query/advanced_query_parser/lexer.cc b/icing/query/advanced_query_parser/lexer.cc
index 6cddd96..0dd0bb0 100644
--- a/icing/query/advanced_query_parser/lexer.cc
+++ b/icing/query/advanced_query_parser/lexer.cc
@@ -38,12 +38,13 @@ bool Lexer::ConsumeWhitespace() {
}
bool Lexer::ConsumeQuerySingleChar() {
+ std::string_view original_text = query_.substr(current_index_, 1);
switch (current_char_) {
case ':':
- tokens_.push_back({":", TokenType::COMPARATOR});
+ tokens_.push_back({":", original_text, TokenType::COMPARATOR});
break;
case '*':
- tokens_.push_back({"", TokenType::STAR});
+ tokens_.push_back({"", original_text, TokenType::STAR});
break;
case '-':
if (in_text_) {
@@ -51,7 +52,7 @@ bool Lexer::ConsumeQuerySingleChar() {
// in the middle of a TEXT segment (ex. `foo-bar`).
return false;
}
- tokens_.push_back({"", TokenType::MINUS});
+ tokens_.push_back({"", original_text, TokenType::MINUS});
break;
default:
return false;
@@ -61,18 +62,19 @@ bool Lexer::ConsumeQuerySingleChar() {
}
bool Lexer::ConsumeScoringSingleChar() {
+ std::string_view original_text = query_.substr(current_index_, 1);
switch (current_char_) {
case '+':
- tokens_.push_back({"", TokenType::PLUS});
+ tokens_.push_back({"", original_text, TokenType::PLUS});
break;
case '*':
- tokens_.push_back({"", TokenType::TIMES});
+ tokens_.push_back({"", original_text, TokenType::TIMES});
break;
case '/':
- tokens_.push_back({"", TokenType::DIV});
+ tokens_.push_back({"", original_text, TokenType::DIV});
break;
case '-':
- tokens_.push_back({"", TokenType::MINUS});
+ tokens_.push_back({"", original_text, TokenType::MINUS});
break;
default:
return false;
@@ -82,18 +84,19 @@ bool Lexer::ConsumeScoringSingleChar() {
}
bool Lexer::ConsumeGeneralSingleChar() {
+ std::string_view original_text = query_.substr(current_index_, 1);
switch (current_char_) {
case ',':
- tokens_.push_back({"", TokenType::COMMA});
+ tokens_.push_back({"", original_text, TokenType::COMMA});
break;
case '.':
- tokens_.push_back({"", TokenType::DOT});
+ tokens_.push_back({"", original_text, TokenType::DOT});
break;
case '(':
- tokens_.push_back({"", TokenType::LPAREN});
+ tokens_.push_back({"", original_text, TokenType::LPAREN});
break;
case ')':
- tokens_.push_back({"", TokenType::RPAREN});
+ tokens_.push_back({"", original_text, TokenType::RPAREN});
break;
default:
return false;
@@ -124,13 +127,17 @@ bool Lexer::ConsumeComparator() {
// Matching for '<=', '>=', '!=', or '=='.
char next_char = PeekNext(1);
if (next_char == '=') {
- tokens_.push_back({{current_char_, next_char}, TokenType::COMPARATOR});
+ tokens_.push_back({{current_char_, next_char},
+ query_.substr(current_index_, 2),
+ TokenType::COMPARATOR});
Advance(2);
return true;
}
// Now, next_char must not be '='. Let's match for '<' and '>'.
if (current_char_ == '<' || current_char_ == '>') {
- tokens_.push_back({{current_char_}, TokenType::COMPARATOR});
+ tokens_.push_back({{current_char_},
+ query_.substr(current_index_, 1),
+ TokenType::COMPARATOR});
Advance();
return true;
}
@@ -145,10 +152,11 @@ bool Lexer::ConsumeAndOr() {
if (current_char_ != next_char) {
return false;
}
+ std::string_view original_text = query_.substr(current_index_, 2);
if (current_char_ == '&') {
- tokens_.push_back({"", TokenType::AND});
+ tokens_.push_back({"", original_text, TokenType::AND});
} else {
- tokens_.push_back({"", TokenType::OR});
+ tokens_.push_back({"", original_text, TokenType::OR});
}
Advance(2);
return true;
@@ -158,37 +166,42 @@ bool Lexer::ConsumeStringLiteral() {
if (current_char_ != '"') {
return false;
}
- std::string text;
Advance();
+ int32_t unnormalized_start_pos = current_index_;
while (current_char_ != '\0' && current_char_ != '"') {
// When getting a backslash, we will always match the next character, even
// if the next character is a quotation mark
if (current_char_ == '\\') {
- text.push_back(current_char_);
Advance();
if (current_char_ == '\0') {
// In this case, we are missing a terminating quotation mark.
break;
}
}
- text.push_back(current_char_);
Advance();
}
if (current_char_ == '\0') {
SyntaxError("missing terminating \" character");
return false;
}
- tokens_.push_back({text, TokenType::STRING});
+ int32_t unnormalized_length = current_index_ - unnormalized_start_pos;
+ std::string_view raw_token_text =
+ query_.substr(unnormalized_start_pos, unnormalized_length);
+ std::string token_text(raw_token_text);
+ tokens_.push_back({std::move(token_text), raw_token_text, TokenType::STRING});
Advance();
return true;
}
-bool Lexer::Text() {
+bool Lexer::ConsumeText() {
if (current_char_ == '\0') {
return false;
}
- tokens_.push_back({"", TokenType::TEXT});
+ tokens_.push_back({"", query_.substr(current_index_, 0), TokenType::TEXT});
int token_index = tokens_.size() - 1;
+
+ int32_t unnormalized_start_pos = current_index_;
+ int32_t unnormalized_end_pos = current_index_;
while (!ConsumeNonText() && current_char_ != '\0') {
in_text_ = true;
// When getting a backslash in TEXT, unescape it by accepting its following
@@ -203,14 +216,18 @@ bool Lexer::Text() {
}
tokens_[token_index].text.push_back(current_char_);
Advance();
- if (current_char_ == '(') {
- // A TEXT followed by a LPAREN is a FUNCTION_NAME.
- tokens_.back().type = TokenType::FUNCTION_NAME;
- // No need to break, since NonText() must be true at this point.
- }
+ unnormalized_end_pos = current_index_;
}
in_text_ = false;
+ tokens_[token_index].original_text = query_.substr(
+ unnormalized_start_pos, unnormalized_end_pos - unnormalized_start_pos);
+ if (unnormalized_end_pos < query_.length() &&
+ query_[unnormalized_end_pos] == '(') {
+ // A TEXT followed by a LPAREN is a FUNCTION_NAME.
+ tokens_[token_index].type = TokenType::FUNCTION_NAME;
+ }
+
if (language_ == Lexer::Language::QUERY) {
std::string &text = tokens_[token_index].text;
TokenType &type = tokens_[token_index].type;
@@ -234,7 +251,7 @@ Lexer::ExtractTokens() {
// Clear out any non-text before matching a Text.
while (ConsumeNonText()) {
}
- Text();
+ ConsumeText();
}
if (!error_.empty()) {
return absl_ports::InvalidArgumentError(