// Copyright (C) 2022 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. #include "icing/query/advanced_query_parser/parser.h" #include #include #include "icing/absl_ports/canonical_errors.h" #include "icing/legacy/core/icing-string-util.h" #include "icing/query/advanced_query_parser/abstract-syntax-tree.h" #include "icing/util/status-macros.h" namespace icing { namespace lib { namespace { std::unique_ptr CreateNaryNode( std::string_view operator_text, std::vector>&& operands) { if (operands.empty()) { return nullptr; } if (operands.size() == 1) { return std::move(operands.at(0)); } return std::make_unique(std::string(operator_text), std::move(operands)); } } // namespace libtextclassifier3::Status Parser::Consume(Lexer::TokenType token_type) { if (!Match(token_type)) { return absl_ports::InvalidArgumentError(IcingStringUtil::StringPrintf( "Unable to consume token %d.", static_cast(token_type))); } ++current_token_; return libtextclassifier3::Status::OK; } libtextclassifier3::StatusOr> Parser::ConsumeText() { if (!Match(Lexer::TokenType::TEXT)) { return absl_ports::InvalidArgumentError("Unable to consume token as TEXT."); } auto text_node = std::make_unique(std::move(current_token_->text)); ++current_token_; return text_node; } libtextclassifier3::StatusOr> Parser::ConsumeFunctionName() { if (!Match(Lexer::TokenType::FUNCTION_NAME)) { return absl_ports::InvalidArgumentError( "Unable to consume token as FUNCTION_NAME."); } auto function_name_node = std::make_unique(std::move(current_token_->text)); ++current_token_; return function_name_node; } libtextclassifier3::StatusOr> Parser::ConsumeString() { if (!Match(Lexer::TokenType::STRING)) { return absl_ports::InvalidArgumentError( "Unable to consume token as STRING."); } auto node = std::make_unique(std::move(current_token_->text)); ++current_token_; return node; } libtextclassifier3::StatusOr Parser::ConsumeComparator() { if (!Match(Lexer::TokenType::COMPARATOR)) { return absl_ports::InvalidArgumentError( "Unable to consume token as COMPARATOR."); } std::string comparator = std::move(current_token_->text); ++current_token_; return comparator; } // member // : TEXT (DOT TEXT)* (DOT function)? // ; libtextclassifier3::StatusOr> Parser::ConsumeMember() { ICING_ASSIGN_OR_RETURN(std::unique_ptr text_node, ConsumeText()); std::vector> children; children.push_back(std::move(text_node)); while (Match(Lexer::TokenType::DOT)) { Consume(Lexer::TokenType::DOT); if (MatchFunction()) { ICING_ASSIGN_OR_RETURN(std::unique_ptr function_node, ConsumeFunction()); // Once a function is matched, we should exit the current rule based on // the grammar. return std::make_unique(std::move(children), std::move(function_node)); } ICING_ASSIGN_OR_RETURN(text_node, ConsumeText()); children.push_back(std::move(text_node)); } return std::make_unique(std::move(children), /*function=*/nullptr); } // function // : FUNCTION_NAME LPAREN argList? RPAREN // ; libtextclassifier3::StatusOr> Parser::ConsumeFunction() { ICING_ASSIGN_OR_RETURN(std::unique_ptr function_name, ConsumeFunctionName()); ICING_RETURN_IF_ERROR(Consume(Lexer::TokenType::LPAREN)); std::vector> args; if (Match(Lexer::TokenType::RPAREN)) { // Got empty argument. ICING_RETURN_IF_ERROR(Consume(Lexer::TokenType::RPAREN)); } else { ICING_ASSIGN_OR_RETURN(args, ConsumeArgs()); ICING_RETURN_IF_ERROR(Consume(Lexer::TokenType::RPAREN)); } return std::make_unique(std::move(function_name), std::move(args)); } // comparable // : STRING // | member // | function // ; libtextclassifier3::StatusOr> Parser::ConsumeComparable() { if (Match(Lexer::TokenType::STRING)) { return ConsumeString(); } else if (MatchMember()) { return ConsumeMember(); } // The current token sequence isn't a STRING or member. Therefore, it must be // a function. return ConsumeFunction(); } // composite // : LPAREN expression RPAREN // ; libtextclassifier3::StatusOr> Parser::ConsumeComposite() { ICING_RETURN_IF_ERROR(Consume(Lexer::TokenType::LPAREN)); ICING_ASSIGN_OR_RETURN(std::unique_ptr expression, ConsumeExpression()); ICING_RETURN_IF_ERROR(Consume(Lexer::TokenType::RPAREN)); return expression; } // argList // : expression (COMMA expression)* // ; libtextclassifier3::StatusOr>> Parser::ConsumeArgs() { std::vector> args; ICING_ASSIGN_OR_RETURN(std::unique_ptr arg, ConsumeExpression()); args.push_back(std::move(arg)); while (Match(Lexer::TokenType::COMMA)) { Consume(Lexer::TokenType::COMMA); ICING_ASSIGN_OR_RETURN(arg, ConsumeExpression()); args.push_back(std::move(arg)); } return args; } // restriction // : comparable (COMPARATOR (comparable | composite))? // ; // COMPARATOR will not be produced in Scoring Lexer. libtextclassifier3::StatusOr> Parser::ConsumeRestriction() { ICING_ASSIGN_OR_RETURN(std::unique_ptr comparable, ConsumeComparable()); if (!Match(Lexer::TokenType::COMPARATOR)) { return comparable; } ICING_ASSIGN_OR_RETURN(std::string operator_text, ConsumeComparator()); std::unique_ptr arg; if (MatchComposite()) { ICING_ASSIGN_OR_RETURN(arg, ConsumeComposite()); } else if (MatchComparable()) { ICING_ASSIGN_OR_RETURN(arg, ConsumeComparable()); } else { return absl_ports::InvalidArgumentError( "ARG: must begin with LPAREN or FIRST(comparable)"); } std::vector> args; args.push_back(std::move(comparable)); args.push_back(std::move(arg)); return std::make_unique(std::move(operator_text), std::move(args)); } // simple // : restriction // | composite // ; libtextclassifier3::StatusOr> Parser::ConsumeSimple() { if (MatchComposite()) { return ConsumeComposite(); } else if (MatchRestriction()) { return ConsumeRestriction(); } return absl_ports::InvalidArgumentError( "SIMPLE: must be a restriction or composite"); } // term // : NOT? simple // | MINUS simple // ; // NOT will not be produced in Scoring Lexer. libtextclassifier3::StatusOr> Parser::ConsumeTerm() { if (!Match(Lexer::TokenType::NOT) && !Match(Lexer::TokenType::MINUS)) { return ConsumeSimple(); } std::string operator_text; if (language_ == Lexer::Language::SCORING) { ICING_RETURN_IF_ERROR(Consume(Lexer::TokenType::MINUS)); operator_text = "MINUS"; } else { if (Match(Lexer::TokenType::NOT)) { Consume(Lexer::TokenType::NOT); } else { Consume(Lexer::TokenType::MINUS); } operator_text = "NOT"; } ICING_ASSIGN_OR_RETURN(std::unique_ptr simple, ConsumeSimple()); return std::make_unique(operator_text, std::move(simple)); } // factor // : term (OR term)* // ; libtextclassifier3::StatusOr> Parser::ConsumeFactor() { ICING_ASSIGN_OR_RETURN(std::unique_ptr term, ConsumeTerm()); std::vector> terms; terms.push_back(std::move(term)); while (Match(Lexer::TokenType::OR)) { Consume(Lexer::TokenType::OR); ICING_ASSIGN_OR_RETURN(term, ConsumeTerm()); terms.push_back(std::move(term)); } return CreateNaryNode("OR", std::move(terms)); } // sequence // : (factor)+ // ; libtextclassifier3::StatusOr> Parser::ConsumeSequence() { ICING_ASSIGN_OR_RETURN(std::unique_ptr factor, ConsumeFactor()); std::vector> factors; factors.push_back(std::move(factor)); while (MatchFactor()) { ICING_ASSIGN_OR_RETURN(factor, ConsumeFactor()); factors.push_back(std::move(factor)); } return CreateNaryNode("AND", std::move(factors)); } // expression // : sequence (AND sequence)* // ; libtextclassifier3::StatusOr> Parser::ConsumeQueryExpression() { ICING_ASSIGN_OR_RETURN(std::unique_ptr sequence, ConsumeSequence()); std::vector> sequences; sequences.push_back(std::move(sequence)); while (Match(Lexer::TokenType::AND)) { Consume(Lexer::TokenType::AND); ICING_ASSIGN_OR_RETURN(sequence, ConsumeSequence()); sequences.push_back(std::move(sequence)); } return CreateNaryNode("AND", std::move(sequences)); } // multExpr // : term ((TIMES | DIV) term)* // ; libtextclassifier3::StatusOr> Parser::ConsumeMultExpr() { ICING_ASSIGN_OR_RETURN(std::unique_ptr node, ConsumeTerm()); std::vector> stack; stack.push_back(std::move(node)); while (Match(Lexer::TokenType::TIMES) || Match(Lexer::TokenType::DIV)) { while (Match(Lexer::TokenType::TIMES)) { Consume(Lexer::TokenType::TIMES); ICING_ASSIGN_OR_RETURN(node, ConsumeTerm()); stack.push_back(std::move(node)); } node = CreateNaryNode("TIMES", std::move(stack)); stack.clear(); stack.push_back(std::move(node)); while (Match(Lexer::TokenType::DIV)) { Consume(Lexer::TokenType::DIV); ICING_ASSIGN_OR_RETURN(node, ConsumeTerm()); stack.push_back(std::move(node)); } node = CreateNaryNode("DIV", std::move(stack)); stack.clear(); stack.push_back(std::move(node)); } return std::move(stack[0]); } // expression // : multExpr ((PLUS | MINUS) multExpr)* // ; libtextclassifier3::StatusOr> Parser::ConsumeScoringExpression() { ICING_ASSIGN_OR_RETURN(std::unique_ptr node, ConsumeMultExpr()); std::vector> stack; stack.push_back(std::move(node)); while (Match(Lexer::TokenType::PLUS) || Match(Lexer::TokenType::MINUS)) { while (Match(Lexer::TokenType::PLUS)) { Consume(Lexer::TokenType::PLUS); ICING_ASSIGN_OR_RETURN(node, ConsumeMultExpr()); stack.push_back(std::move(node)); } node = CreateNaryNode("PLUS", std::move(stack)); stack.clear(); stack.push_back(std::move(node)); while (Match(Lexer::TokenType::MINUS)) { Consume(Lexer::TokenType::MINUS); ICING_ASSIGN_OR_RETURN(node, ConsumeMultExpr()); stack.push_back(std::move(node)); } node = CreateNaryNode("MINUS", std::move(stack)); stack.clear(); stack.push_back(std::move(node)); } return std::move(stack[0]); } libtextclassifier3::StatusOr> Parser::ConsumeExpression() { switch (language_) { case Lexer::Language::QUERY: return ConsumeQueryExpression(); case Lexer::Language::SCORING: return ConsumeScoringExpression(); } } // query // : expression? EOF // ; libtextclassifier3::StatusOr> Parser::ConsumeQuery() { language_ = Lexer::Language::QUERY; std::unique_ptr node; if (current_token_ != lexer_tokens_.end()) { ICING_ASSIGN_OR_RETURN(node, ConsumeExpression()); } if (current_token_ != lexer_tokens_.end()) { return absl_ports::InvalidArgumentError( "Error parsing Query. Must reach EOF after parsing Expression!"); } return node; } // scoring // : expression EOF // ; libtextclassifier3::StatusOr> Parser::ConsumeScoring() { language_ = Lexer::Language::SCORING; std::unique_ptr node; if (current_token_ == lexer_tokens_.end()) { return absl_ports::InvalidArgumentError("Got empty scoring expression!"); } ICING_ASSIGN_OR_RETURN(node, ConsumeExpression()); if (current_token_ != lexer_tokens_.end()) { return absl_ports::InvalidArgumentError( "Error parsing the scoring expression. Must reach EOF after parsing " "Expression!"); } return node; } } // namespace lib } // namespace icing